hammer2 - major simplification of algorithms part 1/many
author Matthew Dillon <dillon@apollo.backplane.com>
Wed, 30 Jul 2014 07:17:29 +0000 (00:17 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Wed, 30 Jul 2014 07:17:29 +0000 (00:17 -0700)
* Huge simplification of in-memory data structures and algorithms.
  Remove delete-duplicate, ownerq (shadow copies), dbq, dbtree, and most of
  the xid lo/hi sequencing.  Remove all the complexities related to
  managing the above elements.  Net removal of ~1500 lines of code or so.

* Blockmap deletions are now handled by the frontend, so the backend doesn't
  need to deal with shadowed deletions.  This is still fairly optimal since
  insertions are still handled by the backend during flushes.  So for quick
  create/delete operations the blockmap is never even initialized which means
  that deletions don't have to remove anything.

* Clean up the buffer cache on file removal / last-close, but allow file delete
  to simply wipe out the inode.  Don't bother iterating its indirect blocks
  or data blocks on-media but use the flush code to get rid of any chains
  still cached.

* Buffer invalidation on permanent chain deletions for modified chains.

* Major items still TODO: flush interlocks and meta-data updates.

sys/vfs/hammer2/TODO
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_cluster.c
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index e1871e3..0308f00 100644 (file)
@@ -1,5 +1,6 @@
 
-* hammer2_xid_t needs to be 64 bits
+* flush synchronization boundary crossing check and current flush chain
+  interlock needed.
 
 * snapshot creation must allocate and separately pass a new pmp for the pfs
   degenerate 'cluster' representing the snapshot.  This theoretically will
index b6f0044..0b61e9b 100644 (file)
  * implementation.  See hammer2_disk.h for on-disk structures.
  *
  * There is an in-memory representation of all on-media data structure.
- * Basically everything is represented by a hammer2_chain structure
- * in-memory and other higher-level structures map to chains.
+ * Almost everything is represented by a hammer2_chain structure in-memory.
+ * Other higher-level structures typically map to chains.
  *
  * A great deal of data is accessed simply via its buffer cache buffer,
- * which is mapped for the duration of the chain's lock.  However, because
- * chains may represent blocks smaller than the 16KB minimum we impose
- * on buffer cache buffers, we cannot hold related buffer cache buffers
- * locked for smaller blocks.  In these situations we kmalloc() a copy
- * of the block.
+ * which is mapped for the duration of the chain's lock.  Hammer2 must
+ * implement its own buffer cache layer on top of the system layer to
+ * allow for different threads to lock different sub-block-sized buffers.
  *
  * When modifications are made to a chain a new filesystem block must be
- * allocated.  Multiple modifications do not necessarily allocate new
- * blocks.  However, when a flush occurs a flush synchronization point
- * is created and any new modifications made after this point will allocate
- * a new block even if the chain is already in a modified state.
+ * allocated.  Multiple modifications do not typically allocate new blocks
+ * until the current block has been flushed.  Flushes do not block the
+ * front-end unless the front-end operation crosses the current inode being
+ * flushed.
  *
  * The in-memory representation may remain cached (for example in order to
  * placemark clustering locks) even after the related data has been
  * detached.
- *
- *                             CORE SHARING
- *
- * In order to support concurrent flushes a flush synchronization point
- * is created represented by a transaction id.  Among other things,
- * operations may move filesystem objects from one part of the topology
- * to another (for example, if you rename a file or when indirect blocks
- * are created or destroyed, and a few other things).  When this occurs
- * across a flush synchronization point the flusher needs to be able to
- * recurse down BOTH the 'before' version of the topology and the 'after'
- * version.
- *
- * To facilitate this modifications to chains do what is called a
- * DELETE-DUPLICATE operation.  Chains are not actually moved in-memory.
- * Instead the chain we wish to move is deleted and a new chain is created
- * at the target location in the topology.  ANY SUBCHAINS PLACED UNDER THE
- * CHAIN BEING MOVED HAVE TO EXIST IN BOTH PLACES.  To make this work
- * all sub-chains are managed by the hammer2_chain_core structure.  This
- * structure can be multi-homed, meaning that it can have more than one
- * chain as its parent.  When a chain is delete-duplicated the chain's core
- * becomes shared under both the old and new chain.
- *
- *                             STALE CHAINS
- *
- * When a chain is delete-duplicated the old chain typically becomes stale.
- * This is detected via the HAMMER2_CHAIN_DUPLICATED flag in chain->flags.
- * To avoid executing live filesystem operations on stale chains, the inode
- * locking code will follow stale chains via core->ownerq until it finds
- * the live chain.  The lock prevents ripups by other threads.  Lookups
- * must properly order locking operations to prevent other threads from
- * racing the lookup operation and will also follow stale chains when
- * required.
  */
 
 #ifndef _VFS_HAMMER2_HAMMER2_H_
@@ -152,73 +118,73 @@ typedef uint32_t hammer2_xid_t;
  * root (volume) down.  Chains represent volumes, inodes, indirect blocks,
  * data blocks, and freemap nodes and leafs.
  *
- * The chain structure can be multi-homed and its topological recursion
- * (chain->core) can be shared amongst several chains.  Chain structures
- * are topologically stable once placed in the in-memory topology (they
- * don't move around).  Modifications which cross flush synchronization
- * boundaries, renames, resizing, or any move of the chain to elsewhere
- * in the topology is accomplished via the DELETE-DUPLICATE mechanism.
+ * The chain structure utilizes a simple singly-homed topology and the
+ * chain's in-memory topology will move around as the chains do, due mainly
+ * to renames and indirect block creation.
+ *
+ * Block Table Updates
  *
- * Deletions and delete-duplicates:
+ *     Block table updates for insertions and updates are delayed until the
+ *     flush.  This allows us to avoid having to modify the parent chain
+ *     all the way to the root.
  *
- *     Any movement of chains within the topology utilize a delete-duplicate
- *     operation instead of a simple rename.  That is, the chain must be
- *     deleted from its original location and then duplicated to the new
- *     location.  A new chain structure is allocated while the old is
- *     deleted.  Deleted chains are removed from the above chain_core's
- *     rbtree but remain linked via the shadow topology for flush
- *     synchronization purposes.
+ *     Block table deletions are performed immediately (modifying the parent
+ *     in the process) because the flush code uses the chain structure to
+ *     track delayed updates and the chain will be (likely) gone or moved to
+ *     another location in the topology after a deletion.
  *
- *     delete_bmap is allocated and a bit set if the chain was originally
- *     loaded via the blockmap.
+ *     A prior iteration of the code tried to keep the relationship intact
+ *     on deletes by doing a delete-duplicate operation on the chain, but
+ *     it added way too much complexity to the codebase.
  *
- * Flush synchronization:
+ * Flush Synchronization
  *
- *     Flushes must synchronize chains up through the root.  To do this
- *     the in-memory topology would normally have to be frozen during the
- *     flush.  To avoid freezing the topology and to allow concurrent
- *     foreground / flush activity, any new modifications made while a
- *     flush is in progress retains the original chain in a shadow topology
- *     that is only visible to the flush code.  Only one flush can be
- *     running at a time so the shadow hierarchy can be implemented with
- *     just a few link fields in our in-memory data structures.
+ *     The flush code must flush modified chains bottom-up.  Because chain
+ *     structures can shift around and are NOT topologically stable,
+ *     modified chains are independently indexed for the flush.  As the flush
+ *     runs it modifies (or further modifies) and updates the parents,
+ *     propagating the flush all the way to the volume root.
  *
- * Advantages:
+ *     Modifying front-end operations can occur during a flush but will block
+ *     in two cases: (1) when the front-end tries to operate on the inode
+ *     currently in the midst of being flushed and (2) if the front-end
+ *     crosses an inode currently being flushed (such as during a rename).
+ *     So, for example, if you rename directory "x" to "a/b/c/d/e/f/g/x" and
+ *     the flusher is currently working on "a/b/c", the rename will block
+ *     temporarily in order to ensure that "x" exists in one place or the
+ *     other.
  *
- *     (1) Fully coherent snapshots can be taken without requiring
- *         a pre-flush, resulting in extremely fast (sub-millisecond)
- *         snapshots.
+ *     Meta-data statistics are updated by the flusher.  The front-end will
+ *     make estimates but meta-data must be fully synchronized only during a
+ *     flush in order to ensure that it remains correct across a crash.
  *
- *     (2) Multiple synchronization points can be in-flight at the same
- *         time, representing multiple snapshots or flushes.
+ *     Multiple flush synchronizations can theoretically be in-flight at the
+ *     same time but the implementation is not coded to handle the case and
+ *     currently serializes them.
  *
- *     (3) The algorithms needed to keep track of everything are actually
- *         not that complex.
+ * Snapshots:
  *
- * Special Considerations:
+ *     Snapshots currently require the subdirectory tree being snapshotted
+ *     to be flushed.  The snapshot then creates a new super-root inode which
+ *     copies the flushed blockdata of the directory or file that was
+ *     snapshotted.
  *
- *     A chain is ref-counted on a per-chain basis, but the chain's lock
- *     is associated with the shared chain_core and is not per-chain.
+ * RBTREE NOTES:
  *
- *     The power-of-2 nature of the media radix tree ensures that there
- *     will be no overlaps which straddle edges.
+ *     - Note that the radix tree runs in powers of 2 only so sub-trees
+ *       cannot straddle edges.
  */
 RB_HEAD(hammer2_chain_tree, hammer2_chain);
-TAILQ_HEAD(h2_flush_deferral_list, hammer2_chain);
+TAILQ_HEAD(h2_flush_list, hammer2_chain);
 TAILQ_HEAD(h2_core_list, hammer2_chain);
 
 #define CHAIN_CORE_DELETE_BMAP_ENTRIES \
        (HAMMER2_PBUFSIZE / sizeof(hammer2_blockref_t) / sizeof(uint32_t))
 
 struct hammer2_chain_core {
-       int             good;
        struct ccms_cst cst;
-       struct h2_core_list ownerq;       /* all chains sharing this core */
-       struct hammer2_chain_tree rbtree; /* live chains */
-       struct hammer2_chain_tree dbtree; /* bmapped deletions */
-       struct h2_core_list dbq;          /* other deletions */
+       struct hammer2_chain_tree rbtree; /* sub-chains */
        int             live_zero;      /* blockref array opt */
-       u_int           sharecnt;
        u_int           flags;
        u_int           live_count;     /* live (not deleted) chains in tree */
        u_int           chain_count;    /* live + deleted chains under core */
@@ -264,20 +230,15 @@ typedef struct hammer2_io hammer2_io_t;
  * Primary chain structure keeps track of the topology in-memory.
  */
 struct hammer2_chain {
-       TAILQ_ENTRY(hammer2_chain) core_entry;  /* contemporary chains */
+       hammer2_chain_core_t    core;
        RB_ENTRY(hammer2_chain) rbnode;         /* live chain(s) */
-       TAILQ_ENTRY(hammer2_chain) db_entry;    /* non bmapped deletions */
        hammer2_blockref_t      bref;
-       hammer2_chain_core_t    *core;
-       hammer2_chain_core_t    *above;
+       struct hammer2_chain    *parent;
        struct hammer2_state    *state;         /* if active cache msg */
        struct hammer2_mount    *hmp;
        struct hammer2_pfsmount *pmp;           /* (pfs-cluster pmp or spmp) */
 
-       hammer2_xid_t   modify_xid;             /* flush filter */
-       hammer2_xid_t   delete_xid;             /* flush filter */
-       hammer2_xid_t   update_xlo;             /* flush propagation */
-       hammer2_xid_t   update_xhi;             /* setsubmod propagation */
+       hammer2_xid_t   flush_xid;              /* flush sequencing */
        hammer2_key_t   data_count;             /* delta's to apply */
        hammer2_key_t   inode_count;            /* delta's to apply */
        hammer2_io_t    *dio;                   /* physical data buffer */
@@ -286,9 +247,7 @@ struct hammer2_chain {
        u_int           refs;
        u_int           lockcnt;
        hammer2_media_data_t *data;             /* data pointer shortcut */
-       TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush deferral list */
-
-       int             inode_reason;
+       TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush list */
 };
 
 typedef struct hammer2_chain hammer2_chain_t;
@@ -305,31 +264,40 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
  *          is primarily used for indirect blocks.
  *
  * MODIFIED- The chain's media data has been modified.
+ * UPDATE  - Chain might not be modified but parent blocktable needs update
+ *
+ * BMAPPED - Indicates that the chain is present in the parent blockmap.
+ * BMAPUPD - Indicates that the chain is present but needs to be updated
+ *          in the parent blockmap.
  */
 #define HAMMER2_CHAIN_MODIFIED         0x00000001      /* dirty chain data */
 #define HAMMER2_CHAIN_ALLOCATED                0x00000002      /* kmalloc'd chain */
-#define HAMMER2_CHAIN_FLUSH_TEMPORARY  0x00000004
+#define HAMMER2_CHAIN_DESTROY          0x00000004
 #define HAMMER2_CHAIN_FORCECOW         0x00000008      /* force copy-on-wr */
 #define HAMMER2_CHAIN_DELETED          0x00000010      /* deleted chain */
 #define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial create */
-#define HAMMER2_CHAIN_FLUSH_CREATE     0x00000040      /* needs flush blkadd */
-#define HAMMER2_CHAIN_FLUSH_DELETE     0x00000080      /* needs flush blkdel */
+#define HAMMER2_CHAIN_UPDATE           0x00000040      /* need parent update */
+#define HAMMER2_CHAIN_DEFERRED         0x00000080      /* flush depth defer */
 #define HAMMER2_CHAIN_IOFLUSH          0x00000100      /* bawrite on put */
-#define HAMMER2_CHAIN_DEFERRED         0x00000200      /* on a deferral list */
-#define HAMMER2_CHAIN_UNLINKED         0x00000400      /* delete on reclaim */
+#define HAMMER2_CHAIN_ONFLUSH          0x00000200      /* on a flush list */
+#define HAMMER2_CHAIN_UNUSED00000400   0x00000400
 #define HAMMER2_CHAIN_VOLUMESYNC       0x00000800      /* needs volume sync */
-#define HAMMER2_CHAIN_ONDBQ            0x00001000      /* !bmapped deletes */
+#define HAMMER2_CHAIN_UNUSED00001000   0x00001000
 #define HAMMER2_CHAIN_MOUNTED          0x00002000      /* PFS is mounted */
 #define HAMMER2_CHAIN_ONRBTREE         0x00004000      /* on parent RB tree */
 #define HAMMER2_CHAIN_SNAPSHOT         0x00008000      /* snapshot special */
 #define HAMMER2_CHAIN_EMBEDDED         0x00010000      /* embedded data */
 #define HAMMER2_CHAIN_RELEASE          0x00020000      /* don't keep around */
-#define HAMMER2_CHAIN_BMAPPED          0x00040000      /* in parent blkmap */
-#define HAMMER2_CHAIN_ONDBTREE         0x00080000      /* bmapped deletes */
-#define HAMMER2_CHAIN_DUPLICATED       0x00100000      /* fwd delete-dup */
+#define HAMMER2_CHAIN_BMAPPED          0x00040000      /* present in blkmap */
+#define HAMMER2_CHAIN_BMAPUPD          0x00080000      /* +needs updating */
+#define HAMMER2_CHAIN_UNUSED00100000   0x00100000
 #define HAMMER2_CHAIN_PFSROOT          0x00200000      /* in pfs->cluster */
 #define HAMMER2_CHAIN_PFSBOUNDARY      0x00400000      /* super->pfs inode */
 
+#define HAMMER2_CHAIN_FLUSH_MASK       (HAMMER2_CHAIN_MODIFIED |       \
+                                        HAMMER2_CHAIN_UPDATE |         \
+                                        HAMMER2_CHAIN_ONFLUSH)
+
 /*
  * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
  *
@@ -354,7 +322,6 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_MODIFY_NO_MODIFY_TID   0x00000004
 #define HAMMER2_MODIFY_ASSERTNOCOPY    0x00000008      /* assert no del-dup */
 #define HAMMER2_MODIFY_NOREALLOC       0x00000010
-#define HAMMER2_MODIFY_INPLACE         0x00000020      /* don't del-dup */
 
 /*
  * Flags passed to hammer2_chain_lock()
@@ -370,7 +337,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 /*
  * Flags passed to hammer2_chain_delete()
  */
-#define HAMMER2_DELETE_UNUSED0001      0x0001
+#define HAMMER2_DELETE_PERMANENT       0x0001
 
 /*
  * Flags passed to hammer2_chain_delete_duplicate()
@@ -479,6 +446,7 @@ typedef struct hammer2_inode hammer2_inode_t;
 #define HAMMER2_INODE_ONRBTREE         0x0008
 #define HAMMER2_INODE_RESIZED          0x0010
 #define HAMMER2_INODE_MTIME            0x0020
+#define HAMMER2_INODE_UNLINKED         0x0040
 
 int hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2);
 RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
@@ -491,7 +459,7 @@ struct hammer2_inode_unlink {
        TAILQ_ENTRY(hammer2_inode_unlink) entry;
        hammer2_inode_t *ip;
 };
-TAILQ_HEAD(hammer2_unlk_list, hammer2_inode_unlink);
+TAILQ_HEAD(h2_unlk_list, hammer2_inode_unlink);
 
 typedef struct hammer2_inode_unlink hammer2_inode_unlink_t;
 
@@ -599,6 +567,8 @@ struct hammer2_mount {
        int             iofree_count;
        hammer2_chain_t vchain;         /* anchor chain (topology) */
        hammer2_chain_t fchain;         /* anchor chain (freemap) */
+       struct spinlock list_spin;
+       struct h2_flush_list    flushq; /* flush seeds */
        struct hammer2_pfsmount *spmp;  /* super-root pmp for transactions */
        struct lock     vollk;          /* lockmgr lock */
        hammer2_off_t   heur_freemap[HAMMER2_FREEMAP_HEUR];
@@ -645,10 +615,10 @@ struct hammer2_pfsmount {
        hammer2_tid_t           flush_tid;
        hammer2_tid_t           inode_tid;
        long                    inmem_inodes;
-       long                    inmem_dirty_chains;
+       uint32_t                inmem_dirty_chains;
        int                     count_lwinprog; /* logical write in prog */
-       struct spinlock         unlinkq_spin;
-       struct hammer2_unlk_list unlinkq;
+       struct spinlock         list_spin;
+       struct h2_unlk_list     unlinkq;        /* last-close unlink */
        thread_t                wthread_td;     /* write thread td */
        struct bio_queue_head   wthread_bioq;   /* logical buffer bioq */
        struct mtx              wthread_mtx;    /* interlock */
@@ -823,14 +793,17 @@ void hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip,
                        hammer2_cluster_t *cparent);
 int hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
                        const uint8_t *name, size_t name_len, int isdir,
-                       int *hlinkp, struct nchandle *nch);
+                       int *hlinkp, struct nchandle *nch, int nlinks);
 int hammer2_hardlink_consolidate(hammer2_trans_t *trans,
                        hammer2_inode_t *ip, hammer2_cluster_t **clusterp,
                        hammer2_inode_t *cdip, hammer2_cluster_t *cdcluster,
                        int nlinks);
 int hammer2_hardlink_deconsolidate(hammer2_trans_t *trans, hammer2_inode_t *dip,
                        hammer2_chain_t **chainp, hammer2_chain_t **ochainp);
-int hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_cluster_t *cluster);
+int hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_cluster_t **cparentp,
+                       hammer2_cluster_t *cluster);
+int hammer2_parent_find(hammer2_cluster_t **cparentp,
+                       hammer2_cluster_t *cluster);
 void hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp);
 
 /*
@@ -843,8 +816,7 @@ hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
                                hammer2_pfsmount_t *pmp,
                                hammer2_trans_t *trans,
                                hammer2_blockref_t *bref);
-void hammer2_chain_core_alloc(hammer2_trans_t *trans, hammer2_chain_t *nchain,
-                               hammer2_chain_t *ochain);
+void hammer2_chain_core_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain);
 void hammer2_chain_ref(hammer2_chain_t *chain);
 void hammer2_chain_drop(hammer2_chain_t *chain);
 int hammer2_chain_lock(hammer2_chain_t *chain, int how);
@@ -856,10 +828,10 @@ void hammer2_chain_load_async(hammer2_cluster_t *cluster,
                                void *arg_p);
 void hammer2_chain_moved(hammer2_chain_t *chain);
 void hammer2_chain_modify(hammer2_trans_t *trans,
-                               hammer2_chain_t **chainp, int flags);
+                               hammer2_chain_t *chain, int flags);
 void hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                                hammer2_chain_t *parent,
-                               hammer2_chain_t **chainp,
+                               hammer2_chain_t *chain,
                                int nradix, int flags);
 void hammer2_chain_unlock(hammer2_chain_t *chain);
 void hammer2_chain_wait(hammer2_chain_t *chain);
@@ -885,19 +857,18 @@ int hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                                hammer2_pfsmount_t *pmp,
                                hammer2_key_t key, int keybits,
                                int type, size_t bytes);
-void hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
-                               hammer2_chain_t **chainp,
-                               hammer2_blockref_t *bref, int snapshot,
-                               int duplicate_reason);
+void hammer2_chain_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
+                               hammer2_chain_t **parentp,
+                               hammer2_chain_t *chain);
 int hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **chainp,
                                hammer2_ioc_pfs_t *pfs);
-void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain,
-                               int flags);
+void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
+                               hammer2_chain_t *chain, int flags);
 void hammer2_chain_delete_duplicate(hammer2_trans_t *trans,
                                hammer2_chain_t **chainp, int flags);
-void hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp);
+void hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain);
 void hammer2_chain_commit(hammer2_trans_t *trans, hammer2_chain_t *chain);
-void hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain);
+void hammer2_chain_setflush(hammer2_trans_t *trans, hammer2_chain_t *chain);
 void hammer2_chain_countbrefs(hammer2_chain_t *chain,
                                hammer2_blockref_t *base, int count);
 
@@ -905,18 +876,12 @@ void hammer2_pfs_memory_wait(hammer2_pfsmount_t *pmp);
 void hammer2_pfs_memory_inc(hammer2_pfsmount_t *pmp);
 void hammer2_pfs_memory_wakeup(hammer2_pfsmount_t *pmp);
 
-int hammer2_base_find(hammer2_chain_t *chain,
-                               hammer2_blockref_t *base, int count,
-                               int *cache_indexp, hammer2_key_t *key_nextp,
-                               hammer2_key_t key_beg, hammer2_key_t key_end,
-                               int delete_filter);
 void hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *chain,
                                hammer2_blockref_t *base, int count,
                                int *cache_indexp, hammer2_chain_t *child);
 void hammer2_base_insert(hammer2_trans_t *trans, hammer2_chain_t *chain,
                                hammer2_blockref_t *base, int count,
                                int *cache_indexp, hammer2_chain_t *child);
-void hammer2_chain_refactor(hammer2_chain_t **chainp);
 
 /*
  * hammer2_trans.c
@@ -976,7 +941,6 @@ int hammer2_msg_adhoc_input(kdmsg_msg_t *msg);
  */
 void hammer2_clusterctl_wakeup(kdmsg_iocom_t *iocom);
 void hammer2_volconf_update(hammer2_mount_t *hmp, int index);
-void hammer2_update_spans(hammer2_mount_t *hmp);
 void hammer2_cluster_reconnect(hammer2_mount_t *hmp, struct file *fp);
 void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx);
 void hammer2_bioq_sync(hammer2_pfsmount_t *pmp);
@@ -1002,18 +966,14 @@ const hammer2_media_data_t *hammer2_cluster_data(hammer2_cluster_t *cluster);
 hammer2_media_data_t *hammer2_cluster_wdata(hammer2_cluster_t *cluster);
 hammer2_cluster_t *hammer2_cluster_from_chain(hammer2_chain_t *chain);
 int hammer2_cluster_modified(hammer2_cluster_t *cluster);
-int hammer2_cluster_unlinked(hammer2_cluster_t *cluster);
 int hammer2_cluster_duplicated(hammer2_cluster_t *cluster);
 void hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags);
 void hammer2_cluster_bref(hammer2_cluster_t *cluster, hammer2_blockref_t *bref);
-void hammer2_cluster_setsubmod(hammer2_trans_t *trans,
+void hammer2_cluster_setflush(hammer2_trans_t *trans,
                        hammer2_cluster_t *cluster);
 hammer2_cluster_t *hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
                        hammer2_trans_t *trans,
                        hammer2_blockref_t *bref);
-void hammer2_cluster_core_alloc(hammer2_trans_t *trans,
-                       hammer2_cluster_t *ncluster,
-                       hammer2_cluster_t *ocluster);
 void hammer2_cluster_ref(hammer2_cluster_t *cluster);
 void hammer2_cluster_drop(hammer2_cluster_t *cluster);
 void hammer2_cluster_wait(hammer2_cluster_t *cluster);
@@ -1023,7 +983,6 @@ void hammer2_cluster_replace_locked(hammer2_cluster_t *dst,
                        hammer2_cluster_t *src);
 hammer2_cluster_t *hammer2_cluster_copy(hammer2_cluster_t *ocluster,
                        int with_chains);
-void hammer2_cluster_refactor(hammer2_cluster_t *cluster);
 void hammer2_cluster_unlock(hammer2_cluster_t *cluster);
 void hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                        hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
@@ -1051,16 +1010,14 @@ hammer2_cluster_t *hammer2_cluster_scan(hammer2_cluster_t *cparent,
 int hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
                        hammer2_cluster_t **clusterp,
                        hammer2_key_t key, int keybits, int type, size_t bytes);
-void hammer2_cluster_duplicate(hammer2_trans_t *trans,
-                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster,
-                       hammer2_blockref_t *bref,
-                       int snapshot, int duplicate_reason);
-void hammer2_cluster_delete_duplicate(hammer2_trans_t *trans,
+void hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
+                       hammer2_cluster_t *cparent, hammer2_cluster_t *cluster);
+void hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *pcluster,
                        hammer2_cluster_t *cluster, int flags);
-void hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
-                       int flags);
 int hammer2_cluster_snapshot(hammer2_trans_t *trans,
                        hammer2_cluster_t *ocluster, hammer2_ioc_pfs_t *pfs);
+hammer2_cluster_t *hammer2_cluster_parent(hammer2_cluster_t *cluster);
+
 
 #endif /* !_KERNEL */
 #endif /* !_VFS_HAMMER2_HAMMER2_H_ */
index 681ac50..6f8ec62 100644 (file)
@@ -3,7 +3,7 @@
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
- * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ * and Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * indirect blocks, data blocks, etc).  Chains represent a portion of the
  * HAMMER2 topology.
  *
- * A chain is topologically stable once it has been inserted into the
- * in-memory topology.  Modifications which copy, move, or resize the chain
- * are handled via the DELETE-DUPLICATE mechanic where the original chain
- * stays intact but is marked deleted and a new chain is allocated which
- * shares the old chain's children.
- *
- * This sharing is handled via the hammer2_chain_core structure.
- *
- * The DELETE-DUPLICATE mechanism allows the same topological level to contain
- * many overloadings.  However, our RBTREE mechanics require that there be
- * no overlaps so we accomplish the overloading by moving conflicting chains
- * with smaller or equal radii into a sub-RBTREE under the chain being
- * overloaded.
- *
- * DELETE-DUPLICATE is also used when a modification to a chain crosses a
- * flush synchronization boundary, allowing the flush code to continue flushing
- * the older version of the topology and not be disrupted by new frontend
- * operations.
- *
- *                             LIVE VS FLUSH VIEW
- *
- * All lookup and iterate operations and most modifications are done on the
- * live view.  During flushes lookups are not normally done and modifications
- * may be run on the flush view.  However, flushes often needs to allocate
- * blocks and the freemap_alloc/free code issues lookups.  This code is
- * special cased to use the live view when called from a flush.
- *
- * General chain lookup/iteration functions are NOT aware of the flush view,
- * they only know about live views.
+ * Chains are no longer delete-duplicated.  Instead, the original in-memory
+ * chain will be moved along with its block reference (e.g. for things like
+ * renames, hardlink operations, modifications, etc), and will be indexed
+ * on a secondary list for flush handling instead of propagating a flag
+ * upward to the root.
+ *
+ * Concurrent front-end operations can still run against backend flushes
+ * as long as they do not cross the current flush boundary.  An operation
+ * running above the current flush (in areas not yet flushed) can become
+ * part of the current flush while an operation running below the current
+ * flush can become part of the next flush.
  */
 #include <sys/cdefs.h>
 #include <sys/param.h>
@@ -142,39 +124,27 @@ hammer2_isclusterable(hammer2_chain_t *chain)
 }
 
 /*
- * Recursively set update_xhi starting at chain and moving upward.  Stop early
- * if we hit a PFS transition (PFS flush code will have to detect the case
- * and perform an update within its own transaction).  The transaction xid
- * is only good within the current PFS.
- *
- * This controls top-down visibility for flushes.  The child has just one
- * 'above' core, but the core itself can be multi-homed with parents iterated
- * via core->ownerq.  The last parent is the 'live' parent (all others had to
- * have been delete-duplicated).  We always propagate upward through the live
- * parent.
- *
- * This function is not used during a flush (except when the flush is
- * allocating which requires the live tree).  The flush keeps track of its
- * recursion itself.
- *
- * XXX SMP races.  For now we do not allow concurrent transactions with
- *     different transaction ids and there should be no race, but if we do
- *     later on there will be a problem.
+ * Make a chain visible to the flusher.  The flusher needs to be able to
+ * do flushes of subdirectory chains or single files so it does a top-down
+ * recursion using the ONFLUSH flag for the recursion.  It locates MODIFIED
+ * or UPDATE chains and flushes back up the chain to the root.
  */
 void
-hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
+hammer2_chain_setflush(hammer2_trans_t *trans, hammer2_chain_t *chain)
 {
-       hammer2_chain_core_t *above;
-
-       if (chain->update_xhi < trans->sync_xid)
-               chain->update_xhi = trans->sync_xid;
+       hammer2_chain_t *parent;
 
-       while ((above = chain->above) != NULL) {
-               spin_lock(&above->cst.spin);
-               chain = TAILQ_LAST(&above->ownerq, h2_core_list);
-               if (chain->update_xhi < trans->sync_xid)
-                       chain->update_xhi = trans->sync_xid;
-               spin_unlock(&above->cst.spin);
+       if ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
+               spin_lock(&chain->core.cst.spin);
+               while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
+                       if ((parent = chain->parent) == NULL)
+                               break;
+                       spin_lock(&parent->core.cst.spin);
+                       spin_unlock(&chain->core.cst.spin);
+                       chain = parent;
+               }
+               spin_unlock(&chain->core.cst.spin);
        }
 }
 
@@ -234,7 +204,6 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
        chain->bytes = bytes;
        chain->refs = 1;
        chain->flags = HAMMER2_CHAIN_ALLOCATED;
-       chain->delete_xid = HAMMER2_XID_MAX;
 
        /*
         * Set the PFS boundary flag if this chain represents a PFS root.
@@ -242,20 +211,6 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
        if (bref->flags & HAMMER2_BREF_FLAG_PFSROOT)
                chain->flags |= HAMMER2_CHAIN_PFSBOUNDARY;
 
-       /*
-        * Set modify_xid if a transaction is creating the inode.
-        * Enforce update_xlo = 0 so nearby transactions do not think
-        * it has been flushed when it hasn't.
-        *
-        * NOTE: When loading a chain from backing store or creating a
-        *       snapshot, trans will be NULL and the caller is responsible
-        *       for setting these fields.
-        */
-       if (trans) {
-               chain->modify_xid = trans->sync_xid;
-               chain->update_xlo = 0;
-       }
-
        return (chain);
 }
 
@@ -270,88 +225,16 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
  * drop code visibility in the correct order, otherwise drops can be missed.
  */
 void
-hammer2_chain_core_alloc(hammer2_trans_t *trans,
-                        hammer2_chain_t *nchain, hammer2_chain_t *ochain)
+hammer2_chain_core_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain)
 {
-       hammer2_chain_core_t *core;
-
-       KKASSERT(nchain->core == NULL);
+       hammer2_chain_core_t *core = &chain->core;
 
-       if (ochain == NULL) {
-               /*
-                * Fresh core under nchain (no multi-homing of ochain's
-                * sub-tree).
-                */
-               core = kmalloc(sizeof(*core), nchain->hmp->mchain,
-                              M_WAITOK | M_ZERO);
-               TAILQ_INIT(&core->ownerq);
-               TAILQ_INIT(&core->dbq);
-               RB_INIT(&core->rbtree); /* live chains */
-               RB_INIT(&core->dbtree); /* deleted original (bmapped) chains */
-               core->sharecnt = 1;
-               core->good = 0x1234;
-               nchain->core = core;
-               ccms_cst_init(&core->cst, nchain);
-               TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
-       } else {
-               /*
-                * Propagate the PFSROOT flag which we set on all subdirs
-                * under the super-root.
-                */
-               atomic_set_int(&nchain->flags,
-                              ochain->flags & HAMMER2_CHAIN_PFSROOT);
-
-               /*
-                * Duplicating ochain -> nchain.  Set the DUPLICATED flag on
-                * ochain if nchain is not a snapshot.
-                *
-                * It is possible for the DUPLICATED flag to already be
-                * set when called via a flush operation because flush
-                * operations may have to work on elements with delete_xid's
-                * beyond the flush sync_xid.  In this situation we must
-                * ensure that nchain is placed just after ochain in the
-                * ownerq and that the DUPLICATED flag is set on nchain so
-                * 'live' operations skip past it to the correct chain.
-                *
-                * The flusher understands the blockref synchronization state
-                * for any stale chains by observing bref.mirror_tid, which
-                * delete-duplicate replicates.
-                *
-                * WARNING! However, the case is disallowed when the flusher
-                *          is allocating freemap space because this entails
-                *          more than just adjusting a block table.
-                */
-               if (ochain->flags & HAMMER2_CHAIN_DUPLICATED) {
-                       KKASSERT(trans->flags & HAMMER2_TRANS_ISFLUSH);
-                       atomic_set_int(&nchain->flags,
-                                      HAMMER2_CHAIN_DUPLICATED);
-               }
-               if ((nchain->flags & HAMMER2_CHAIN_SNAPSHOT) == 0) {
-                       atomic_set_int(&ochain->flags,
-                                      HAMMER2_CHAIN_DUPLICATED);
-               }
-               core = ochain->core;
-               atomic_add_int(&core->sharecnt, 1);
-
-               spin_lock(&core->cst.spin);
-               nchain->core = core;
-
-               /*
-                * Maintain ordering for refactor test so we don't skip over
-                * a snapshot.  Also, during flushes, delete-duplications
-                * for block-table updates can occur on ochains already
-                * deleted (delete-duplicated by a later transaction), or
-                * on forward-indexed ochains.  We must properly insert
-                * nchain relative to ochain.
-                */
-               if (trans && trans->sync_xid < ochain->modify_xid) {
-                       TAILQ_INSERT_BEFORE(ochain, nchain, core_entry);
-               } else {
-                       TAILQ_INSERT_AFTER(&core->ownerq, ochain,
-                                          nchain, core_entry);
-               }
-               spin_unlock(&core->cst.spin);
-       }
+       /*
+        * Initialize the core embedded in chain: an empty RB tree for
+        * its live children plus the CCMS state used for locking.
+        */
+       RB_INIT(&core->rbtree); /* live chains */
+       ccms_cst_init(&core->cst, chain);
 }
 
 /*
@@ -376,77 +259,44 @@ hammer2_chain_ref(hammer2_chain_t *chain)
 
 static
 int
-hammer2_chain_insert(hammer2_chain_core_t *above,
-                    hammer2_chain_t *ochain, hammer2_chain_t *nchain,
+hammer2_chain_insert(hammer2_chain_t *parent, hammer2_chain_t *chain,
                     int flags, int generation)
 {
        hammer2_chain_t *xchain;
        int error = 0;
 
        if (flags & HAMMER2_CHAIN_INSERT_SPIN)
-               spin_lock(&above->cst.spin);
+               spin_lock(&parent->core.cst.spin);
 
        /*
         * Interlocked by spinlock, check for race
         */
        if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
-           above->generation != generation) {
+           parent->core.generation != generation) {
                error = EAGAIN;
                goto failed;
        }
 
        /*
-        * Insert nchain
-        *
-        * XXX BMAPPED might not be handled correctly for ochain/nchain
-        *     ordering in both DELETED cases (flush and non-flush-term),
-        *     so delete-duplicate code.
+        * Insert chain
         */
-       if (nchain->flags & HAMMER2_CHAIN_DELETED) {
-               if (ochain && (ochain->flags & HAMMER2_CHAIN_BMAPPED)) {
-                       if (ochain->flags & HAMMER2_CHAIN_ONDBTREE) {
-                               RB_REMOVE(hammer2_chain_tree,
-                                         &above->dbtree, ochain);
-                               atomic_clear_int(&ochain->flags,
-                                                HAMMER2_CHAIN_ONDBTREE);
-                               TAILQ_INSERT_TAIL(&above->dbq,
-                                                 ochain, db_entry);
-                               atomic_set_int(&ochain->flags,
-                                               HAMMER2_CHAIN_ONDBQ);
-                       }
-                       /* clear BMAPPED (DBTREE, sometimes RBTREE) */
-                       atomic_clear_int(&ochain->flags, HAMMER2_CHAIN_BMAPPED);
-
-                       xchain = RB_INSERT(hammer2_chain_tree,
-                                          &above->dbtree, nchain);
-                       KKASSERT(xchain == NULL);
-                       atomic_set_int(&nchain->flags,
-                                      HAMMER2_CHAIN_ONDBTREE |
-                                      HAMMER2_CHAIN_BMAPPED);
-               } else {
-                       TAILQ_INSERT_TAIL(&above->dbq, nchain, db_entry);
-                       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONDBQ);
-               }
-       } else {
-               xchain = RB_INSERT(hammer2_chain_tree, &above->rbtree, nchain);
-               KASSERT(xchain == NULL,
-                       ("hammer2_chain_insert: collision %p", nchain));
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
-       }
-
-       nchain->above = above;
-       ++above->chain_count;
-       ++above->generation;
+       xchain = RB_INSERT(hammer2_chain_tree, &parent->core.rbtree, chain);
+       KASSERT(xchain == NULL,
+               ("hammer2_chain_insert: collision %p %p", chain, xchain));
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
+       chain->parent = parent;
+       ++parent->core.chain_count;
+       ++parent->core.generation;      /* XXX incs for _get() too, XXX */
 
        /*
         * We have to keep track of the effective live-view blockref count
         * so the create code knows when to push an indirect block.
         */
        if (flags & HAMMER2_CHAIN_INSERT_LIVE)
-               atomic_add_int(&above->live_count, 1);
+               atomic_add_int(&parent->core.live_count, 1);
 failed:
        if (flags & HAMMER2_CHAIN_INSERT_SPIN)
-               spin_unlock(&above->cst.spin);
+               spin_unlock(&parent->core.cst.spin);
        return error;
 }
 
@@ -456,60 +306,35 @@ failed:
  * deallocate it, then recursely drop the parent using the implied ref
  * from the chain's chain->parent.
  */
-static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain,
-                                              struct h2_core_list *delayq);
+static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);
 
 void
 hammer2_chain_drop(hammer2_chain_t *chain)
 {
-       struct h2_core_list delayq;
-       hammer2_chain_t *scan;
        u_int refs;
        u_int need = 0;
 
        if (hammer2_debug & 0x200000)
                Debugger("drop");
 
-       if (chain->flags & HAMMER2_CHAIN_FLUSH_CREATE)
-               ++need;
-       if (chain->flags & HAMMER2_CHAIN_FLUSH_DELETE)
+       if (chain->flags & HAMMER2_CHAIN_UPDATE)
                ++need;
        if (chain->flags & HAMMER2_CHAIN_MODIFIED)
                ++need;
        KKASSERT(chain->refs > need);
 
-       TAILQ_INIT(&delayq);
-
        while (chain) {
                refs = chain->refs;
                cpu_ccfence();
                KKASSERT(refs > 0);
 
                if (refs == 1) {
-                       chain = hammer2_chain_lastdrop(chain, &delayq);
+                       chain = hammer2_chain_lastdrop(chain);
                } else {
                        if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
                                break;
                        /* retry the same chain */
                }
-
-               /*
-                * When we've exhausted lastdrop chaining pull off of delayq.
-                * chains on delayq are dead but are used to placehold other
-                * chains which we added a ref to for the purpose of dropping.
-                */
-               if (chain == NULL) {
-                       hammer2_mount_t *hmp;
-
-                       if ((scan = TAILQ_FIRST(&delayq)) != NULL) {
-                               chain = (void *)scan->data;
-                               TAILQ_REMOVE(&delayq, scan, core_entry);
-                               scan->flags &= ~HAMMER2_CHAIN_ALLOCATED;
-                               hmp = scan->hmp;
-                               scan->hmp = NULL;
-                               kfree(scan, hmp->mchain);
-                       }
-               }
        }
 }
 
@@ -530,14 +355,12 @@ hammer2_chain_drop(hammer2_chain_t *chain)
  */
 static
 hammer2_chain_t *
-hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
+hammer2_chain_lastdrop(hammer2_chain_t *chain)
 {
        hammer2_pfsmount_t *pmp;
        hammer2_mount_t *hmp;
-       hammer2_chain_core_t *above;
-       hammer2_chain_core_t *core;
-       hammer2_chain_t *rdrop1;
-       hammer2_chain_t *rdrop2;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *rdrop;
 
        /*
         * Spinlock the core and check to see if it is empty.  If it is
@@ -545,68 +368,40 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
         * in core->rbtree are associated with other chains contemporary
         * with ours but not with our chain directly.
         */
-       if ((core = chain->core) != NULL) {
-               spin_lock(&core->cst.spin);
+       spin_lock(&chain->core.cst.spin);
 
-               /*
-                * We can't free non-stale chains with children until we are
-                * able to free the children because there might be a flush
-                * dependency.  Flushes of stale children (which should also
-                * have their deleted flag set) short-cut recursive flush
-                * dependencies and can be freed here.  Any flushes which run
-                * through stale children due to the flush synchronization
-                * point should have a FLUSH_* bit set in the chain and not
-                * reach lastdrop at this time.
-                *
-                * NOTE: We return (chain) on failure to retry.
-                */
-               if (core->chain_count &&
-                   (chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
-                       if (atomic_cmpset_int(&chain->refs, 1, 0))
-                               chain = NULL;   /* success */
-                       spin_unlock(&core->cst.spin);
-                       return(chain);
-               }
-               /* no chains left under us */
-
-               /*
-                * Various parts of the code might be holding a ref on a
-                * stale chain as a placemarker which must be iterated to
-                * locate a later non-stale (live) chain.  We must be sure
-                * NOT to free the later non-stale chain (which might have
-                * no refs).  Otherwise mass confusion may result.
-                *
-                * The DUPLICATED flag tells us whether the chain is stale
-                * or not, so the rule is that any chain whos DUPLICATED flag
-                * is NOT set must also be at the head of the ownerq.
-                *
-                * Note that the DELETED flag is not involved.  That is, a
-                * live chain can represent a deletion that has not yet been
-                * flushed (or still has refs).
-                */
-#if 0
-               if (TAILQ_NEXT(chain, core_entry) == NULL &&
-                   TAILQ_FIRST(&core->ownerq) != chain) {
-#endif
-               if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 &&
-                   TAILQ_FIRST(&core->ownerq) != chain) {
-                       if (atomic_cmpset_int(&chain->refs, 1, 0))
-                               chain = NULL;   /* success */
-                       spin_unlock(&core->cst.spin);
-                       return(chain);
+       /*
+        * We can't free non-stale chains with children until we are
+        * able to free the children because there might be a flush
+        * dependency.  Flushes of stale children (which should also
+        * have their deleted flag set) short-cut recursive flush
+        * dependencies and can be freed here.  Any flushes which run
+        * through stale children due to the flush synchronization
+        * point should have a FLUSH_* bit set in the chain and not
+        * reach lastdrop at this time.
+        *
+        * NOTE: We return (chain) on failure to retry.
+        */
+       if (chain->core.chain_count) {
+               if (atomic_cmpset_int(&chain->refs, 1, 0)) {
+                       spin_unlock(&chain->core.cst.spin);
+                       chain = NULL;   /* success */
+               } else {
+                       spin_unlock(&chain->core.cst.spin);
                }
+               return(chain);
        }
+       /* no chains left under us */
 
        /*
         * chain->core has no children left so no accessors can get to our
-        * chain from there.  Now we have to lock the above core to interlock
+        * chain from there.  Now we have to lock the parent core to interlock
         * remaining possible accessors that might bump chain's refs before
         * we can safely drop chain's refs with intent to free the chain.
         */
        hmp = chain->hmp;
        pmp = chain->pmp;       /* can be NULL */
-       rdrop1 = NULL;
-       rdrop2 = NULL;
+       rdrop = NULL;
 
        /*
         * Spinlock the parent and try to drop the last ref on chain.
@@ -615,13 +410,12 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
         * (normal core locks are top-down recursive but we define core
         *  spinlocks as bottom-up recursive, so this is safe).
         */
-       if ((above = chain->above) != NULL) {
-               spin_lock(&above->cst.spin);
+       if ((parent = chain->parent) != NULL) {
+               spin_lock(&parent->core.cst.spin);
                if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
                        /* 1->0 transition failed */
-                       spin_unlock(&above->cst.spin);
-                       if (core)
-                               spin_unlock(&core->cst.spin);
+                       spin_unlock(&parent->core.cst.spin);
+                       spin_unlock(&chain->core.cst.spin);
                        return(chain);  /* retry */
                }
 
@@ -629,95 +423,45 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
                 * 1->0 transition successful, remove chain from its
                 * above core.
                 */
-               switch (chain->flags & (HAMMER2_CHAIN_ONRBTREE |
-                                       HAMMER2_CHAIN_ONDBTREE |
-                                       HAMMER2_CHAIN_ONDBQ)) {
-               case HAMMER2_CHAIN_ONRBTREE:
-                       RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
+               if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
+                       RB_REMOVE(hammer2_chain_tree,
+                                 &parent->core.rbtree, chain);
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
-                       break;
-               case HAMMER2_CHAIN_ONDBTREE:
-                       RB_REMOVE(hammer2_chain_tree, &above->dbtree, chain);
-                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONDBTREE);
-                       break;
-               case HAMMER2_CHAIN_ONDBQ:
-                       TAILQ_REMOVE(&above->dbq, chain, db_entry);
-                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONDBQ);
-                       break;
-               default:
-                       panic("hammer2_chain_lastdrop: chain %p badflags %08x",
-                             chain, chain->flags);
-                       break;
+                       --parent->core.chain_count;
+                       chain->parent = NULL;
                }
 
-               --above->chain_count;
-               chain->above = NULL;
-
                /*
                 * If our chain was the last chain in the parent's core the
-                * core is now empty and its parents might now be droppable.
-                * Try to drop the first multi-homed parent by gaining a
-                * ref on it here and then dropping it below.
+                * core is now empty and its parent might have to be
+                * re-dropped if it has 0 refs.
                 */
-               if (above->chain_count == 0) {
-                       rdrop1 = TAILQ_FIRST(&above->ownerq);
-                       if (rdrop1 &&
-                           atomic_cmpset_int(&rdrop1->refs, 0, 1) == 0) {
-                               rdrop1 = NULL;
+               if (parent->core.chain_count == 0) {
+                       rdrop = parent;
+                       if (atomic_cmpset_int(&rdrop->refs, 0, 1) == 0) {
+                               rdrop = NULL;
                        }
                }
-               spin_unlock(&above->cst.spin);
-               above = NULL;   /* safety */
+               spin_unlock(&parent->core.cst.spin);
+               parent = NULL;  /* safety */
        }
 
        /*
         * Successful 1->0 transition and the chain can be destroyed now.
         *
-        * We still have the core spinlock (if core is non-NULL), and core's
-        * chain_count is 0.  The above spinlock is gone.
-        *
-        * Remove chain from ownerq.  Once core has no more owners (and no
-        * children which is already the case) we can destroy core.
-        *
-        * If core has more owners we may be able to continue a bottom-up
-        * drop with our next sibling.
+        * We still have the core spinlock, and core's chain_count is 0.
+        * Any parent spinlock is gone.
         */
-       if (core) {
-               chain->core = NULL;
-
-               TAILQ_REMOVE(&core->ownerq, chain, core_entry);
-               rdrop2 = TAILQ_FIRST(&core->ownerq);
-               if (rdrop2 && atomic_cmpset_int(&rdrop2->refs, 0, 1) == 0)
-                       rdrop2 = NULL;
-               spin_unlock(&core->cst.spin);
-
-               /*
-                * We can do the final 1->0 transition with an atomic op
-                * after releasing core's spinlock.
-                */
-               if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
-                       /*
-                        * On the 1->0 transition of core we can destroy
-                        * it.
-                        */
-                       KKASSERT(TAILQ_EMPTY(&core->ownerq));
-                       KKASSERT(RB_EMPTY(&core->rbtree) &&
-                                RB_EMPTY(&core->dbtree) &&
-                                TAILQ_EMPTY(&core->dbq) &&
-                                core->chain_count == 0);
-                       KKASSERT(core->cst.count == 0);
-                       KKASSERT(core->cst.upgrade == 0);
-                       core->good = 0x5678;
-                       kfree(core, hmp->mchain);
-               }
-               core = NULL;    /* safety */
-       }
+       spin_unlock(&chain->core.cst.spin);
+       KKASSERT(RB_EMPTY(&chain->core.rbtree) &&
+                chain->core.chain_count == 0);
+       KKASSERT(chain->core.cst.count == 0);
+       KKASSERT(chain->core.cst.upgrade == 0);
 
        /*
         * All spin locks are gone, finish freeing stuff.
         */
-       KKASSERT((chain->flags & (HAMMER2_CHAIN_FLUSH_CREATE |
-                                 HAMMER2_CHAIN_FLUSH_DELETE |
+       KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
                                  HAMMER2_CHAIN_MODIFIED)) == 0);
        hammer2_chain_drop_data(chain, 1);
 
@@ -729,25 +473,16 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
         * drop, because we have potentially two things to drop and can only
         * return one directly.
         */
-       if (rdrop1 && rdrop2) {
-               KKASSERT(chain->flags & HAMMER2_CHAIN_ALLOCATED);
-               chain->data = (void *)rdrop1;
-               TAILQ_INSERT_TAIL(delayq, chain, core_entry);
-               rdrop1 = NULL;
-       } else if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
+       if (chain->flags & HAMMER2_CHAIN_ALLOCATED) {
                chain->flags &= ~HAMMER2_CHAIN_ALLOCATED;
                chain->hmp = NULL;
                kfree(chain, hmp->mchain);
        }
 
        /*
-        * Either or both can be NULL.  We already handled the case where
-        * both might not have been NULL.
+        * Possible chaining loop when parent re-drop needed.
         */
-       if (rdrop1)
-               return(rdrop1);
-       else
-               return(rdrop2);
+       return(rdrop);
 }
 
 /*
@@ -818,7 +553,6 @@ int
 hammer2_chain_lock(hammer2_chain_t *chain, int how)
 {
        hammer2_mount_t *hmp;
-       hammer2_chain_core_t *core;
        hammer2_blockref_t *bref;
        ccms_state_t ostate;
        char *bdata;
@@ -837,11 +571,10 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
        /*
         * Get the appropriate lock.
         */
-       core = chain->core;
        if (how & HAMMER2_RESOLVE_SHARED)
-               ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
+               ccms_thread_lock(&chain->core.cst, CCMS_STATE_SHARED);
        else
-               ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);
+               ccms_thread_lock(&chain->core.cst, CCMS_STATE_EXCLUSIVE);
 
        /*
         * If we already have a valid data pointer no further action is
@@ -877,9 +610,9 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
         * buffer cache.  If another thread got to it before us we
         * can just return.
         */
-       ostate = ccms_thread_lock_upgrade(&core->cst);
+       ostate = ccms_thread_lock_upgrade(&chain->core.cst);
        if (chain->data) {
-               ccms_thread_lock_downgrade(&core->cst, ostate);
+               ccms_thread_lock_downgrade(&chain->core.cst, ostate);
                return (0);
        }
 
@@ -915,7 +648,7 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
                kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
                        (intmax_t)bref->data_off, error);
                hammer2_io_bqrelse(&chain->dio);
-               ccms_thread_lock_downgrade(&core->cst, ostate);
+               ccms_thread_lock_downgrade(&chain->core.cst, ostate);
                return (error);
        }
 
@@ -931,13 +664,12 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
 #endif
 
        /*
-        * We can clear the INITIAL state now, we've resolved the buffer
-        * to zeros and marked it dirty with hammer2_io_new().
+        * Clear INITIAL.  In this case we used io_new() and the buffer has
+        * been zero'd and marked dirty.
         */
        bdata = hammer2_io_data(chain->dio, chain->bref.data_off);
-       if (chain->flags & HAMMER2_CHAIN_INITIAL) {
+       if (chain->flags & HAMMER2_CHAIN_INITIAL)
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
-       }
 
        /*
         * Setup the data pointer, either pointing it to an embedded data
@@ -968,7 +700,7 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
                chain->data = (void *)bdata;
                break;
        }
-       ccms_thread_lock_downgrade(&core->cst, ostate);
+       ccms_thread_lock_downgrade(&chain->core.cst, ostate);
        return (0);
 }
 
@@ -1056,7 +788,6 @@ hammer2_chain_load_async(hammer2_cluster_t *cluster,
 void
 hammer2_chain_unlock(hammer2_chain_t *chain)
 {
-       hammer2_chain_core_t *core = chain->core;
        ccms_state_t ostate;
        long *counterp;
        u_int lockcnt;
@@ -1077,7 +808,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
                if (lockcnt > 1) {
                        if (atomic_cmpset_int(&chain->lockcnt,
                                              lockcnt, lockcnt - 1)) {
-                               ccms_thread_unlock(&core->cst);
+                               ccms_thread_unlock(&chain->core.cst);
                                hammer2_chain_drop(chain);
                                return;
                        }
@@ -1100,9 +831,9 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
         * exclusively all that will happen is that the chain will be
         * reloaded after we unload it.
         */
-       ostate = ccms_thread_lock_upgrade(&core->cst);
+       ostate = ccms_thread_lock_upgrade(&chain->core.cst);
        if (chain->lockcnt) {
-               ccms_thread_unlock_upgraded(&core->cst, ostate);
+               ccms_thread_unlock_upgraded(&chain->core.cst, ostate);
                hammer2_chain_drop(chain);
                return;
        }
@@ -1116,7 +847,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
        if (chain->dio == NULL) {
                if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
                        hammer2_chain_drop_data(chain, 0);
-               ccms_thread_unlock_upgraded(&core->cst, ostate);
+               ccms_thread_unlock_upgraded(&chain->core.cst, ostate);
                hammer2_chain_drop(chain);
                return;
        }
@@ -1191,7 +922,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
        } else {
                hammer2_io_bqrelse(&chain->dio);
        }
-       ccms_thread_unlock_upgraded(&core->cst, ostate);
+       ccms_thread_unlock_upgraded(&chain->core.cst, ostate);
        hammer2_chain_drop(chain);
 }
 
@@ -1213,61 +944,53 @@ void
 hammer2_chain_countbrefs(hammer2_chain_t *chain,
                         hammer2_blockref_t *base, int count)
 {
-       hammer2_chain_core_t *core = chain->core;
-
-       KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0);
-
-       spin_lock(&core->cst.spin);
-        if ((core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) {
+       spin_lock(&chain->core.cst.spin);
+        if ((chain->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) {
                if (base) {
                        while (--count >= 0) {
                                if (base[count].type)
                                        break;
                        }
-                       core->live_zero = count + 1;
+                       chain->core.live_zero = count + 1;
                        while (count >= 0) {
                                if (base[count].type)
-                                       atomic_add_int(&core->live_count, 1);
+                                       atomic_add_int(&chain->core.live_count,
+                                                      1);
                                --count;
                        }
                } else {
-                       core->live_zero = 0;
+                       chain->core.live_zero = 0;
                }
                /* else do not modify live_count */
-               atomic_set_int(&core->flags, HAMMER2_CORE_COUNTEDBREFS);
+               atomic_set_int(&chain->core.flags, HAMMER2_CORE_COUNTEDBREFS);
        }
-       spin_unlock(&core->cst.spin);
+       spin_unlock(&chain->core.cst.spin);
 }
 
 /*
- * Resize the chain's physical storage allocation in-place.  This may
- * replace the passed-in chain with a new chain.
- *
- * Chains can be resized smaller without reallocating the storage.
- * Resizing larger will reallocate the storage.
+ * Resize the chain's physical storage allocation in-place.  This will
+ * modify the passed-in chain.  Chains can be resized smaller without
+ * reallocating the storage.  Resizing larger will reallocate the storage.
+ * Excess or prior storage is reclaimed asynchronously at a later time.
  *
- * Must be passed an exclusively locked parent and chain, returns a new
- * exclusively locked chain at the same index and unlocks the old chain.
- * Flushes the buffer if necessary.
+ * Must be passed an exclusively locked parent and chain.
  *
  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
- * to avoid instantiating a device buffer that conflicts with the vnode
- * data buffer.  That is, the passed-in bp is a logical buffer, whereas
- * any chain-oriented bp would be a device buffer.
+ * to avoid instantiating a device buffer that conflicts with the vnode data
+ * buffer.  That is, the passed-in bp is a logical buffer, whereas any
+ * chain-oriented bp would be a device buffer.
  *
  * XXX return error if cannot resize.
  */
 void
 hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
-                    hammer2_chain_t *parent, hammer2_chain_t **chainp,
+                    hammer2_chain_t *parent, hammer2_chain_t *chain,
                     int nradix, int flags)
 {
        hammer2_mount_t *hmp;
-       hammer2_chain_t *chain;
        size_t obytes;
        size_t nbytes;
 
-       chain = *chainp;
        hmp = chain->hmp;
 
        /*
@@ -1287,19 +1010,16 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                return;
 
        /*
-        * Delete the old chain and duplicate it at the same (parent, index),
-        * returning a new chain.  This allows the old chain to still be
-        * used by the flush code.  The new chain will be returned in a
-        * modified state.
-        *
         * The parent does not have to be locked for the delete/duplicate call,
         * but is in this particular code path.
         *
         * NOTE: If we are not crossing a synchronization point the
         *       duplication code will simply reuse the existing chain
         *       structure.
+        *
+        * NOTE: The modify will set BMAPUPD for us if BMAPPED is set.
         */
-       hammer2_chain_delete_duplicate(trans, &chain, 0);
+       hammer2_chain_modify(trans, chain, 0);
 
        /*
         * Relocate the block, even if making it smaller (because different
@@ -1317,8 +1037,6 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
         * blocks).
         */
        KKASSERT(chain->dio == NULL);
-
-       *chainp = chain;
 }
 
 #if 0
@@ -1359,17 +1077,15 @@ hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
 #endif
 
 void
-hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
-                    int flags)
+hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
 {
        hammer2_mount_t *hmp;
-       hammer2_chain_t *chain;
        hammer2_io_t *dio;
        int error;
        int wasinitial;
+       int newmod;
        char *bdata;
 
-       chain = *chainp;
        hmp = chain->hmp;
 
        /*
@@ -1382,25 +1098,6 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
                         (flags & HAMMER2_MODIFY_OPTDATA) == 0);
        }
 
-       /*
-        * Determine if a delete-duplicate is needed.
-        *
-        * (a) Modify_tid is part of a prior flush
-        * (b) Transaction is concurrent with a flush (has higher tid)
-        * (c) and chain is not in the initial state (freshly created)
-        * (d) and caller didn't request an in-place modification.
-        *
-        * The freemap and volume header special chains are never D-Dd.
-        */
-       if (chain->modify_xid != trans->sync_xid &&        /* cross boundary */
-           (flags & HAMMER2_MODIFY_INPLACE) == 0) {       /* from d-d */
-               if (chain != &hmp->fchain && chain != &hmp->vchain) {
-                       KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
-                       hammer2_chain_delete_duplicate(trans, chainp, 0);
-                       chain = *chainp;
-               }
-       }
-
        /*
         * Data must be resolved if already assigned unless explicitly
         * flagged otherwise.
@@ -1413,17 +1110,19 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
 
        /*
         * Otherwise do initial-chain handling.  Set MODIFIED to indicate
-        * that the chain has been modified.  Set FLUSH_CREATE to flush
-        * the new blockref (the D-D set FLUSH_DELETE on the old chain to
-        * delete the old blockref).
+        * that the chain has been modified.  Set UPDATE to ensure that
+        * the blockref is updated in the parent.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_chain_ref(chain);
                hammer2_pfs_memory_inc(chain->pmp);
+               newmod = 1;
+       } else {
+               newmod = 0;
        }
-       if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
+       if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                hammer2_chain_ref(chain);
        }
 
@@ -1436,8 +1135,7 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         */
        if (chain != &hmp->vchain && chain != &hmp->fchain) {
                if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
-                    ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 &&
-                     chain->modify_xid != trans->sync_xid)
+                    ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 && newmod)
                ) {
                        hammer2_freemap_alloc(trans, chain, chain->bytes);
                        /* XXX failed allocation */
@@ -1449,17 +1147,14 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        }
 
        /*
-        * Update modify_xid.  XXX special-case vchain/fchain because they
-        * are always modified in-place.  Otherwise the chain being modified
-        * must not be part of a future transaction.
+        * Set BMAPUPD to tell the flush code that an existing blockmap entry
+        * requires updating as well as to tell the delete code that the
+        * chain's blockref might not exactly match (in terms of physical size
+        * or block offset) the one in the parent's blocktable.  The base key
+        * of course will still match.
         */
-       if (chain == &hmp->vchain || chain == &hmp->fchain) {
-               if (chain->modify_xid <= trans->sync_xid)
-                       chain->modify_xid = trans->sync_xid;
-       } else {
-               KKASSERT(chain->modify_xid <= trans->sync_xid);
-               chain->modify_xid = trans->sync_xid;
-       }
+       if (chain->flags & HAMMER2_CHAIN_BMAPPED)
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD);
 
        /*
         * Do not COW BREF_TYPE_DATA when OPTDATA is set.  This is because
@@ -1559,7 +1254,13 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
 
        }
 skip2:
-       hammer2_chain_setsubmod(trans, chain);
+       /*
+        * setflush on parent indicating that the parent must recurse down
+        * to us.  Do not call on chain itself which might already have it
+        * set.
+        */
+       if (chain->parent)
+               hammer2_chain_setflush(trans, chain->parent);
 }
 
 /*
@@ -1583,16 +1284,14 @@ hammer2_voldata_modify(hammer2_mount_t *hmp)
        if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_chain_ref(&hmp->vchain);
+               hammer2_pfs_memory_inc(hmp->vchain.pmp);
        }
 }
 
 /*
  * This function returns the chain at the nearest key within the specified
- * range with the highest delete_xid.  The core spinlock must be held on
- * call and the returned chain will be referenced but not locked.
- *
- * The returned chain may or may not be in a deleted state.  Note that
- * live chains have a delete_xid = XID_MAX.
+ * range.  The core spinlock must be held on call and the returned chain
+ * will be referenced but not locked.
  *
  * This function will recurse through chain->rbtree as necessary and will
  * return a *key_nextp suitable for iteration.  *key_nextp is only set if
@@ -1629,8 +1328,7 @@ hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
        info.key_end = key_end;
        info.key_next = *key_nextp;
 
-       KKASSERT(parent->core->good == 0x1234);
-       RB_SCAN(hammer2_chain_tree, &parent->core->rbtree,
+       RB_SCAN(hammer2_chain_tree, &parent->core.rbtree,
                hammer2_chain_find_cmp, hammer2_chain_find_callback,
                &info);
        *key_nextp = info.key_next;
@@ -1642,36 +1340,6 @@ hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
        return (info.best);
 }
 
-/*
- * Find a deleted chain covering a block table entry.  Be careful to deal
- * with the race condition where the block table has been updated but the
- * chain has not yet been removed from dbtree (due to multiple parents having
- * to be updated).
- */
-static
-hammer2_chain_t *
-hammer2_chain_find_deleted(hammer2_chain_t *parent,
-                         hammer2_key_t key_beg, hammer2_key_t key_end)
-{
-       struct hammer2_chain_find_info info;
-       hammer2_chain_t *child;
-
-       info.best = NULL;
-       info.key_beg = key_beg;
-       info.key_end = key_end;
-       info.key_next = 0;
-
-       KKASSERT(parent->core->good == 0x1234);
-       RB_SCAN(hammer2_chain_tree, &parent->core->dbtree,
-               hammer2_chain_find_cmp, hammer2_chain_find_callback,
-               &info);
-       if ((child = info.best) != NULL) {
-               if (child->delete_xid <= parent->update_xlo)
-                       child = NULL;
-       }
-       return child;
-}
-
 static
 int
 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
@@ -1715,45 +1383,38 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
        } else if (best->bref.key <= info->key_beg &&
                   child->bref.key <= info->key_beg) {
                /*
-                * If our current best is flush with key_beg and child is
-                * also flush with key_beg choose based on delete_xid.
-                *
-                * key_next will automatically be limited to the smaller of
-                * the two end-points.
+                * Illegal overlap.
                 */
-               if (child->delete_xid > best->delete_xid)
-                       info->best = child;
+               KKASSERT(0);
+               /*info->best = child;*/
        } else if (child->bref.key < best->bref.key) {
                /*
                 * Child has a nearer key and best is not flush with key_beg.
-                * Truncate key_next to the old best key iff it had a better
-                * delete_xid.
+                * Set best to child.  Truncate key_next to the old best key.
                 */
                info->best = child;
-               if (best->delete_xid >= child->delete_xid &&
-                   (info->key_next > best->bref.key || info->key_next == 0))
+               if (info->key_next > best->bref.key || info->key_next == 0)
                        info->key_next = best->bref.key;
        } else if (child->bref.key == best->bref.key) {
                /*
-                * If our current best is flush with the child then choose
-                * based on delete_xid.
+                * If our current best is flush with the child then this
+                * is an illegal overlap.
                 *
                 * key_next will automatically be limited to the smaller of
                 * the two end-points.
                 */
-               if (child->delete_xid > best->delete_xid)
-                       info->best = child;
+               KKASSERT(0);
+               info->best = child;
        } else {
                /*
                 * Keep the current best but truncate key_next to the child's
-                * base iff the child has a higher delete_xid.
+                * base.
                 *
                 * key_next will also automatically be limited to the smaller
                 * of the two end-points (probably not necessary for this case
                 * but we do it anyway).
                 */
-               if (child->delete_xid >= best->delete_xid &&
-                   (info->key_next > child->bref.key || info->key_next == 0))
+               if (info->key_next > child->bref.key || info->key_next == 0)
                        info->key_next = child->bref.key;
        }
 
@@ -1769,9 +1430,7 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
 
 /*
  * Retrieve the specified chain from a media blockref, creating the
- * in-memory chain structure which reflects it.  modify_xid will be
- * set to the min value which forces any modifications to issue a
- * delete-duplicate.
+ * in-memory chain structure which reflects it.
  *
  * To handle insertion races pass the INSERT_RACE flag along with the
  * generation number of the core.  NULL will be returned if the generation
@@ -1789,7 +1448,6 @@ hammer2_chain_get(hammer2_chain_t *parent, int generation,
                  hammer2_blockref_t *bref)
 {
        hammer2_mount_t *hmp = parent->hmp;
-       hammer2_chain_core_t *above = parent->core;
        hammer2_chain_t *chain;
        int error;
 
@@ -1801,15 +1459,13 @@ hammer2_chain_get(hammer2_chain_t *parent, int generation,
                chain = hammer2_chain_alloc(hmp, NULL, NULL, bref);
        else
                chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
-       hammer2_chain_core_alloc(NULL, chain, NULL);
+       hammer2_chain_core_alloc(NULL, chain);
        /* ref'd chain returned */
 
        /*
-        * Set modify_xid and update_xlo to the chain's synchronization
-        * point from the media.
+        * Flag that the chain is in the parent's blockmap so delete/flush
+        * knows what to do with it.
         */
-       chain->modify_xid = HAMMER2_XID_MIN;
-       chain->update_xlo = HAMMER2_XID_MIN;
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
 
        /*
@@ -1819,14 +1475,12 @@ hammer2_chain_get(hammer2_chain_t *parent, int generation,
         * a shared lock on the parent.
         */
        KKASSERT(parent->refs > 0);
-       error = hammer2_chain_insert(above, NULL, chain,
+       error = hammer2_chain_insert(parent, chain,
                                     HAMMER2_CHAIN_INSERT_SPIN |
                                     HAMMER2_CHAIN_INSERT_RACE,
                                     generation);
        if (error) {
-               KKASSERT((chain->flags & (HAMMER2_CHAIN_ONRBTREE |
-                                         HAMMER2_CHAIN_ONDBTREE |
-                                         HAMMER2_CHAIN_ONDBQ)) == 0);
+               KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
                kprintf("chain %p get race\n", chain);
                hammer2_chain_drop(chain);
                chain = NULL;
@@ -1868,52 +1522,23 @@ hammer2_chain_t *
 hammer2_chain_getparent(hammer2_chain_t **parentp, int how)
 {
        hammer2_chain_t *oparent;
-       hammer2_chain_t *bparent;
        hammer2_chain_t *nparent;
-       hammer2_chain_core_t *above;
-
-       oparent = *parentp;
-       above = oparent->above;
-
-       spin_lock(&above->cst.spin);
-       bparent = TAILQ_FIRST(&above->ownerq);
-       hammer2_chain_ref(bparent);
 
        /*
         * Be careful of order, oparent must be unlocked before nparent
-        * is locked below to avoid a deadlock.  We might as well delay its
-        * unlocking until we conveniently no longer have the spinlock (instead
-        * of cycling the spinlock).
-        *
-        * Theoretically our ref on bparent should prevent elements of the
-        * following chain from going away and prevent above from going away,
-        * but we still need the spinlock to safely scan the list.
+        * is locked below to avoid a deadlock.
         */
-       for (;;) {
-               nparent = bparent;
-               while (nparent->flags & HAMMER2_CHAIN_DUPLICATED)
-                       nparent = TAILQ_NEXT(nparent, core_entry);
-               hammer2_chain_ref(nparent);
-               spin_unlock(&above->cst.spin);
-
-               if (oparent) {
-                       hammer2_chain_unlock(oparent);
-                       oparent = NULL;
-               }
-               hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
-               hammer2_chain_drop(bparent);
-
-               /*
-                * We might have raced a delete-duplicate.
-                */
-               if ((nparent->flags & HAMMER2_CHAIN_DUPLICATED) == 0)
-                       break;
-               bparent = nparent;
-               hammer2_chain_ref(bparent);
-               hammer2_chain_unlock(nparent);
-               spin_lock(&above->cst.spin);
-               /* retry */
+       oparent = *parentp;
+       spin_lock(&oparent->core.cst.spin);
+       nparent = oparent->parent;
+       hammer2_chain_ref(nparent);
+       spin_unlock(&oparent->core.cst.spin);
+       if (oparent) {
+               hammer2_chain_unlock(oparent);
+               oparent = NULL;
        }
+
+       hammer2_chain_lock(nparent, how | HAMMER2_RESOLVE_NOREF);
        *parentp = nparent;
 
        return (nparent);
@@ -1965,14 +1590,12 @@ hammer2_chain_lookup(hammer2_chain_t **parentp, hammer2_key_t *key_nextp,
        hammer2_blockref_t bcopy;
        hammer2_key_t scan_beg;
        hammer2_key_t scan_end;
-       hammer2_chain_core_t *above;
        int count = 0;
        int how_always = HAMMER2_RESOLVE_ALWAYS;
        int how_maybe = HAMMER2_RESOLVE_MAYBE;
        int how;
        int generation;
        int maxloops = 300000;
-       int wasdup;
 
        *ddflagp = 0;
        if (flags & HAMMER2_LOOKUP_ALWAYS) {
@@ -2085,31 +1708,30 @@ again:
        /*
         * Merged scan to find next candidate.
         *
-        * hammer2_base_*() functions require the above->live_* fields
+        * hammer2_base_*() functions require the parent->core.live_* fields
         * to be synchronized.
         *
         * We need to hold the spinlock to access the block array and RB tree
         * and to interlock chain creation.
         */
-       above = parent->core;
-       if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
+       if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(parent, base, count);
 
        /*
         * Combined search
         */
-       spin_lock(&above->cst.spin);
+       spin_lock(&parent->core.cst.spin);
        chain = hammer2_combined_find(parent, base, count,
                                      cache_indexp, key_nextp,
                                      key_beg, key_end,
                                      &bref);
-       generation = above->generation;
+       generation = parent->core.generation;
 
        /*
         * Exhausted parent chain, iterate.
         */
        if (bref == NULL) {
-               spin_unlock(&above->cst.spin);
+               spin_unlock(&parent->core.cst.spin);
                if (key_beg == key_end) /* short cut single-key case */
                        return (NULL);
 
@@ -2138,7 +1760,7 @@ again:
         */
        if (chain == NULL) {
                bcopy = *bref;
-               spin_unlock(&above->cst.spin);
+               spin_unlock(&parent->core.cst.spin);
                chain = hammer2_chain_get(parent, generation,
                                          &bcopy);
                if (chain == NULL) {
@@ -2150,11 +1772,9 @@ again:
                        hammer2_chain_drop(chain);
                        goto again;
                }
-               wasdup = 0;
        } else {
                hammer2_chain_ref(chain);
-               wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0);
-               spin_unlock(&above->cst.spin);
+               spin_unlock(&parent->core.cst.spin);
        }
 
        /*
@@ -2183,11 +1803,9 @@ again:
         */
        if (chain->flags & HAMMER2_CHAIN_DELETED) {
                hammer2_chain_unlock(chain);
-               if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) {
-                       key_beg = *key_nextp;
-                       if (key_beg == 0 || key_beg > key_end)
-                               return(NULL);
-               }
+               key_beg = *key_nextp;
+               if (key_beg == 0 || key_beg > key_end)
+                       return(NULL);
                goto again;
        }
 
@@ -2316,7 +1934,6 @@ hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain,
        hammer2_blockref_t *base;
        hammer2_blockref_t *bref;
        hammer2_blockref_t bcopy;
-       hammer2_chain_core_t *above;
        hammer2_key_t key;
        hammer2_key_t next_key;
        int count = 0;
@@ -2325,7 +1942,6 @@ hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain,
        int how;
        int generation;
        int maxloops = 300000;
-       int wasdup;
 
        hmp = parent->hmp;
 
@@ -2411,29 +2027,28 @@ again:
        /*
         * Merged scan to find next candidate.
         *
-        * hammer2_base_*() functions require the above->live_* fields
+        * hammer2_base_*() functions require the parent->core.live_* fields
         * to be synchronized.
         *
         * We need to hold the spinlock to access the block array and RB tree
         * and to interlock chain creation.
         */
-       if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
+       if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(parent, base, count);
 
-       above = parent->core;
        next_key = 0;
-       spin_lock(&above->cst.spin);
+       spin_lock(&parent->core.cst.spin);
        chain = hammer2_combined_find(parent, base, count,
                                      cache_indexp, &next_key,
                                      key, HAMMER2_KEY_MAX,
                                      &bref);
-       generation = above->generation;
+       generation = parent->core.generation;
 
        /*
         * Exhausted parent chain, we're done.
         */
        if (bref == NULL) {
-               spin_unlock(&above->cst.spin);
+               spin_unlock(&parent->core.cst.spin);
                KKASSERT(chain == NULL);
                goto done;
        }
@@ -2443,7 +2058,7 @@ again:
         */
        if (chain == NULL) {
                bcopy = *bref;
-               spin_unlock(&above->cst.spin);
+               spin_unlock(&parent->core.cst.spin);
                chain = hammer2_chain_get(parent, generation, &bcopy);
                if (chain == NULL) {
                        kprintf("retry scan parent %p keys %016jx\n",
@@ -2455,11 +2070,9 @@ again:
                        chain = NULL;
                        goto again;
                }
-               wasdup = 0;
        } else {
                hammer2_chain_ref(chain);
-               wasdup = ((chain->flags & HAMMER2_CHAIN_DUPLICATED) != 0);
-               spin_unlock(&above->cst.spin);
+               spin_unlock(&parent->core.cst.spin);
        }
 
        /*
@@ -2488,11 +2101,9 @@ again:
                hammer2_chain_unlock(chain);
                chain = NULL;
 
-               if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0 || wasdup) {
-                       key = next_key;
-                       if (key == 0)
-                               goto done;
-               }
+               key = next_key;
+               if (key == 0)
+                       goto done;
                goto again;
        }
 
@@ -2519,9 +2130,7 @@ done:
  *
  * (*chainp) usually starts out NULL and returns the newly created chain,
  * but if the caller desires the caller may allocate a disconnected chain
- * and pass it in instead.  (It is also possible for the caller to use
- * chain_duplicate() to create a disconnected chain, manipulate it, then
- * pass it into this function to insert it).
+ * and pass it in instead.
  *
  * This function should NOT be used to insert INDIRECT blocks.  It is
  * typically used to create/insert inodes and data blocks.
@@ -2539,8 +2148,7 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
 {
        hammer2_mount_t *hmp;
        hammer2_chain_t *chain;
-       hammer2_chain_t *parent = *parentp;
-       hammer2_chain_core_t *above;
+       hammer2_chain_t *parent;
        hammer2_blockref_t *base;
        hammer2_blockref_t dummy;
        int allocated = 0;
@@ -2551,8 +2159,8 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
        /*
         * Topology may be crossing a PFS boundary.
         */
-       above = parent->core;
-       KKASSERT(ccms_thread_lock_owned(&above->cst));
+       parent = *parentp;
+       KKASSERT(ccms_thread_lock_owned(&parent->core.cst));
        hmp = parent->hmp;
        chain = *chainp;
 
@@ -2570,7 +2178,7 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                dummy.data_off = hammer2_getradix(bytes);
                dummy.methods = parent->bref.methods;
                chain = hammer2_chain_alloc(hmp, pmp, trans, &dummy);
-               hammer2_chain_core_alloc(trans, chain, NULL);
+               hammer2_chain_core_alloc(trans, chain);
 
                /*
                 * Lock the chain manually, chain_lock will load the chain
@@ -2578,7 +2186,7 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                 * to 1 by chain_alloc() for us, but lockcnt is not).
                 */
                chain->lockcnt = 1;
-               ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE);
+               ccms_thread_lock(&chain->core.cst, CCMS_STATE_EXCLUSIVE);
                allocated = 1;
 
                /*
@@ -2620,26 +2228,18 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                }
        } else {
                /*
-                * We are reattaching a chain that has been duplicated and
-                * left disconnected under a DIFFERENT parent with potentially
-                * different key/keybits.
-                *
-                * The chain must be modified in the current transaction
-                * (the duplication code should have done that for us),
-                * and it's modify_xid should be greater than the parent's
-                * bref.mirror_tid.  This should cause it to be created under
-                * the new parent.
-                *
-                * If deleted in the same transaction, the create/delete TIDs
-                * will be the same and effective the chain will not have
-                * existed at all from the point of view of the parent.
+                * We are reattaching a previously deleted chain, possibly
+                * under a new parent and possibly with a new key/keybits.
+                * The chain does not have to be in a modified state.  The
+                * UPDATE flag will be set later on in this routine.
                 *
                 * Do NOT mess with the current state of the INITIAL flag.
                 */
-               KKASSERT(chain->modify_xid == trans->sync_xid);
                chain->bref.key = key;
                chain->bref.keybits = keybits;
-               KKASSERT(chain->above == NULL);
+               if (chain->flags & HAMMER2_CHAIN_DELETED)
+                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
+               KKASSERT(chain->parent == NULL);
        }
 
        /*
@@ -2649,7 +2249,6 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
 again:
        if (--maxloops == 0)
                panic("hammer2_chain_create: maxloops");
-       above = parent->core;
 
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
@@ -2688,10 +2287,11 @@ again:
        /*
         * Make sure we've counted the brefs
         */
-       if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
+       if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(parent, base, count);
 
-       KKASSERT(above->live_count >= 0 && above->live_count <= count);
+       KKASSERT(parent->core.live_count >= 0 &&
+                parent->core.live_count <= count);
 
        /*
         * If no free blockref could be found we must create an indirect
@@ -2702,7 +2302,7 @@ again:
         * This may return the new indirect block or the old parent depending
         * on where the key falls.  NULL is returned on error.
         */
-       if (above->live_count == count) {
+       if (parent->core.live_count == count) {
                hammer2_chain_t *nparent;
 
                nparent = hammer2_chain_create_indirect(trans, parent,
@@ -2724,10 +2324,10 @@ again:
        /*
         * Link the chain into its parent.
         */
-       if (chain->above != NULL)
+       if (chain->parent != NULL)
                panic("hammer2: hammer2_chain_create: chain already connected");
-       KKASSERT(chain->above == NULL);
-       hammer2_chain_insert(above, NULL, chain,
+       KKASSERT(chain->parent == NULL);
+       hammer2_chain_insert(parent, chain,
                             HAMMER2_CHAIN_INSERT_SPIN |
                             HAMMER2_CHAIN_INSERT_LIVE,
                             0);
@@ -2735,7 +2335,7 @@ again:
        if (allocated) {
                /*
                 * Mark the newly created chain modified.  This will cause
-                * FLUSH_CREATE to be set.
+                * UPDATE to be set.
                 *
                 * Device buffers are not instantiated for DATA elements
                 * as these are handled by logical buffers.
@@ -2751,7 +2351,7 @@ again:
                case HAMMER2_BREF_TYPE_DATA:
                case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                case HAMMER2_BREF_TYPE_INODE:
-                       hammer2_chain_modify(trans, &chain,
+                       hammer2_chain_modify(trans, chain,
                                             HAMMER2_MODIFY_OPTDATA |
                                             HAMMER2_MODIFY_ASSERTNOCOPY);
                        break;
@@ -2768,17 +2368,23 @@ again:
                }
        } else {
                /*
-                * When reconnecting a chain we must set FLUSH_CREATE and
-                * setsubmod so the flush recognizes that it must update
+                * When reconnecting a chain we must set UPDATE and
+                * setflush so the flush recognizes that it must update
                 * the bref in the parent.
                 */
-               if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
+               if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0) {
                        hammer2_chain_ref(chain);
-                       atomic_set_int(&chain->flags,
-                                      HAMMER2_CHAIN_FLUSH_CREATE);
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                }
        }
-       hammer2_chain_setsubmod(trans, chain);
+
+       /*
+        * We must setflush(parent) to ensure that it recurses through to
+        * chain.  setflush(chain) might not work because ONFLUSH is possibly
+        * already set in the chain (so it won't recurse up to set it in the
+        * parent).
+        */
+       hammer2_chain_setflush(trans, parent);
 
 done:
        *chainp = chain;
@@ -2787,12 +2393,13 @@ done:
 }
 
 /*
- * Replace (*chainp) with a duplicate in-memory chain structure which shares
- * the same core and media state as the orignal.  The original *chainp is
- * unlocked and the replacement will be returned locked.  The duplicated
- * chain is inserted under (*parentp).
+ * Move the chain from its old parent to a new parent.  The chain must have
+ * already been deleted or disconnected from (or never associated with) a
+ * parent.  The chain is reassociated with the new parent and the deleted
+ * flag will be cleared (no longer deleted).  The chain's modification state
+ * is not altered.
  *
- * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
+ * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (parent) TO THE INSERTION
  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
  * FULL.  This typically means that the caller is creating the chain after
  * doing a hammer2_chain_lookup().
@@ -2801,10 +2408,6 @@ done:
  * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields
  * from a passed-in bref and uses the old chain's bref for everything else.
  *
- * The old chain must be in a DELETED state unless snapshot is non-zero.
- *
- * The new chain will be live (i.e. not deleted), and modified.
- *
  * If (parent) is non-NULL then the new duplicated chain is inserted under
  * the parent.
  *
@@ -2812,429 +2415,177 @@ done:
  * anywhere, similar to if it had just been chain_alloc()'d (suitable for
  * passing into hammer2_chain_create() after this function returns).
  *
- * WARNING! This function cannot take snapshots all by itself.  The caller
- *         needs to do other massaging for snapshots.
- *
  * WARNING! This function calls create which means it can insert indirect
- *         blocks.  Callers may have to refactor locked chains held across
- *         the call (other than the ones passed into the call).
+ *         blocks.  This can cause other unrelated chains in the parent to
+ *         be moved to a newly inserted indirect block in addition to the
+ *         specific chain.
  */
 void
-hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
-                       hammer2_chain_t **chainp, hammer2_blockref_t *bref,
-                       int snapshot, int duplicate_reason)
+hammer2_chain_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
+                    hammer2_chain_t **parentp, hammer2_chain_t *chain)
 {
        hammer2_mount_t *hmp;
        hammer2_chain_t *parent;
-       hammer2_chain_t *ochain;
-       hammer2_chain_t *nchain;
-       hammer2_chain_core_t *above;
        size_t bytes;
 
        /*
-        * We want nchain to be our go-to live chain, but ochain may be in
-        * a MODIFIED state within the current flush synchronization segment.
-        * Force any further modifications of ochain to do another COW
-        * operation even if modify_xid indicates that one is not needed.
-        *
-        * We don't want to set FORCECOW on nchain simply as an optimization,
-        * as many duplication calls simply move chains into ichains and
-        * then delete the original.
-        *
         * WARNING!  We should never resolve DATA to device buffers
         *           (XXX allow it if the caller did?), and since
         *           we currently do not have the logical buffer cache
         *           buffer in-hand to fix its cached physical offset
         *           we also force the modify code to not COW it. XXX
         */
-       ochain = *chainp;
-       hmp = ochain->hmp;
-       KKASSERT(snapshot == 1 || (ochain->flags & HAMMER2_CHAIN_DELETED));
+       hmp = chain->hmp;
+       KKASSERT(chain->parent == NULL);
 
        /*
         * Now create a duplicate of the chain structure, associating
         * it with the same core, making it the same size, pointing it
         * to the same bref (the same media block).
-        *
-        * Give nchain the same modify_xid that we previously ensured was
-        * sufficiently advanced to trigger a block table insertion on flush.
-        *
-        * nchain copies ochain's data and must inherit ochain->update_xlo.
-        *
-        * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
-        *       hammer2_chain_alloc()
         */
        if (bref == NULL)
-               bref = &ochain->bref;
-       if (snapshot) {
-               nchain = hammer2_chain_alloc(hmp, NULL, trans, bref);
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SNAPSHOT);
-       } else {
-               nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, bref);
-       }
-       hammer2_chain_core_alloc(trans, nchain, ochain);
+               bref = &chain->bref;
        bytes = (hammer2_off_t)1 <<
                (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
-       nchain->bytes = bytes;
-       nchain->modify_xid = ochain->modify_xid;
-       nchain->update_xlo = ochain->update_xlo;
-       nchain->inode_reason = ochain->inode_reason + 0x100000;
-       atomic_set_int(&nchain->flags,
-                      ochain->flags & (HAMMER2_CHAIN_INITIAL |
-                                       HAMMER2_CHAIN_FORCECOW |
-                                       HAMMER2_CHAIN_UNLINKED |
-                                       HAMMER2_CHAIN_PFSROOT |
-                                       HAMMER2_CHAIN_PFSBOUNDARY));
-       if (ochain->modify_xid == trans->sync_xid)
-               atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
-
-       /*
-        * Switch from ochain to nchain
-        */
-       hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER |
-                                  HAMMER2_RESOLVE_NOREF);
-       /* nchain has 1 ref */
-       hammer2_chain_unlock(ochain);
-
-       /*
-        * Place nchain in the modified state, instantiate media data
-        * if necessary.  Because modify_xid is already completely
-        * synchronized this should not result in a delete-duplicate.
-        *
-        * We want nchain at the target to look like a new insertion.
-        * Forcing the modification to be INPLACE accomplishes this
-        * because we get the same nchain with an updated modify_xid.
-        */
-       if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
-               hammer2_chain_modify(trans, &nchain,
-                                    HAMMER2_MODIFY_OPTDATA |
-                                    HAMMER2_MODIFY_NOREALLOC |
-                                    HAMMER2_MODIFY_INPLACE);
-       } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
-               hammer2_chain_modify(trans, &nchain,
-                                    HAMMER2_MODIFY_OPTDATA |
-                                    HAMMER2_MODIFY_INPLACE);
-       } else {
-               hammer2_chain_modify(trans, &nchain,
-                                    HAMMER2_MODIFY_INPLACE);
-       }
 
        /*
         * If parent is not NULL the duplicated chain will be entered under
-        * the parent and the FLUSH_CREATE bit set to tell flush to update
+        * the parent and the UPDATE bit set to tell flush to update
         * the blockref.
         *
+        * We must setflush(parent) to ensure that it recurses through to
+        * chain.  setflush(chain) might not work because ONFLUSH is possibly
+        * already set in the chain (so it won't recurse up to set it in the
+        * parent).
+        *
         * Having both chains locked is extremely important for atomicy.
         */
        if (parentp && (parent = *parentp) != NULL) {
-               above = parent->core;
-               KKASSERT(ccms_thread_lock_owned(&above->cst));
-               KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
+               KKASSERT(ccms_thread_lock_owned(&parent->core.cst));
                KKASSERT(parent->refs > 0);
 
-               hammer2_chain_create(trans, parentp, &nchain, nchain->pmp,
-                                    nchain->bref.key, nchain->bref.keybits,
-                                    nchain->bref.type, nchain->bytes);
-               parent = NULL;
-
-               KKASSERT(nchain->flags & HAMMER2_CHAIN_FLUSH_CREATE);
-               hammer2_chain_setsubmod(trans, nchain);
+               hammer2_chain_create(trans, parentp, &chain, chain->pmp,
+                                    bref->key, bref->keybits, bref->type,
+                                    chain->bytes);
+               KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
+               hammer2_chain_setflush(trans, *parentp);
        }
-
-       *chainp = nchain;
 }
 
 /*
  * Helper function for deleting chains.
  *
- * The chain is removed from the live view (the RBTREE).
- *
- * If appropriate, the chain is added to the shadow topology and FLUSH_DELETE
- * is set for flusher visbility.  The caller is responsible for calling
- * setsubmod on chain, so we do not adjust update_xhi here.
+ * The chain is removed from the live view (the RBTREE) as well as the parent's
+ * blockmap.  Both chain and its parent must be locked.
  */
 static void
 _hammer2_chain_delete_helper(hammer2_trans_t *trans,
-                            hammer2_chain_core_t *above,
-                            hammer2_chain_t *chain)
+                            hammer2_chain_t *parent, hammer2_chain_t *chain)
 {
        hammer2_mount_t *hmp;
-       hammer2_chain_t *xchain;
-
-       KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
-       KKASSERT(trans->sync_xid >= chain->modify_xid);
-       KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
-                                 HAMMER2_CHAIN_ONDBQ |
-                                 HAMMER2_CHAIN_ONDBTREE |
-                                 HAMMER2_CHAIN_FLUSH_DELETE)) == 0);
 
-       /*
-        * Flag as deleted, reduce live_count and bump the above core's
-        * generation.
-        */
-       chain->delete_xid = trans->sync_xid;
-       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
-       atomic_add_int(&above->live_count, -1);
-       ++above->generation;
+       KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
        hmp = chain->hmp;
 
-       /*
-        * Remove from live tree
-        */
-       RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
-       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
-
        if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
                /*
-                * If the chain was originally bmapped we must place on the
-                * deleted tree and set FLUSH_DELETE (+ref) to prevent
-                * destruction of the chain until the flush can reconcile
-                * the parent's block table.
-                *
-                * NOTE! DBTREE is only representitive of the live view,
-                *       the flush must check both DBTREE and DBQ.
+                * Chain is blockmapped, so there must be a parent.
+                * Atomically remove the chain from the parent and remove
+                * the blockmap entry.
                 */
-               xchain = RB_INSERT(hammer2_chain_tree, &above->dbtree, chain);
-               KKASSERT(xchain == NULL);
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONDBTREE);
+               hammer2_blockref_t *base;
+               int count;
+
+               KKASSERT(parent != NULL);
+               KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
+               hammer2_chain_modify(trans, parent,
+                                    HAMMER2_MODIFY_OPTDATA);
 
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
-               hammer2_chain_ref(chain);
-       } else {
                /*
-                * If the chain no longer (and never had) an actual blockmap
-                * entry we must place it on the dbq list and set FLUSH_DELETE
-                * (+ref) to prevent destruction of the chain until the flush
-                * can reconcile the parent's block table.
-                *
-                * NOTE! DBTREE is only representitive of the live view,
-                *       the flush must check both DBTREE and DBQ.
+                * Calculate blockmap pointer
                 */
-               TAILQ_INSERT_TAIL(&above->dbq, chain, db_entry);
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONDBQ);
-
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
-               hammer2_chain_ref(chain);
-       }
-}
-
-/*
- * Special in-place delete-duplicate sequence which does not require a
- * locked parent.  (*chainp) is marked DELETED and atomically replaced
- * with a duplicate.  Atomicy is at the very-fine spin-lock level in
- * order to ensure that lookups do not race us.
- *
- * The flush code will sometimes call this function with a deleted chain.
- * In this situation the old chain's memory is reallocated without
- * duplicating it.
- *
- * The new chain will be marked modified for the current transaction.
- */
-void
-hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
-                              int flags)
-{
-       hammer2_mount_t *hmp;
-       hammer2_chain_t *ochain;
-       hammer2_chain_t *nchain;
-       hammer2_chain_core_t *above;
-       size_t bytes;
-       uint32_t oflags;
-
-       if (hammer2_debug & 0x20000)
-               Debugger("dd");
-
-       /*
-        * Note that we do not have to call setsubmod on ochain, calling it
-        * on nchain is sufficient.
-        */
-       ochain = *chainp;
-       oflags = ochain->flags;         /* flags prior to core_alloc mods */
-       hmp = ochain->hmp;
-
-       if (ochain->bref.type == HAMMER2_BREF_TYPE_INODE) {
-               KKASSERT(ochain->data);
-       }
-
-       /*
-        * First create a duplicate of the chain structure.
-        * (nchain is allocated with one ref).
-        *
-        * In the case where nchain inherits ochains core, nchain is
-        * effectively locked due to ochain being locked (and sharing the
-        * core), until we can give nchain its own official ock.
-        *
-        * WARNING! Flusher concurrency can create two cases.  The first is
-        *          that the flusher might be working on a chain that has
-        *          been deleted in the live view but is live in the flusher's
-        *          view.  In the second case the flusher may be duplicating
-        *          a forward-transacted chain.  In both situations nchain
-        *          must be marked deleted.
-        *
-        * WARNING! hammer2_chain_core_alloc() also acts on these issues.
-        */
-       nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, &ochain->bref);
-       if ((ochain->flags & HAMMER2_CHAIN_DELETED) ||
-           (ochain->modify_xid > trans->sync_xid)) {
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
-       }
-       if (flags & HAMMER2_DELDUP_RECORE)
-               hammer2_chain_core_alloc(trans, nchain, NULL);
-       else
-               hammer2_chain_core_alloc(trans, nchain, ochain);
-       above = ochain->above;
-
-       bytes = (hammer2_off_t)1 <<
-               (int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
-       nchain->bytes = bytes;
-
-       /*
-        * nchain inherits ochain's live state including its modification
-        * state.  This function disposes of the original.  Because we are
-        * doing this in-place under the same parent the block array
-        * inserted/deleted state does not change.
-        *
-        * nchain copies ochain's data and must inherit ochain->update_xlo.
-        *
-        * If ochain was previously marked FORCECOW we also flag nchain
-        * FORCECOW (used during hardlink splits).  FORCECOW forces a
-        * reallocation of the block when we modify the chain a little later,
-        * it does not force another delete-duplicate.
-        *
-        * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
-        *       hammer2_chain_alloc()
-        */
-       nchain->data_count += ochain->data_count;
-       nchain->inode_count += ochain->inode_count;
-       atomic_set_int(&nchain->flags,
-                      ochain->flags & (HAMMER2_CHAIN_INITIAL |
-                                       HAMMER2_CHAIN_FORCECOW |
-                                       HAMMER2_CHAIN_UNLINKED |
-                                       HAMMER2_CHAIN_PFSROOT |
-                                       HAMMER2_CHAIN_PFSBOUNDARY));
-       if (ochain->modify_xid == trans->sync_xid)
-               atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
-       nchain->inode_reason = ochain->inode_reason + 0x1000;
-       nchain->update_xlo = ochain->update_xlo;
-
-       /*
-        * Lock nchain so both chains are now locked (extremely important
-        * for atomicy).  The shared core allows us to unlock ochain without
-        * actually unlocking ochain.
-        */
-       hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
-       /* extra ref still present from original allocation */
-
-       KKASSERT(ochain->flags & (HAMMER2_CHAIN_ONRBTREE |
-                                 HAMMER2_CHAIN_ONDBTREE |
-                                 HAMMER2_CHAIN_ONDBQ));
-       spin_lock(&above->cst.spin);
+               KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
+               spin_lock(&parent->core.cst.spin);
 
-       nchain->modify_xid = ochain->modify_xid;
-       nchain->delete_xid = HAMMER2_XID_MAX;
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
+               atomic_add_int(&parent->core.live_count, -1);
+               ++parent->core.generation;
+               RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
+               --parent->core.chain_count;
+               chain->parent = NULL;
 
-       if ((nchain->flags & HAMMER2_CHAIN_DELETED) &&
-           (oflags & HAMMER2_CHAIN_DUPLICATED)) {
-               /*
-                * Special case, used by the flush code when a chain which
-                * has been delete-duplicated is visible (effectively 'live')
-                * in the flush code.
-                *
-                * In this situations nchain will be marked deleted and
-                * insert before ochain.  nchain must inherit certain features
-                * of ochain.
-                */
-               KKASSERT(trans->flags & HAMMER2_TRANS_ISFLUSH);
-               KKASSERT(ochain->modify_xid < trans->sync_xid);
-               KKASSERT(ochain->delete_xid > trans->sync_xid);
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_TEMPORARY);
-               hammer2_chain_insert(above, ochain, nchain, 0, 0);
-
-               if ((ochain->flags & HAMMER2_CHAIN_DELETED) &&
-                   ochain->modify_xid < trans->sync_xid) {
-                       nchain->delete_xid = ochain->delete_xid;
-                       ochain->delete_xid = trans->sync_xid;
-               } else if (ochain->modify_xid > trans->sync_xid) {
-                       nchain->delete_xid = ochain->modify_xid;
+               switch(parent->bref.type) {
+               case HAMMER2_BREF_TYPE_INODE:
+                       /*
+                        * Access the inode's block array.  However, there
+                        * is no block array if the inode is flagged
+                        * DIRECTDATA.  The DIRECTDATA case typically only
+                        * occurs when a hardlink has been shifted up the
+                        * tree and the original inode gets replaced with
+                        * an OBJTYPE_HARDLINK placeholding inode.
+                        */
+                       if (parent->data &&
+                           (parent->data->ipdata.op_flags &
+                            HAMMER2_OPFLAG_DIRECTDATA) == 0) {
+                               base =
+                                  &parent->data->ipdata.u.blockset.blockref[0];
+                       } else {
+                               base = NULL;
+                       }
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+               case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+                       if (parent->data)
+                               base = &parent->data->npdata[0];
+                       else
+                               base = NULL;
+                       count = parent->bytes / sizeof(hammer2_blockref_t);
+                       break;
+               case HAMMER2_BREF_TYPE_VOLUME:
+                       base = &hmp->voldata.sroot_blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_FREEMAP:
+                       base = &parent->data->npdata[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               default:
+                       base = NULL;
+                       count = 0;
+                       panic("_hammer2_chain_delete_helper: "
+                             "unrecognized blockref type: %d",
+                             parent->bref.type);
+               }
+               if (base) {
+                       int cache_index = -1;
+                       hammer2_base_delete(trans, parent, base, count,
+                                           &cache_index, chain);
                }
-       } else if (nchain->flags & HAMMER2_CHAIN_DELETED) {
+               spin_unlock(&parent->core.cst.spin);
+       } else if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
                /*
-                * ochain is 'live' with respect to not having been D-D'd,
-                * but is flagged DELETED.  Sometimes updates to deleted
-                * chains must be allowed due to references which still exist
-                * on those chains, or due to a flush trying to retire a
-                * logical buffer cache buffer.
-                *
-                * In this situation the D-D operates normally, except
-                * ochain has already been deleted and nchain is also
-                * marked deleted.
+                * Chain is not blockmapped but a parent is present.
+                * Atomically remove the chain from the parent.  There is
+                * no blockmap entry to remove.
                 */
-               hammer2_chain_insert(above, ochain, nchain, 0, 0);
-               nchain->delete_xid = trans->sync_xid;
+               spin_lock(&parent->core.cst.spin);
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
+               atomic_add_int(&parent->core.live_count, -1);
+               ++parent->core.generation;
+               RB_REMOVE(hammer2_chain_tree, &parent->core.rbtree, chain);
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
+               --parent->core.chain_count;
+               chain->parent = NULL;
+               spin_unlock(&parent->core.cst.spin);
        } else {
                /*
-                * Normal case, delete-duplicate deletes ochain and nchain
-                * is the new live chain.
+                * Chain is not blockmapped and has no parent.  This
+                * is a degenerate case.
                 */
-               _hammer2_chain_delete_helper(trans, above, ochain);
-               hammer2_chain_insert(above, ochain, nchain,
-                                    HAMMER2_CHAIN_INSERT_LIVE, 0);
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
        }
-       spin_unlock(&above->cst.spin);
-
-       /*
-        * ochain must be unlocked because ochain and nchain might share
-        * a buffer cache buffer, so we need to release it so nchain can
-        * potentially obtain it.
-        */
-       hammer2_chain_setsubmod(trans, ochain);
-       hammer2_chain_unlock(ochain);
-
-       /*
-        * Finishing fixing up nchain.  A new block will be allocated if
-        * crossing a synchronization point (meta-data only).
-        *
-        * Calling hammer2_chain_modify() will update modify_xid to
-        * (typically) trans->sync_xid.
-        */
-       if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
-               hammer2_chain_modify(trans, &nchain,
-                                    HAMMER2_MODIFY_OPTDATA |
-                                    HAMMER2_MODIFY_NOREALLOC |
-                                    HAMMER2_MODIFY_INPLACE);
-       } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
-               hammer2_chain_modify(trans, &nchain,
-                                    HAMMER2_MODIFY_OPTDATA |
-                                    HAMMER2_MODIFY_INPLACE);
-       } else {
-               hammer2_chain_modify(trans, &nchain,
-                                    HAMMER2_MODIFY_INPLACE);
-       }
-       hammer2_chain_drop(nchain);
-
-       /*
-        * Unconditionally set FLUSH_CREATE to force the parent blockrefs to
-        * update as the chain_modify() above won't necessarily do it.
-        */
-       if ((nchain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
-               hammer2_chain_ref(nchain);
-       }
-
-       /*
-        * If nchain is in a DELETED state we must set FLUSH_DELETE
-        */
-       if (nchain->flags & HAMMER2_CHAIN_DELETED)
-               KKASSERT((nchain->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0);
-#if 1
-       if ((nchain->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0 &&
-           (nchain->flags & HAMMER2_CHAIN_DELETED)) {
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
-               hammer2_chain_ref(nchain);
-       }
-#endif
-       hammer2_chain_setsubmod(trans, nchain);
-       *chainp = nchain;
 }
 
 /*
@@ -3291,8 +2642,6 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                              int for_type, int *errorp)
 {
        hammer2_mount_t *hmp;
-       hammer2_chain_core_t *above;
-       hammer2_chain_core_t *icore;
        hammer2_blockref_t *base;
        hammer2_blockref_t *bref;
        hammer2_blockref_t bcopy;
@@ -3311,8 +2660,6 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
        int reason;
        int generation;
        int maxloops = 300000;
-       int retry_same;
-       int wasdup;
 
        /*
         * Calculate the base blockref pointer or NULL if the chain
@@ -3321,8 +2668,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         */
        hmp = parent->hmp;
        *errorp = 0;
-       KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
-       above = parent->core;
+       KKASSERT(ccms_thread_lock_owned(&parent->core.cst));
 
        /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/
        if (parent->flags & HAMMER2_CHAIN_INITIAL) {
@@ -3381,7 +2727,6 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * dummy used in later chain allocation (no longer used for lookups).
         */
        bzero(&dummy, sizeof(dummy));
-       dummy.delete_xid = HAMMER2_XID_MAX;
 
        /*
         * When creating an indirect block for a freemap node or leaf
@@ -3435,8 +2780,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
 
        ichain = hammer2_chain_alloc(hmp, parent->pmp, trans, &dummy.bref);
        atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
-       hammer2_chain_core_alloc(trans, ichain, NULL);
-       icore = ichain->core;
+       hammer2_chain_core_alloc(trans, ichain);
        hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
        hammer2_chain_drop(ichain);     /* excess ref from alloc */
 
@@ -3445,7 +2789,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
         * it won't be acted upon by the flush code.
         */
-       hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);
+       hammer2_chain_modify(trans, ichain, HAMMER2_MODIFY_OPTDATA);
 
        /*
         * Iterate the original parent and move the matching brefs into
@@ -3456,14 +2800,13 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
        key_beg = 0;
        key_end = HAMMER2_KEY_MAX;
        cache_index = 0;
-       spin_lock(&above->cst.spin);
+       spin_lock(&parent->core.cst.spin);
        loops = 0;
        reason = 0;
-       retry_same = 0;
 
        for (;;) {
                if (++loops > 100000) {
-                   spin_unlock(&above->cst.spin);
+                   spin_unlock(&parent->core.cst.spin);
                    panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n",
                          reason, parent, base, count, key_next);
                }
@@ -3478,7 +2821,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                                              &cache_index, &key_next,
                                              key_beg, key_end,
                                              &bref);
-               generation = above->generation;
+               generation = parent->core.generation;
                if (bref == NULL)
                        break;
                key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
@@ -3506,9 +2849,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                         * Use chain already present in the RBTREE
                         */
                        hammer2_chain_ref(chain);
-                       wasdup = ((chain->flags &
-                                  HAMMER2_CHAIN_DUPLICATED) != 0);
-                       spin_unlock(&above->cst.spin);
+                       spin_unlock(&parent->core.cst.spin);
                        hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER |
                                                  HAMMER2_RESOLVE_NOREF);
                } else {
@@ -3517,27 +2858,27 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                         * on insertion race.
                         */
                        bcopy = *bref;
-                       spin_unlock(&above->cst.spin);
+                       spin_unlock(&parent->core.cst.spin);
                        chain = hammer2_chain_get(parent, generation, &bcopy);
                        if (chain == NULL) {
                                reason = 1;
-                               spin_lock(&above->cst.spin);
+                               spin_lock(&parent->core.cst.spin);
                                continue;
                        }
                        if (bcmp(&bcopy, bref, sizeof(bcopy))) {
+                               kprintf("REASON 2\n");
                                reason = 2;
                                hammer2_chain_drop(chain);
-                               spin_lock(&above->cst.spin);
+                               spin_lock(&parent->core.cst.spin);
                                continue;
                        }
                        hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER |
                                                  HAMMER2_RESOLVE_NOREF);
-                       wasdup = 0;
                }
 
                /*
-                * This is always live so if the chain has been delete-
-                * duplicated we raced someone and we have to retry.
+                * This is always live so if the chain has been deleted
+                * we raced someone and we have to retry.
                 *
                 * NOTE: Lookups can race delete-duplicate because
                 *       delete-duplicate does not lock the parent's core
@@ -3550,10 +2891,6 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                 */
                if (chain->flags & HAMMER2_CHAIN_DELETED) {
                        hammer2_chain_unlock(chain);
-                       if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) &&
-                           wasdup == 0) {
-                               retry_same = 1;
-                       }
                        goto next_key;
                }
 
@@ -3564,32 +2901,30 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                 *          Fortunately we have none (our locked chains are
                 *          passed into and modified by the call).
                 */
-               hammer2_chain_delete(trans, chain, 0);
-               hammer2_chain_duplicate(trans, &ichain, &chain, NULL, 0, 1);
+               hammer2_chain_delete(trans, parent, chain, 0);
+               hammer2_chain_rename(trans, NULL, &ichain, chain);
                hammer2_chain_unlock(chain);
                KKASSERT(parent->refs > 0);
                chain = NULL;
 next_key:
-               spin_lock(&above->cst.spin);
+               spin_lock(&parent->core.cst.spin);
 next_key_spinlocked:
                if (--maxloops == 0)
                        panic("hammer2_chain_create_indirect: maxloops");
                reason = 4;
-               if (retry_same == 0) {
-                       if (key_next == 0 || key_next > key_end)
-                               break;
-                       key_beg = key_next;
-               }
+               if (key_next == 0 || key_next > key_end)
+                       break;
+               key_beg = key_next;
                /* loop */
        }
-       spin_unlock(&above->cst.spin);
+       spin_unlock(&parent->core.cst.spin);
 
        /*
         * Insert the new indirect block into the parent now that we've
         * cleared out some entries in the parent.  We calculated a good
         * insertion index in the loop above (ichain->index).
         *
-        * We don't have to set FLUSH_CREATE here because we mark ichain
+        * We don't have to set UPDATE here because we mark ichain
         * modified down below (so the normal modified -> flush -> set-moved
         * sequence applies).
         *
@@ -3597,20 +2932,16 @@ next_key_spinlocked:
         * and the parent is locked.
         */
        KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
-       hammer2_chain_insert(above, NULL, ichain,
+       hammer2_chain_insert(parent, ichain,
                             HAMMER2_CHAIN_INSERT_SPIN |
                             HAMMER2_CHAIN_INSERT_LIVE,
                             0);
 
        /*
-        * Mark the new indirect block modified after insertion, which
-        * will propagate up through parent all the way to the root and
-        * also allocate the physical block in ichain for our caller,
-        * and assign ichain->data to a pre-zero'd space (because there
-        * is not prior data to copy into it).
+        * Make sure flushes propagate after our manual insertion.
         */
-       /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/
-       hammer2_chain_setsubmod(trans, ichain);
+       hammer2_chain_setflush(trans, ichain);
+       hammer2_chain_setflush(trans, parent);
 
        /*
         * Figure out what to return.
@@ -3649,7 +2980,6 @@ int
 hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
                             int keybits, hammer2_blockref_t *base, int count)
 {
-       hammer2_chain_core_t *above;
        hammer2_chain_t *chain;
        hammer2_blockref_t *bref;
        hammer2_key_t key;
@@ -3662,7 +2992,6 @@ hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
        int maxloops = 300000;
 
        key = *keyp;
-       above = parent->core;
        locount = 0;
        hicount = 0;
        keybits = 64;
@@ -3674,7 +3003,7 @@ hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
        key_beg = 0;
        key_end = HAMMER2_KEY_MAX;
        cache_index = 0;
-       spin_lock(&above->cst.spin);
+       spin_lock(&parent->core.cst.spin);
 
        for (;;) {
                if (--maxloops == 0) {
@@ -3693,8 +3022,7 @@ hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
                        break;
 
                /*
-                * NOTE: No need to check DUPLICATED here because we do
-                *       not release the spinlock.
+                * Skip deleted chains.
                 */
                if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
                        if (key_next == 0 || key_next > key_end)
@@ -3721,7 +3049,7 @@ hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
                        break;
                key_beg = key_next;
        }
-       spin_unlock(&above->cst.spin);
+       spin_unlock(&parent->core.cst.spin);
 
        /*
         * Return the keybits for a higher-level FREEMAP_NODE covering
@@ -3760,7 +3088,6 @@ static int
 hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
                            int keybits, hammer2_blockref_t *base, int count)
 {
-       hammer2_chain_core_t *above;
        hammer2_blockref_t *bref;
        hammer2_chain_t *chain;
        hammer2_key_t key_beg;
@@ -3774,7 +3101,6 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
        int maxloops = 300000;
 
        key = *keyp;
-       above = parent->core;
        locount = 0;
        hicount = 0;
 
@@ -3787,7 +3113,7 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
        key_beg = 0;
        key_end = HAMMER2_KEY_MAX;
        cache_index = 0;
-       spin_lock(&above->cst.spin);
+       spin_lock(&parent->core.cst.spin);
 
        for (;;) {
                if (--maxloops == 0) {
@@ -3874,7 +3200,7 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
                        break;
                key_beg = key_next;
        }
-       spin_unlock(&above->cst.spin);
+       spin_unlock(&parent->core.cst.spin);
        bref = NULL;    /* now invalid (safety) */
 
        /*
@@ -3916,60 +3242,49 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
 }
 
 /*
- * Sets CHAIN_DELETED and CHAIN_FLUSH_DELETE in the chain being deleted and
- * set chain->delete_xid.  The chain is not actually marked possibly-free
- * in the freemap until the deletion is completely flushed out (because
- * a flush which doesn't cover the entire deletion is flushing the deleted
- * chain as if it were live).
+ * Sets CHAIN_DELETED and removes the chain's blockref from the parent if
+ * it exists.
  *
- * This function does NOT generate a modification to the parent.  It
- * would be nearly impossible to figure out which parent to modify anyway.
- * Such modifications are handled top-down by the flush code and are
- * properly merged using the flush synchronization point.
+ * Both parent and chain must be locked exclusively.
  *
- * The find/get code will properly overload the RBTREE check on top of
- * the bref check to detect deleted entries.
+ * This function will modify the parent if the blockref requires removal
+ * from the parent's block table.
  *
  * This function is NOT recursive.  Any entity already pushed into the
  * chain (such as an inode) may still need visibility into its contents,
  * as well as the ability to read and modify the contents.  For example,
  * for an unlinked file which is still open.
- *
- * NOTE: Deletions normally do not occur in the middle of a duplication
- *      chain but we use a trick for hardlink migration that refactors
- *      the originating inode without deleting it, so we make no assumptions
- *      here.
  */
 void
-hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
+hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
+                    hammer2_chain_t *chain, int flags)
 {
-       KKASSERT(ccms_thread_lock_owned(&chain->core->cst));
+       KKASSERT(ccms_thread_lock_owned(&chain->core.cst));
 
        /*
         * Nothing to do if already marked.
-        */
-       if (chain->flags & HAMMER2_CHAIN_DELETED)
-               return;
-
-       /*
-        * The setting of DELETED causes finds, lookups, and _next iterations
-        * to no longer recognize the chain.  RB_SCAN()s will still have
-        * visibility (needed for flush serialization points).
         *
         * We need the spinlock on the core whos RBTREE contains chain
         * to protect against races.
         */
-       spin_lock(&chain->above->cst.spin);
-       _hammer2_chain_delete_helper(trans, chain->above, chain);
-       spin_unlock(&chain->above->cst.spin);
+       if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
+               KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
+                        chain->parent == parent);
+               _hammer2_chain_delete_helper(trans, parent, chain);
+       }
 
-       hammer2_chain_setsubmod(trans, chain);
+       if (flags & HAMMER2_DELETE_PERMANENT) {
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
+               hammer2_flush(trans, chain);
+       } else {
+               /* XXX might not be needed */
+               hammer2_chain_setflush(trans, chain);
+       }
 }
 
 /*
  * Returns the index of the nearest element in the blockref array >= elm.
- * Returns (count) if no element could be found.  If delete_filter is non-zero
- * the scan filters out any blockrefs which match deleted chains on dbtree.
+ * Returns (count) if no element could be found.
  *
  * Sets *key_nextp to the next key for loop purposes but does not modify
  * it if the next key would be higher than the current value of *key_nextp.
@@ -3981,14 +3296,12 @@ hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
  *
  * The spin lock on the related chain must be held.
  */
-int
+static int
 hammer2_base_find(hammer2_chain_t *parent,
                  hammer2_blockref_t *base, int count,
                  int *cache_indexp, hammer2_key_t *key_nextp,
-                 hammer2_key_t key_beg, hammer2_key_t key_end,
-                 int delete_filter)
+                 hammer2_key_t key_beg, hammer2_key_t key_end)
 {
-       hammer2_chain_core_t *core = parent->core;
        hammer2_blockref_t *scan;
        hammer2_key_t scan_end;
        int i;
@@ -3998,8 +3311,7 @@ hammer2_base_find(hammer2_chain_t *parent,
         * Require the live chain's already have their core's counted
         * so we can optimize operations.
         */
-        KKASSERT((parent->flags & HAMMER2_CHAIN_DUPLICATED) ||
-                core->flags & HAMMER2_CORE_COUNTEDBREFS);
+        KKASSERT(parent->core.flags & HAMMER2_CORE_COUNTEDBREFS);
 
        /*
         * Degenerate case
@@ -4016,10 +3328,7 @@ hammer2_base_find(hammer2_chain_t *parent,
         */
        i = *cache_indexp;
        cpu_ccfence();
-       if (parent->flags & HAMMER2_CHAIN_DUPLICATED)
-               limit = count;
-       else
-               limit = core->live_zero;
+       limit = parent->core.live_zero;
        if (i >= limit)
                i = limit - 1;
        if (i < 0)
@@ -4045,19 +3354,8 @@ hammer2_base_find(hammer2_chain_t *parent,
                if (scan->type != 0) {
                        scan_end = scan->key +
                                   ((hammer2_key_t)1 << scan->keybits) - 1;
-                       if (scan->key > key_beg || scan_end >= key_beg) {
-                               /*
-                                * Check to see if the entry is covered by
-                                * a deleted chain and ignore the entry if
-                                * it is and delete_filter != 0.
-                                */
-                               if (delete_filter == 0)
-                                       break;
-                               if (hammer2_chain_find_deleted(
-                                       parent, scan->key, scan_end) == NULL) {
-                                       break;
-                               }
-                       }
+                       if (scan->key > key_beg || scan_end >= key_beg)
+                               break;
                }
                if (i >= limit)
                        return (count);
@@ -4089,7 +3387,7 @@ hammer2_base_find(hammer2_chain_t *parent,
  * When no in-memory chain has been found and a non-NULL bref is returned
  * in *bresp.
  *
- * Must be called with above's spinlock held.  Spinlock remains held
+ * Must be called with parent's spinlock held.  Spinlock remains held
  * through the operation.
  *
  * The returned chain is not locked or referenced.  Use the returned bref
@@ -4112,7 +3410,7 @@ hammer2_combined_find(hammer2_chain_t *parent,
         */
        *key_nextp = key_end + 1;
        i = hammer2_base_find(parent, base, count, cache_indexp,
-                             key_nextp, key_beg, key_end, 1);
+                             key_nextp, key_beg, key_end);
        chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);
 
        /*
@@ -4149,15 +3447,6 @@ hammer2_combined_find(hammer2_chain_t *parent,
        if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
            chain->bref.key == base[i].key) {
                KKASSERT(chain->bref.key == base[i].key);
-               if ((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
-                       kprintf("chain not bmapped %p.%d %08x\n",
-                               chain, chain->bref.type, chain->flags);
-                       kprintf("in chain mod/del %08x %08x\n",
-                               chain->modify_xid, chain->delete_xid);
-                       kprintf("and updlo/hi %08x %08x\n",
-                               chain->update_xlo, chain->update_xhi);
-               }
-               KKASSERT(chain->flags & HAMMER2_CHAIN_BMAPPED);
                bref = &chain->bref;
                goto found;
        }
@@ -4197,10 +3486,9 @@ found:
 void
 hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
                    hammer2_blockref_t *base, int count,
-                   int *cache_indexp, hammer2_chain_t *child)
+                   int *cache_indexp, hammer2_chain_t *chain)
 {
-       hammer2_blockref_t *elm = &child->bref;
-       hammer2_chain_core_t *core = parent->core;
+       hammer2_blockref_t *elm = &chain->bref;
        hammer2_key_t key_next;
        int i;
 
@@ -4212,28 +3500,32 @@ hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
         */
        key_next = 0; /* max range */
        i = hammer2_base_find(parent, base, count, cache_indexp,
-                             &key_next, elm->key, elm->key, 0);
+                             &key_next, elm->key, elm->key);
        if (i == count || base[i].type == 0 ||
-           base[i].key != elm->key || base[i].keybits != elm->keybits) {
-               spin_unlock(&core->cst.spin);
-               panic("delete base %p element not found at %d/%d elm %p\n"
-                     "child ino_reason=%08x\n",
-                     base, i, count, elm,
-                     child->inode_reason);
+           base[i].key != elm->key ||
+           ((chain->flags & HAMMER2_CHAIN_BMAPUPD) == 0 &&
+            base[i].keybits != elm->keybits)) {
+               spin_unlock(&parent->core.cst.spin);
+               panic("delete base %p element not found at %d/%d elm %p\n",
+                     base, i, count, elm);
                return;
        }
        bzero(&base[i], sizeof(*base));
 
        /*
-        * We can only optimize core->live_zero for live chains.
+        * We can only optimize parent->core.live_zero for live chains.
         */
-       if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
-               if (core->live_zero == i + 1) {
-                       while (--i >= 0 && base[i].type == 0)
-                               ;
-                       core->live_zero = i + 1;
-               }
+       if (parent->core.live_zero == i + 1) {
+               while (--i >= 0 && base[i].type == 0)
+                       ;
+               parent->core.live_zero = i + 1;
        }
+
+       /*
+        * Clear appropriate blockmap flags in chain.
+        */
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
+                                       HAMMER2_CHAIN_BMAPUPD);
 }
 
 /*
@@ -4248,10 +3540,9 @@ hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
 void
 hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
                    hammer2_blockref_t *base, int count,
-                   int *cache_indexp, hammer2_chain_t *child)
+                   int *cache_indexp, hammer2_chain_t *chain)
 {
-       hammer2_blockref_t *elm = &child->bref;
-       hammer2_chain_core_t *core = parent->core;
+       hammer2_blockref_t *elm = &chain->bref;
        hammer2_key_t key_next;
        hammer2_key_t xkey;
        int i;
@@ -4269,7 +3560,7 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
         */
        key_next = 0; /* max range */
        i = hammer2_base_find(parent, base, count, cache_indexp,
-                             &key_next, elm->key, elm->key, 0);
+                             &key_next, elm->key, elm->key);
 
        /*
         * Shortcut fill optimization, typical ordered insertion(s) may not
@@ -4278,28 +3569,24 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
        KKASSERT(i >= 0 && i <= count);
 
        /*
-        * We can only optimize core->live_zero for live chains.
+        * Set appropriate blockmap flags in chain.
         */
-       if (i == count && core->live_zero < count) {
-               if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
-                       i = core->live_zero++;
-                       base[i] = *elm;
-                       return;
-               }
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
+
+       /*
+        * We can only optimize parent->core.live_zero for live chains.
+        */
+       if (i == count && parent->core.live_zero < count) {
+               i = parent->core.live_zero++;
+               base[i] = *elm;
+               return;
        }
 
        xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
        if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
-               if (child->flags & HAMMER2_CHAIN_FLUSH_TEMPORARY) {
-                       kprintf("child %p special replace\n", child);
-                       base[i] = *elm;
-                       return;
-               } else {
-                       spin_unlock(&core->cst.spin);
-                       panic("insert base %p overlapping "
-                             "elements at %d elm %p\n",
-                             base, i, elm);
-               }
+               spin_unlock(&parent->core.cst.spin);
+               panic("insert base %p overlapping elements at %d elm %p\n",
+                     base, i, elm);
        }
 
        /*
@@ -4326,13 +3613,11 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
                        base[i] = *elm;
 
                        /*
-                        * We can only update core->live_zero for live
+                        * We can only update parent->core.live_zero for live
                         * chains.
                         */
-                       if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
-                               if (core->live_zero <= k)
-                                       core->live_zero = k + 1;
-                       }
+                       if (parent->core.live_zero <= k)
+                               parent->core.live_zero = k + 1;
                        u = 2;
                        goto validate;
                }
@@ -4458,29 +3743,3 @@ hammer2_chain_wait(hammer2_chain_t *chain)
 {
        tsleep(chain, 0, "chnflw", 1);
 }
-
-/*
- * chain may have been moved around by the create.
- */
-void
-hammer2_chain_refactor(hammer2_chain_t **chainp)
-{
-       hammer2_chain_t *chain = *chainp;
-       hammer2_chain_core_t *core;
-
-       core = chain->core;
-       while (chain->flags & HAMMER2_CHAIN_DUPLICATED) {
-               spin_lock(&core->cst.spin);
-               chain = TAILQ_NEXT(chain, core_entry);
-               while (chain->flags & HAMMER2_CHAIN_DUPLICATED)
-                       chain = TAILQ_NEXT(chain, core_entry);
-               hammer2_chain_ref(chain);
-               spin_unlock(&core->cst.spin);
-               KKASSERT(chain->core == core);
-
-               hammer2_chain_unlock(*chainp);
-               hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS |
-                                         HAMMER2_RESOLVE_NOREF); /* eat ref */
-               *chainp = chain;
-       }
-}
index 9d5a8f1..2dd722d 100644 (file)
@@ -89,12 +89,6 @@ hammer2_cluster_modified(hammer2_cluster_t *cluster)
        return((cluster->focus->flags & HAMMER2_CHAIN_MODIFIED) != 0);
 }
 
-int
-hammer2_cluster_unlinked(hammer2_cluster_t *cluster)
-{
-       return((cluster->focus->flags & HAMMER2_CHAIN_UNLINKED) != 0);
-}
-
 /*
  * Return a bref representative of the cluster.  Any data offset is removed
  * (since it would only be applicable to a particular chain in the cluster).
@@ -123,7 +117,7 @@ hammer2_cluster_set_chainflags(hammer2_cluster_t *cluster, uint32_t flags)
 }
 
 void
-hammer2_cluster_setsubmod(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
+hammer2_cluster_setflush(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
 {
        hammer2_chain_t *chain;
        int i;
@@ -131,7 +125,7 @@ hammer2_cluster_setsubmod(hammer2_trans_t *trans, hammer2_cluster_t *cluster)
        for (i = 0; i < cluster->nchains; ++i) {
                chain = cluster->array[i];
                if (chain)
-                       hammer2_chain_setsubmod(trans, chain);
+                       hammer2_chain_setflush(trans, chain);
        }
 }
 
@@ -167,7 +161,9 @@ hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *rcluster;
        hammer2_chain_t *chain;
+#if 0
        u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
+#endif
        int i;
 
        KKASSERT(pmp != NULL);
@@ -204,26 +200,19 @@ hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
        for (i = 0; i < rcluster->nchains; ++i) {
                chain = hammer2_chain_alloc(rcluster->array[i]->hmp,
                                            pmp, trans, bref);
+#if 0
                chain->hmp = rcluster->array[i]->hmp;
                chain->bref = *bref;
                chain->bytes = bytes;
                chain->refs = 1;
                chain->flags = HAMMER2_CHAIN_ALLOCATED;
-               chain->delete_xid = HAMMER2_XID_MAX;
+#endif
 
                /*
-                * Set modify_tid if a transaction is creating the inode.
-                * Enforce update_xlo = 0 so nearby transactions do not think
-                * it has been flushed when it hasn't.
-                *
                 * NOTE: When loading a chain from backing store or creating a
                 *       snapshot, trans will be NULL and the caller is
                 *       responsible for setting these fields.
                 */
-               if (trans) {
-                       chain->modify_xid = trans->sync_xid;
-                       chain->update_xlo = 0;
-               }
                cluster->array[i] = chain;
        }
        cluster->nchains = i;
@@ -233,32 +222,6 @@ hammer2_cluster_alloc(hammer2_pfsmount_t *pmp,
        return (cluster);
 }
 
-/*
- * Associate an existing core with the chain or allocate a new core.
- *
- * The core is not locked.  No additional refs on the chain are made.
- * (trans) must not be NULL if (core) is not NULL.
- *
- * When chains are delete-duplicated during flushes we insert nchain on
- * the ownerq after ochain instead of at the end in order to give the
- * drop code visibility in the correct order, otherwise drops can be missed.
- */
-void
-hammer2_cluster_core_alloc(hammer2_trans_t *trans,
-                          hammer2_cluster_t *ncluster,
-                          hammer2_cluster_t *ocluster)
-{
-       int i;
-
-       for (i = 0; i < ocluster->nchains; ++i) {
-               if (ncluster->array[i]) {
-                       hammer2_chain_core_alloc(trans,
-                                                ncluster->array[i],
-                                                ocluster->array[i]);
-               }
-       }
-}
-
 /*
  * Add a reference to a cluster.
  *
@@ -445,9 +408,9 @@ hammer2_cluster_copy(hammer2_cluster_t *ocluster, int copy_flags)
        ncluster = kmalloc(sizeof(*ncluster), M_HAMMER2, M_WAITOK | M_ZERO);
        ncluster->pmp = pmp;
        ncluster->nchains = ocluster->nchains;
-       ncluster->focus = ocluster->focus;
        ncluster->refs = (copy_flags & HAMMER2_CLUSTER_COPY_NOREF) ? 0 : 1;
        if ((copy_flags & HAMMER2_CLUSTER_COPY_NOCHAINS) == 0) {
+               ncluster->focus = ocluster->focus;
                for (i = 0; i < ocluster->nchains; ++i) {
                        chain = ocluster->array[i];
                        ncluster->array[i] = chain;
@@ -486,24 +449,6 @@ hammer2_cluster_unlock(hammer2_cluster_t *cluster)
        }
 }
 
-/*
- * Refactor the locked chains of a cluster.
- */
-void
-hammer2_cluster_refactor(hammer2_cluster_t *cluster)
-{
-       int i;
-
-       cluster->focus = NULL;
-       for (i = 0; i < cluster->nchains; ++i) {
-               if (cluster->array[i]) {
-                       hammer2_chain_refactor(&cluster->array[i]);
-                       if (cluster->focus == NULL)
-                               cluster->focus = cluster->array[i];
-               }
-       }
-}
-
 /*
  * Resize the cluster's physical storage allocation in-place.  This may
  * replace the cluster's chains.
@@ -524,7 +469,7 @@ hammer2_cluster_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                        KKASSERT(cparent->array[i]);
                        hammer2_chain_resize(trans, ip,
                                             cparent->array[i],
-                                            &cluster->array[i],
+                                            cluster->array[i],
                                             nradix, flags);
                        if (cluster->focus == NULL)
                                cluster->focus = cluster->array[i];
@@ -566,7 +511,7 @@ hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
        cluster->focus = NULL;
        for (i = 0; i < cluster->nchains; ++i) {
                if (cluster->array[i]) {
-                       hammer2_chain_modify(trans, &cluster->array[i], flags);
+                       hammer2_chain_modify(trans, cluster->array[i], flags);
                        if (cluster->focus == NULL)
                                cluster->focus = cluster->array[i];
                }
@@ -578,10 +523,10 @@ hammer2_cluster_modify(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
  *
  * Nominal front-end operations only edit non-block-table data in a single
  * chain.  This code copies such modifications to the other chains in the
- * cluster.
+ * cluster.  Blocktable modifications are handled on a chain-by-chain basis
+ * by both the frontend and the backend and will explode in fireworks if
+ * blindly copied.
  */
-/* hammer2_cluster_modsync() */
-
 void
 hammer2_cluster_modsync(hammer2_cluster_t *cluster)
 {
@@ -911,17 +856,16 @@ hammer2_cluster_create(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
 }
 
 /*
- * Duplicate a cluster under a new parent.
+ * Rename a cluster to a new parent.
  *
- * WARNING! Unlike hammer2_chain_duplicate(), only the key and keybits fields
+ * WARNING! Unlike hammer2_chain_rename(), only the key and keybits fields
  *         are used from a passed-in non-NULL bref pointer.  All other fields
  *         are extracted from the original chain for each chain in the
  *         iteration.
  */
 void
-hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
-                         hammer2_cluster_t *cluster, hammer2_blockref_t *bref,
-                         int snapshot, int duplicate_reason)
+hammer2_cluster_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
+                      hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
 {
        hammer2_chain_t *chain;
        hammer2_blockref_t xbref;
@@ -937,17 +881,13 @@ hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
                                xbref = chain->bref;
                                xbref.key = bref->key;
                                xbref.keybits = bref->keybits;
-                               hammer2_chain_duplicate(trans,
-                                                       &cparent->array[i],
-                                                       &chain, &xbref,
-                                                       snapshot,
-                                                       duplicate_reason);
+                               hammer2_chain_rename(trans, &xbref,
+                                                    &cparent->array[i],
+                                                    chain);
                        } else {
-                               hammer2_chain_duplicate(trans,
-                                                       &cparent->array[i],
-                                                       &chain, NULL,
-                                                       snapshot,
-                                                       duplicate_reason);
+                               hammer2_chain_rename(trans, NULL,
+                                                    &cparent->array[i],
+                                                    chain);
                        }
                        cluster->array[i] = chain;
                        if (cluster->focus == NULL)
@@ -962,41 +902,29 @@ hammer2_cluster_duplicate(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
 }
 
 /*
- * Delete-duplicate a cluster in-place.
+ * Mark each chain of a cluster deleted, pairing it with its parent chain.
  */
 void
-hammer2_cluster_delete_duplicate(hammer2_trans_t *trans,
-                                hammer2_cluster_t *cluster, int flags)
+hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cparent,
+                      hammer2_cluster_t *cluster, int flags)
 {
        hammer2_chain_t *chain;
+       hammer2_chain_t *parent;
        int i;
 
-       cluster->focus = NULL;
-       for (i = 0; i < cluster->nchains; ++i) {
-               chain = cluster->array[i];
-               if (chain) {
-                       hammer2_chain_delete_duplicate(trans, &chain, flags);
-                       cluster->array[i] = chain;
-                       if (cluster->focus == NULL)
-                               cluster->focus = chain;
-               }
+       if (cparent == NULL) {  /* no parent cluster: log, delete nothing */
+               kprintf("cparent is NULL\n");
+               return;
        }
-}
-
-/*
- * Mark a cluster deleted
- */
-void
-hammer2_cluster_delete(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
-                      int flags)
-{
-       hammer2_chain_t *chain;
-       int i;
 
        for (i = 0; i < cluster->nchains; ++i) {
+               parent = (i < cparent->nchains) ? cparent->array[i] : NULL;
                chain = cluster->array[i];
-               if (chain)
-                       hammer2_chain_delete(trans, chain, flags);
+               if (chain && parent == NULL) {  /* log only, chain is kept */
+                       kprintf("hammer2_cluster_delete: parent NULL\n");
+               } else if (chain) {             /* chain and parent present */
+                       hammer2_chain_delete(trans, parent, chain, flags);
+               }
        }
 }
 
@@ -1071,6 +999,41 @@ hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster,
        return (error);
 }
 
+/*
+ * Return locked parent cluster given a locked child.  The child remains
+ * locked on return (each chain is briefly unlocked and relocked inside).
+ */
+hammer2_cluster_t *
+hammer2_cluster_parent(hammer2_cluster_t *cluster)
+{
+       hammer2_cluster_t *cparent;
+       int i;
+
+       cparent = hammer2_cluster_copy(cluster, HAMMER2_CLUSTER_COPY_NOCHAINS);
+       for (i = 0; i < cluster->nchains; ++i) {
+               hammer2_chain_t *chain;
+               hammer2_chain_t *rchain;
+
+               chain = cluster->array[i];
+               if (chain == NULL)
+                       continue;       /* slot i of cparent not assigned */
+               hammer2_chain_ref(chain);       /* held until loop is done */
+               while ((rchain = chain->parent) != NULL) {
+                       hammer2_chain_ref(rchain);
+                       hammer2_chain_unlock(chain);    /* parent locks first */
+                       hammer2_chain_lock(rchain, HAMMER2_RESOLVE_ALWAYS);
+                       hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
+                       hammer2_chain_drop(rchain);     /* drop temporary ref */
+                       if (chain->parent == rchain)
+                               break;  /* still the parent: keep it locked */
+                       hammer2_chain_unlock(rchain);   /* changed, retry */
+               }
+               hammer2_chain_drop(chain);
+               cparent->array[i] = rchain;     /* NULL if no parent */
+       }
+       return cparent;
+}
+
 /************************************************************************
  *                         NODE FAILURES                               *
  ************************************************************************
index 159b94c..f5db70e 100644 (file)
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
-
 /*
  *                     TRANSACTION AND FLUSH HANDLING
  *
  * Deceptively simple but actually fairly difficult to implement properly is
  * how I would describe it.
  *
- * The biggest problem is that each PFS may belong to a cluster so its
- * media modify_tid and mirror_tid fields are in a completely different
- * domain than the topology related to the super-root.  Most of the code
- * operates using modify_xid and delete_xid which are local identifiers.
+ * The biggest issue is that each PFS may belong to a cluster so its media
+ * modify_tid and mirror_tid fields are in a completely different domain
+ * than the topology related to the super-root.
  *
- * The second biggest problem is that we really want to allow flushes to run
- * concurrently with new front-end operations, which means that the in-memory
- * topology of hammer2_chain structures can represent both current state and
- * snapshot-for-flush state.
+ * Flushing generally occurs bottom-up but requires a top-down scan to
+ * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
+ * tells how to recurse downward to find these chains.
  */
 
 #include <sys/cdefs.h>
@@ -74,40 +71,21 @@ struct hammer2_flush_info {
        int             depth;
        int             diddeferral;
        int             cache_index;
-       int             domodify;
-       struct h2_flush_deferral_list flush_list;
+       struct h2_flush_list flushq;
        hammer2_xid_t   sync_xid;       /* memory synchronization point */
 };
 
 typedef struct hammer2_flush_info hammer2_flush_info_t;
 
 static void hammer2_flush_core(hammer2_flush_info_t *info,
-                               hammer2_chain_t **chainp, int deleting);
-static int hammer2_flush_pass1(hammer2_chain_t *child, void *data);
-static int hammer2_flush_pass2(hammer2_chain_t *child, void *data);
-static int hammer2_flush_pass3(hammer2_chain_t *child, void *data);
-static int hammer2_flush_pass4(hammer2_chain_t *child, void *data);
-static int hammer2_flush_pass5(hammer2_chain_t *child, void *data);
+                               hammer2_chain_t *chain, int deleting);
+static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
+#if 0
 static void hammer2_rollup_stats(hammer2_chain_t *parent,
                                hammer2_chain_t *child, int how);
+#endif
 
 
-/*
- * Can we ignore a chain for the purposes of flushing modifications
- * to the media?
- *
- * This code is now degenerate.  We used to have to distinguish between
- * deleted chains and deleted chains associated with inodes that were
- * still open.  This mechanic has been fixed so the function is now
- * a simple test.
- */
-static __inline
-int
-h2ignore_deleted(hammer2_flush_info_t *info, hammer2_chain_t *chain)
-{
-       return (chain->delete_xid <= info->sync_xid);
-}
-
 #if 0
 static __inline
 void
@@ -226,7 +204,7 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
                /*
                 * No flushes are pending, we can go.  Use prior flush_xid + 1.
                 *
-                * WARNING!  Also see hammer2_chain_setsubmod()
+                * WARNING!  Also see hammer2_chain_setflush()
                 */
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                trans->sync_xid = tman->flush_xid + 1;
@@ -258,7 +236,7 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
                 * progress.  We insert after the current flush and may
                 * block.
                 *
-                * WARNING!  Also see hammer2_chain_setsubmod()
+                * WARNING!  Also see hammer2_chain_setflush()
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
@@ -386,12 +364,10 @@ hammer2_trans_done(hammer2_trans_t *trans)
 /*
  * Flush the chain and all modified sub-chains through the specified
  * synchronization point, propagating parent chain modifications and
- * mirror_tid updates back up as needed.  Since we are recursing downward
- * we do not have to deal with the complexities of multi-homed chains (chains
- * with multiple parents).
+ * mirror_tid updates back up as needed.
  *
  * Caller must have interlocked against any non-flush-related modifying
- * operations in progress whos modify_xid values are less than or equal
+ * operations in progress whose XXX values are less than or equal
  * to the passed sync_xid.
  *
  * Caller must have already vetted synchronization points to ensure they
@@ -401,17 +377,15 @@ hammer2_trans_done(hammer2_trans_t *trans)
  * This routine can be called from several places but the most important
  * is from VFS_SYNC.
  *
- * chain is locked on call and will remain locked on return.  If a flush
- * occured, the chain's FLUSH_CREATE and/or FLUSH_DELETE bit will be set
- * indicating that its parent (which is not part of the flush) should be
- * updated.  The chain may be replaced by the call if it was modified.
+ * chain is locked on call and will remain locked on return.  The chain's
+ * UPDATE flag indicates that its parent's block table (which is not yet
+ * part of the flush) should be updated.  The chain may be replaced by
+ * the call if it was modified.
  */
 void
-hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
+hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
 {
-       hammer2_chain_t *chain = *chainp;
        hammer2_chain_t *scan;
-       hammer2_chain_core_t *core;
        hammer2_flush_info_t info;
        int loops;
 
@@ -424,12 +398,18 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
-       TAILQ_INIT(&info.flush_list);
+       TAILQ_INIT(&info.flushq);
        info.trans = trans;
        info.sync_xid = trans->sync_xid;
        info.cache_index = -1;
 
-       core = chain->core;
+       /*
+        * Calculate parent (can be NULL), if not NULL the flush core
+        * expects the parent to be referenced so it can easily lock/unlock
+        * it without it getting ripped up.
+        */
+       if ((info.parent = chain->parent) != NULL)
+               hammer2_chain_ref(info.parent);
 
        /*
         * Extra ref needed because flush_core expects it when replacing
@@ -444,9 +424,9 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
                 * can leave the FLUSH_* bits set for these chains, which
                 * will be handled when we [re]flush chain after the unwind.
                 */
-               while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
+               while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
-                       TAILQ_REMOVE(&info.flush_list, scan, flush_node);
+                       TAILQ_REMOVE(&info.flushq, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
 
                        /*
@@ -459,7 +439,7 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_chain_drop(scan);       /* ref from deferral */
-                       hammer2_flush(trans, &scan);
+                       hammer2_flush(trans, scan);
                        hammer2_chain_unlock(scan);
                }
 
@@ -467,12 +447,12 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
                 * [re]flush chain.
                 */
                info.diddeferral = 0;
-               hammer2_flush_core(&info, &chain, 0);
+               hammer2_flush_core(&info, chain, 0);
 
                /*
                 * Only loop if deep recursions have been deferred.
                 */
-               if (TAILQ_EMPTY(&info.flush_list))
+               if (TAILQ_EMPTY(&info.flushq))
                        break;
 
                if (++loops % 1000 == 0) {
@@ -483,15 +463,15 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
                }
        }
        hammer2_chain_drop(chain);
-       *chainp = chain;
+       if (info.parent)
+               hammer2_chain_drop(info.parent);
 }
 
 /*
  * This is the core of the chain flushing code.  The chain is locked by the
  * caller and must also have an extra ref on it by the caller, and remains
  * locked and will have an extra ref on return.  Upon return, the caller can
- * test the FLUSH_CREATE and FLUSH_DELETE bits to determine what action must
- * be taken on the parent.
+ * test the UPDATE bit on the child to determine if the parent needs updating.
  *
  * (1) Determine if this node is a candidate for the flush, return if it is
  *     not.  fchain and vchain are always candidates for the flush.
@@ -501,27 +481,20 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
  *     run.
  *
  * (3) Recursively flush live children (rbtree).  This can create deferrals.
- *     A successful flush clears the MODIFIED bit in the children.
- *
- * (4) Recursively flush deleted children (dbtree).  Deletions may be
- *     considered 'live' if the delete_tid is beyond the flush_tid.  If
- *     considered 'dead' the recursion is still needed in order to clean
- *     up the chain.  This can create deferrals.
- *
- *     A successful flush clears the MODIFIED bit in the children.
+ *     A successful flush clears the MODIFIED and UPDATE bits on the children
+ *     and typically causes the parent to be marked MODIFIED as the children
+ *     update the parent's block table.  A parent might already be marked
+ *     MODIFIED due to a deletion (whose blocktable update in the parent is
+ *     handled by the frontend), or if the parent itself is modified by the
+ *     frontend for other reasons.
  *
- * (5) Calculate block table updates on chain based on the children scans
- *     in (3) and (4) by testing the FLUSH_CREATE and FLUSH_DELETE bits,
- *     modifying chain if necessary to perform the block table updates.
- *     Deletions must be removed from dbtree when removed from the
- *     chain's block table.
+ * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
+ *     Deleted-but-open inodes can still be individually flushed via the
+ *     filesystem syncer.
  *
- *     If 'chain' itself is marked DELETED but treated as live, the block
- *     table update(s) must be propagated to all contemporary chains.  In
- *     fact, all contemporary chains must be locked and updated uninterrupted
- *     to avoid lookup races.  Once MODIFIED and FLUSH_CREATE is cleared,
- *     a chain can be unloaded from memory with the expectation that it can
- *     be reloaded later via the block table at any time.
+ * (5) Note that an unmodified child may still need the block table in its
+ *     parent updated (e.g. rename/move).  The child will have UPDATE set
+ *     in this case.
  *
  *                     WARNING ON BREF MODIFY_TID/MIRROR_TID
  *
@@ -530,36 +503,25 @@ hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
  * Instead we access it from the pmp.
  */
 static void
-hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
+hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                   int deleting)
 {
-       hammer2_chain_t *chain = *chainp;
-       hammer2_chain_t *saved_parent;
+       hammer2_chain_t *parent;
        hammer2_mount_t *hmp;
        hammer2_pfsmount_t *pmp;
-       hammer2_chain_core_t *core;
        int diddeferral;
-       int saved_domodify;
-
-       hmp = chain->hmp;
-       pmp = chain->pmp;
-       core = chain->core;
-       diddeferral = info->diddeferral;
 
        /*
-        * (1) Check if we even have any work to do.
-        *
-        * This bit of code is capable of short-cutting entire sub-trees
-        * if they have not been touched or if they have already been
-        * flushed.
+        * (1) Optimize downward recursion to locate nodes needing action.
+        *     Nothing to do if none of these flags are set.
         */
-       if (/*(chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&*/
-           (chain->update_xlo >= info->sync_xid ||     /* already synced */
-            chain->update_xlo >= chain->update_xhi)) { /* old/unchanged */
-               /* update_xlo/_xhi already filters chain out, do not update */
-               /* don't update bref.mirror_tid, pass2 is not called */
+       if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0)
                return;
-       }
+
+       hmp = chain->hmp;
+       pmp = chain->pmp;
+       diddeferral = info->diddeferral;
+       parent = info->parent;          /* can be NULL */
 
        /*
         * mirror_tid should not be forward-indexed
@@ -567,488 +529,342 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
        KKASSERT(chain->bref.mirror_tid <= pmp->flush_tid);
 
        /*
-        * Ignore chains modified beyond the current flush point.  These
-        * will be treated as if they did not exist.  Subchains with lower
-        * modify_xid's will still be accessible via other parents.
-        *
-        * Do not update bref.mirror_tid here, it will interfere with
-        * synchronization.  e.g. inode flush tid 1, concurrent D-D tid 2,
-        * then later on inode flush tid 2.  If we were to set mirror_tid
-        * to 1 during inode flush tid 1 the blockrefs would only be partially
-        * updated (and likely panic).
-        *
-        * We must update chain->update_xlo here to prevent re-entry in this
-        * flush transaction.
-        *
-        * (vchain and fchain are exceptions since they cannot be duplicated)
+        * Downward search recursion
         */
-       if (chain->modify_xid > info->sync_xid &&
-           chain != &hmp->fchain && chain != &hmp->vchain) {
-               /* do not update bref.mirror_tid, pass2 ignores chain */
-               /* chain->update_xlo = info->sync_xid; */
-               return;
-       }
-
-       /*
-        * (2) Recurse downward and check recursion depth.
-        * (3) Flush live children
-        * (4) Flush deleted children
-        *
-        * We adjust update_xlo if not deferring chain to prevent re-entry
-        * in this flush cycle, but it must be set AFTER the flush in case
-        * a deeper flush hits the chain.  Otherwise the deeper flush cannot
-        * complete.  We re-check the condition after finishing the flushes.
-        *
-        * update_xhi was already checked and prevents initial recursions on
-        * subtrees which have not been modified.
-        */
-       saved_parent = info->parent;
-       saved_domodify = info->domodify;
-       info->parent = chain;
-       info->domodify = 0;
-
        if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
+               /*
+                * Already deferred.
+                */
                ++info->diddeferral;
        } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
-               if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
-                       hammer2_chain_ref(chain);
-                       TAILQ_INSERT_TAIL(&info->flush_list,
-                                         chain, flush_node);
-                       atomic_set_int(&chain->flags,
-                                      HAMMER2_CHAIN_DEFERRED);
-               }
+               /*
+                * Recursion depth reached.
+                */
+               hammer2_chain_ref(chain);
+               TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
                ++info->diddeferral;
-       } else {
-               hammer2_chain_t *scan;
-
+       } else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
                /*
-                * The flush is queue-agnostic when running pass1, but order
-                * is important to catch any races where an existing
-                * flush-visible child is moved from rbtree->dbtree/dbq.
-                *
-                * New children added by concurrent operations are not visible
-                * to the flush anyway so we don't care about those races.
-                * However, the flush itself can move a child from dbq to
-                * dbtree (rare in pass1 but it is possible).
-                *
-                * pass1 can handle re-execution of a child.
+                * Downward recursion search (actual flush occurs bottom-up).
+                * pre-clear ONFLUSH.  It can get set again due to races,
+                * which we want so the scan finds us again in the next flush.
                 */
-               spin_lock(&core->cst.spin);
-               KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
-               RB_SCAN(hammer2_chain_tree, &core->rbtree,
-                       NULL, hammer2_flush_pass1, info);
-               RB_SCAN(hammer2_chain_tree, &core->dbtree,
-                       NULL, hammer2_flush_pass1, info);
-               scan = TAILQ_FIRST(&core->dbq);
-               while (scan) {
-                       KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
-                       hammer2_flush_pass1(scan, info);
-                       if (scan->flags & HAMMER2_CHAIN_ONDBQ)
-                               scan = TAILQ_NEXT(scan, db_entry);
-                       else
-                               scan = TAILQ_FIRST(&core->dbq);
-               }
-               spin_unlock(&core->cst.spin);
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
+               info->parent = chain;
+               spin_lock(&chain->core.cst.spin);
+               RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
+                       NULL, hammer2_flush_recurse, info);
+               spin_unlock(&chain->core.cst.spin);
+               info->parent = parent;
+               if (info->diddeferral)
+                       hammer2_chain_setflush(info->trans, chain);
        }
 
        /*
-        * Stop if deferred, do not update update_xlo.
+        * Now we are in the bottom-up part of the recursion.
+        *
+        * Do not update chain if lower layers were deferred.
         */
-       if (info->diddeferral) {
+       if (info->diddeferral)
                goto done;
-       }
 
        /*
-        * If a block table update is needed place the parent in a modified
-        * state, which might delete-duplicate it.
-        *
-        * - To prevent loops and other confusion, we synchronize update_xlo
-        *   for the original chain.
-        *
-        * - The original parent will not be used by the flush so we can
-        *   clear its MODIFIED bit.
+        * Propagate the DESTROY flag downwards.  This dummies up the flush
+        * code and tries to invalidate related buffer cache buffers to
+        * avoid the disk write.
+        */
+       if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
+
+       /*
+        * Chain was already modified or has become modified, flush it out.
         */
-       if (info->domodify) {
-               hammer2_chain_modify(info->trans, &info->parent, 0);
-               if (info->parent != chain) {
+again:
+       if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
+               /*
+                * Dispose of the modified bit.  UPDATE should already be
+                * set.
+                */
+               KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
+                        chain == &hmp->vchain);
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+               hammer2_pfs_memory_wakeup(pmp);
+               chain->bref.mirror_tid = pmp->flush_tid;
+
+               if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
+                   chain == &hmp->vchain ||
+                   chain == &hmp->fchain) {
                        /*
-                        * chain        - old
-                        * info->parent - new
-                        *
-                        * NOTE: bref.mirror_tid cannot be updated
-                        *       unless MODIFIED is cleared or already
-                        *       clear.
+                        * Drop the ref from the MODIFIED bit we cleared,
+                        * net -1 ref.
                         */
-                       chain->inode_reason += 0x10000000;
-                       info->parent->inode_reason += 0x100;
-                       KKASSERT(info->parent->core == chain->core);
-                       if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
-                               atomic_clear_int(&chain->flags,
-                                               HAMMER2_CHAIN_MODIFIED);
-                               hammer2_pfs_memory_wakeup(pmp);
-                               hammer2_chain_drop(chain);
-                       }
-#if 0
-                       if (chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) {
-                               atomic_clear_int(&chain->flags,
-                                               HAMMER2_CHAIN_FLUSH_CREATE);
-                               hammer2_chain_drop(chain);
-                       }
-                       if (info->parent->flags & HAMMER2_CHAIN_FLUSH_DELETE) {
-                               atomic_clear_int(&info->parent->flags,
-                                               HAMMER2_CHAIN_FLUSH_DELETE);
-                               hammer2_chain_drop(info->parent);
-                       }
-#endif
-                       if (chain->update_xlo < info->sync_xid)
-                               chain->update_xlo = info->sync_xid;
-                       KKASSERT(info->parent->update_xlo < info->sync_xid);
                        hammer2_chain_drop(chain);
-                       hammer2_chain_ref(info->parent);
+               } else {
+                       /*
+                        * Drop the ref from the MODIFIED bit we cleared and
+                        * set a ref for the UPDATE bit we are setting.  Net
+                        * 0 refs.
+                        */
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                }
-               chain = info->parent;
-       }
 
-       /*
-        * If a blocktable update is needed determine if this is the last
-        * parent requiring modification (check all parents using the core).
-        *
-        * Set bit 1 (0x02) of domodify if this is the last parent,
-        * which will cause scan2 to clear FLUSH_CREATE and FLUSH_DELETE.
-        */
-       if (1) {
-               hammer2_chain_t *scan;
+               /*
+                * Issue flush.
+                *
+                * A DELETED node that reaches this point must be flushed for
+                * synchronization point consistency.
+                *
+                * Update bref.mirror_tid, clear MODIFIED, and set UPDATE.
+                */
+               if (hammer2_debug & 0x1000) {
+                       kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
+                               "data=%016jx\n",
+                               chain, chain->bref.type,
+                               chain->bref.key, chain->bref.keybits,
+                               info->sync_xid,
+                               chain->bref.data_off);
+               }
+               if (hammer2_debug & 0x2000) {
+                       Debugger("Flush hell");
+               }
 
-               spin_lock(&core->cst.spin);
-               TAILQ_FOREACH(scan, &core->ownerq, core_entry) {
+               /*
+                * Update chain CRCs for flush.
+                *
+                * NOTE: Volume headers are NOT flushed here as they require
+                *       special processing.
+                */
+               switch(chain->bref.type) {
+               case HAMMER2_BREF_TYPE_FREEMAP:
+                       KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
+                       hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
+                       break;
+               case HAMMER2_BREF_TYPE_VOLUME:
                        /*
-                        * Ignore the current parent being processed (we do
-                        * not adjust update_xlo until after the fixup).
+                        * The free block table is flushed by hammer2_vfs_sync()
+                        * before it flushes vchain.  We must still hold fchain
+                        * locked while copying voldata to volsync, however.
                         */
-                       if (scan == chain)
-                               continue;
-
+                       hammer2_voldata_lock(hmp);
+                       hammer2_chain_lock(&hmp->fchain,
+                                          HAMMER2_RESOLVE_ALWAYS);
                        /*
-                        * Ignore chains which have already been updated
-                        * Ignore unmodified chains (lo >= hi).
+                        * There is no parent to our root vchain and fchain to
+                        * synchronize the bref to, their updated mirror_tid's
+                        * must be synchronized to the volume header.
                         */
-                       if ((scan->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
-                           (scan->update_xlo >= info->sync_xid ||
-                            scan->update_xlo >= scan->update_xhi)) {
-                               continue;
-                       }
+                       hmp->voldata.mirror_tid = chain->bref.mirror_tid;
+                       hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
+                       kprintf("mirror_tid %08jx\n",
+                               (intmax_t)chain->bref.mirror_tid);
 
                        /*
-                        * Cannot exhaust all parents if one is not visible
-                        * to the flush.  The root chains are special-cased
-                        * because they cannot really be delete-duplicated.
+                        * The volume header is flushed manually by the
+                        * syncer, not here.  All we do here is adjust the
+                        * crc's.
                         */
-                       if (scan != &scan->hmp->fchain &&
-                           scan != &scan->hmp->vchain &&
-                           scan->modify_xid > info->sync_xid) {
-                               break;
-                       }
-
+                       KKASSERT(chain->data != NULL);
+                       KKASSERT(chain->dio == NULL);
+
+                       hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
+                               hammer2_icrc32(
+                                       (char *)&hmp->voldata +
+                                        HAMMER2_VOLUME_ICRC1_OFF,
+                                       HAMMER2_VOLUME_ICRC1_SIZE);
+                       hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
+                               hammer2_icrc32(
+                                       (char *)&hmp->voldata +
+                                        HAMMER2_VOLUME_ICRC0_OFF,
+                                       HAMMER2_VOLUME_ICRC0_SIZE);
+                       hmp->voldata.icrc_volheader =
+                               hammer2_icrc32(
+                                       (char *)&hmp->voldata +
+                                        HAMMER2_VOLUME_ICRCVH_OFF,
+                                       HAMMER2_VOLUME_ICRCVH_SIZE);
+                       hmp->volsync = hmp->voldata;
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
+                       hammer2_chain_unlock(&hmp->fchain);
+                       hammer2_voldata_unlock(hmp);
+                       break;
+               case HAMMER2_BREF_TYPE_DATA:
                        /*
-                        * Fail if update_xlo has not been synchronized to
-                        * at least our sync_xid on any modified parent chain.
+                        * Data elements have already been flushed via the
+                        * logical file buffer cache.  Their hash was set in
+                        * the bref by the vop_write code.
+                        *
+                        * Make sure any device buffer(s) have been flushed
+                        * out here (there aren't usually any to flush) XXX.
                         */
-                       if (scan->update_xlo < info->sync_xid)
-                               break;
-               }
-               spin_unlock(&core->cst.spin);
-               if (scan == NULL)
-                       info->domodify |= 2;
-       }
-
-       /*
-        * (5) Calculate block table updates or child cleanups.
-        *     (this whole operation has to be atomic)
-        *
-        * domodify 0x01 - block table updates
-        *          0x02 - child cleanups
-        *
-        *      pass2 - Process deletions from dbtree and dbq.
-        *      pass3 - Process insertions from rbtree, dbtree, and dbq.
-        *      pass4 - Cleanup child flags on the last parent and
-        *              Adjust queues on the live parent (deletions).
-        *      pass5 - Cleanup child flags on the last parent and
-        *              Adjust queues on the live parent (insertions).
-        *
-        *      Queue adjustments had to be separated into deletions and
-        *      insertions because both can occur on dbtree.
-        */
-       if (info->domodify) {
-               hammer2_chain_t *scan;
-
-               spin_lock(&core->cst.spin);
-
-               while ((info->domodify & 1) && info->parent) {
-                       /* PASS2 - Deletions */
-                       RB_SCAN(hammer2_chain_tree, &core->rbtree,
-                               NULL, hammer2_flush_pass2, info);
-                       RB_SCAN(hammer2_chain_tree, &core->dbtree,
-                               NULL, hammer2_flush_pass2, info);
-                       scan = TAILQ_FIRST(&core->dbq);
-                       TAILQ_FOREACH(scan, &core->dbq, db_entry) {
-                               KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
-                               hammer2_flush_pass2(scan, info);
-                       }
-
-                       /* PASS3 - Insertions */
-                       RB_SCAN(hammer2_chain_tree, &core->rbtree,
-                               NULL, hammer2_flush_pass3, info);
-                       RB_SCAN(hammer2_chain_tree, &core->dbtree,
-                               NULL, hammer2_flush_pass3, info);
-                       TAILQ_FOREACH(scan, &core->dbq, db_entry) {
-                               KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
-                               hammer2_flush_pass3(scan, info);
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+               case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+               case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
+                       /*
+                        * Buffer I/O will be cleaned up when the volume is
+                        * flushed (but the kernel is free to flush it before
+                        * then, as well).
+                        */
+                       KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
+                       break;
+               case HAMMER2_BREF_TYPE_INODE:
+                       if (chain->data->ipdata.op_flags &
+                           HAMMER2_OPFLAG_PFSROOT) {
+                               /*
+                                * non-NULL pmp if mounted as a PFS.  We must
+                                * sync fields cached in the pmp.
+                                */
+                               hammer2_inode_data_t *ipdata;
+
+                               ipdata = &chain->data->ipdata;
+                               ipdata->pfs_inum = pmp->inode_tid;
+                       } else {
+                               /* can't be mounted as a PFS */
+                               KKASSERT((chain->flags &
+                                         HAMMER2_CHAIN_PFSROOT) == 0);
                        }
-                       info->parent = TAILQ_NEXT(info->parent, core_entry);
-                       if (info->parent)
-                               kprintf("FLUSH SPECIAL UPDATE (%p) %p.%d %08x\n",
-                                       chain, info->parent,
-                                       info->parent->bref.type,
-                                       info->parent->flags);
+                       KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
+                       break;
+               default:
+                       KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
+                       panic("hammer2_flush_core: unsupported "
+                             "embedded bref %d",
+                             chain->bref.type);
+                       /* NOT REACHED */
                }
-               info->parent = chain;
 
-               /* PASS4 - Cleanup */
-               RB_SCAN(hammer2_chain_tree, &core->rbtree,
-                       NULL, hammer2_flush_pass4, info);
-               scan = TAILQ_FIRST(&core->dbq);
-               while (scan) {
-                       KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
-                       hammer2_flush_pass4(scan, info);
-                       if (scan->flags & HAMMER2_CHAIN_ONDBQ)
-                               scan = TAILQ_NEXT(scan, db_entry);
-                       else
-                               scan = TAILQ_FIRST(&core->dbq);
-               }
-               RB_SCAN(hammer2_chain_tree, &core->dbtree,
-                       NULL, hammer2_flush_pass4, info);
-
-               /* PASS5 - Cleanup */
-               RB_SCAN(hammer2_chain_tree, &core->rbtree,
-                       NULL, hammer2_flush_pass5, info);
-               scan = TAILQ_FIRST(&core->dbq);
-               while (scan) {
-                       KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
-                       hammer2_flush_pass5(scan, info);
-                       if (scan->flags & HAMMER2_CHAIN_ONDBQ)
-                               scan = TAILQ_NEXT(scan, db_entry);
-                       else
-                               scan = TAILQ_FIRST(&core->dbq);
+               /*
+                * If the chain was destroyed try to avoid unnecessary I/O.
+                * (this only really works if the DIO system buffer is the
+                * same size as chain->bytes).
+                */
+               if (chain->flags & HAMMER2_CHAIN_DESTROY) {
+                       hammer2_io_setinval(chain->dio, chain->bytes);
                }
-               RB_SCAN(hammer2_chain_tree, &core->dbtree,
-                       NULL, hammer2_flush_pass5, info);
-
-               spin_unlock(&core->cst.spin);
        }
 
        /*
-        * Synchronize update_xlo to prevent reentrant block updates of this
-        * parent.
-        */
-       chain->update_xlo = info->sync_xid;
-
-       /*
-        * Skip the flush if the chain was not placed in a modified state
-        * or was not already in a modified state.
-        */
-       if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
-               goto done;
-
-       /*
-        * FLUSH THE CHAIN (on the way back up the recursion)
+        * If UPDATE is set the parent block table may need to be updated.
         *
-        * Chain is now deterministically being flushed and not being deferred.
-        * We've finished running the recursion and the blockref update.
+        * NOTE: UPDATE may be set on vchain or fchain in which case
+        *       parent could be NULL.  It's easiest to allow the case
+        *       and test for NULL.  parent can also wind up being NULL
+        *       due to a deletion so we need to handle the case anyway.
         *
-        * update bref.mirror_tid.  update_xlo has already been updated.
+        * If no parent exists we can just clear the UPDATE bit.  If the
+        * chain gets reattached later on the bit will simply get set
+        * again.
         */
-       chain->bref.mirror_tid = pmp->flush_tid;
-
-       /*
-        * Dispose of the modified bit.  FLUSH_CREATE should already be
-        * set.
-        */
-       KKASSERT((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) ||
-                chain == &hmp->vchain);
-       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
-       hammer2_pfs_memory_wakeup(pmp);
-
-       if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) ||
-           chain == &hmp->vchain ||
-           chain == &hmp->fchain) {
-               /*
-                * Drop the ref from the MODIFIED bit we cleared,
-                * net -1 ref.
-                */
+       if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                hammer2_chain_drop(chain);
-       } else {
-               /*
-                * Drop the ref from the MODIFIED bit we cleared and
-                * set a ref for the FLUSH_CREATE bit we are setting.
-                * Net 0 refs.
-                */
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
        }
 
        /*
-        * Skip the actual flush operation if the chain has been deleted
-        * in our flus hview.  There will be no block table entry that
-        * references it.
+        * The chain may need its blockrefs updated in the parent.  This
+        * requires some fancy footwork.
         */
-       if (h2ignore_deleted(info, chain))
-               goto done;
+       if (chain->flags & HAMMER2_CHAIN_UPDATE) {
+               hammer2_blockref_t *base;
+               int count;
 
-       /*
-        * Issue flush.
-        *
-        * A DELETED node that reaches this point must be flushed for
-        * synchronization point consistency.
-        *
-        * Update bref.mirror_tid, clear MODIFIED, and set MOVED.
-        *
-        * The caller will update the parent's reference to this chain
-        * by testing MOVED as long as the modification was in-bounds.
-        *
-        * MOVED is never set on the volume root as there is no parent
-        * to adjust.
-        */
-       if (hammer2_debug & 0x1000) {
-               kprintf("Flush %p.%d %016jx/%d sync_xid=%08x data=%016jx\n",
-                       chain, chain->bref.type,
-                       chain->bref.key, chain->bref.keybits,
-                       info->sync_xid, chain->bref.data_off);
-       }
-       if (hammer2_debug & 0x2000) {
-               Debugger("Flush hell");
-       }
-
-       /*
-        * If this is part of a recursive flush we can go ahead and write
-        * out the buffer cache buffer and pass a new bref back up the chain
-        * via the MOVED bit.
-        *
-        * Volume headers are NOT flushed here as they require special
-        * processing.
-        */
-       switch(chain->bref.type) {
-       case HAMMER2_BREF_TYPE_FREEMAP:
-               KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
-               hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
-               break;
-       case HAMMER2_BREF_TYPE_VOLUME:
                /*
-                * The free block table is flushed by hammer2_vfs_sync()
-                * before it flushes vchain.  We must still hold fchain
-                * locked while copying voldata to volsync, however.
+                * Both parent and chain must be locked.  This requires
+                * temporarily unlocking the chain.  We have to deal with
+                * the case where the chain might be reparented or modified
+                * while it was unlocked.
                 */
-               hammer2_voldata_lock(hmp);
-               hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
-#if 0
-               if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->voldata.freemap_tid < info->trans->sync_tid) {
-                       /* this will modify vchain as a side effect */
-                       hammer2_chain_t *tmp = &hmp->fchain;
-                       hammer2_chain_flush(info->trans, &tmp);
-                       KKASSERT(tmp == &hmp->fchain);
+               hammer2_chain_unlock(chain);
+               hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
+               hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
+               if (chain->parent != parent) {
+                       kprintf("PARENT MISMATCH ch=%p p=%p/%p\n", chain, chain->parent, parent);
+                       hammer2_chain_unlock(parent);
+                       goto done;
                }
-#endif
 
                /*
-                * There is no parent to our root vchain and fchain to
-                * synchronize the bref to, their updated mirror_tid's
-                * must be synchronized to the volume header.
+                * Check race condition.  If someone got in and modified
+                * it again while it was unlocked, we have to loop up.
                 */
-               hmp->voldata.mirror_tid = chain->bref.mirror_tid;
-               hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
-               kprintf("mirror_tid %08jx\n", (intmax_t)chain->bref.mirror_tid);
+               if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
+                       hammer2_chain_unlock(parent);
+                       kprintf("hammer2_flush: chain %p flush-mod race\n",
+                               chain);
+                       goto again;
+               }
 
                /*
-                * The volume header is flushed manually by the syncer, not
-                * here.  All we do here is adjust the crc's.
-                */
-               KKASSERT(chain->data != NULL);
-               KKASSERT(chain->dio == NULL);
-
-               hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
-                       hammer2_icrc32(
-                               (char *)&hmp->voldata +
-                                HAMMER2_VOLUME_ICRC1_OFF,
-                               HAMMER2_VOLUME_ICRC1_SIZE);
-               hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
-                       hammer2_icrc32(
-                               (char *)&hmp->voldata +
-                                HAMMER2_VOLUME_ICRC0_OFF,
-                               HAMMER2_VOLUME_ICRC0_SIZE);
-               hmp->voldata.icrc_volheader =
-                       hammer2_icrc32(
-                               (char *)&hmp->voldata +
-                                HAMMER2_VOLUME_ICRCVH_OFF,
-                               HAMMER2_VOLUME_ICRCVH_SIZE);
-               hmp->volsync = hmp->voldata;
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
-               hammer2_chain_unlock(&hmp->fchain);
-               hammer2_voldata_unlock(hmp);
-               break;
-       case HAMMER2_BREF_TYPE_DATA:
-               /*
-                * Data elements have already been flushed via the logical
-                * file buffer cache.  Their hash was set in the bref by
-                * the vop_write code.
-                *
-                * Make sure any device buffer(s) have been flushed out here.
-                * (there aren't usually any to flush).
+                * Clear UPDATE flag
                 */
-               break;
-#if 0
-       case HAMMER2_BREF_TYPE_INDIRECT:
+               if (chain->flags & HAMMER2_CHAIN_UPDATE) {
+                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
+                       hammer2_chain_drop(chain);
+               }
+               hammer2_chain_modify(info->trans, parent, 0);
+
                /*
-                * Indirect blocks may be in an INITIAL state.  Use the
-                * chain_lock() call to ensure that the buffer has been
-                * instantiated (even though it is already locked the buffer
-                * might not have been instantiated).
-                *
-                * Only write the buffer out if it is dirty, it is possible
-                * the operating system had already written out the buffer.
+                * Calculate blockmap pointer
                 */
-               hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
-               KKASSERT(chain->dio != NULL);
-
-               chain->data = NULL;
-               hammer2_io_bqrelse(&chain->dio);
-               hammer2_chain_unlock(chain);
-               break;
-#endif
-       case HAMMER2_BREF_TYPE_INDIRECT:
-       case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-       case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
-               KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
-               break;
-       case HAMMER2_BREF_TYPE_INODE:
-               if (chain->data->ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) {
+               switch(parent->bref.type) {
+               case HAMMER2_BREF_TYPE_INODE:
                        /*
-                        * non-NULL pmp if mounted as a PFS.  We must sync
-                        * fields cached in the pmp.
+                        * Access the inode's block array.  However, there is
+                        * no block array if the inode is flagged DIRECTDATA.
                         */
-                       hammer2_inode_data_t *ipdata;
+                       if (parent->data &&
+                           (parent->data->ipdata.op_flags &
+                            HAMMER2_OPFLAG_DIRECTDATA) == 0) {
+                               base = &parent->data->
+                                       ipdata.u.blockset.blockref[0];
+                       } else {
+                               base = NULL;
+                       }
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+               case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+                       if (parent->data)
+                               base = &parent->data->npdata[0];
+                       else
+                               base = NULL;
+                       count = parent->bytes / sizeof(hammer2_blockref_t);
+                       break;
+               case HAMMER2_BREF_TYPE_VOLUME:
+                       base = &chain->hmp->voldata.sroot_blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_FREEMAP:
+                       base = &parent->data->npdata[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               default:
+                       base = NULL;
+                       count = 0;
+                       panic("hammer2_flush_core: "
+                             "unrecognized blockref type: %d",
+                             parent->bref.type);
+               }
 
-                       ipdata = &chain->data->ipdata;
-                       ipdata->pfs_inum = pmp->inode_tid;
-               } else {
-                       /* can't be mounted as a PFS */
-                       KKASSERT((chain->flags & HAMMER2_CHAIN_PFSROOT) == 0);
+               /*
+                * Blocktable updates
+                */
+               if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
+                       if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
+                               hammer2_base_delete(info->trans, parent,
+                                                   base, count,
+                                                   &info->cache_index, chain);
+                               hammer2_base_insert(info->trans, parent,
+                                                   base, count,
+                                                   &info->cache_index, chain);
+                       }
+               }
+               if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
+                       hammer2_base_insert(info->trans, parent,
+                                           base, count,
+                                           &info->cache_index, chain);
                }
-               KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
-               break;
-       default:
-               KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
-               panic("hammer2_flush_core: unsupported embedded bref %d",
-                     chain->bref.type);
-               /* NOT REACHED */
+               hammer2_chain_unlock(parent);
        }
 
        /*
@@ -1056,15 +872,11 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
         */
 done:
        KKASSERT(chain->refs > 1);
-       info->domodify = saved_domodify;
-       info->parent = saved_parent;
-       *chainp = chain;
-
        KKASSERT(chain->bref.mirror_tid <= chain->pmp->flush_tid);
 }
 
 /*
- * Flush helper pass1 (recursive)
+ * Flush recursion helper, called from flush_core, calls flush_core.
  *
  * Flushes the children of the caller's chain (info->parent), restricted
  * by sync_tid.  Set info->domodify if the child's blockref must propagate
@@ -1072,7 +884,7 @@ done:
  *
  * Ripouts can move child from rbtree to dbtree or dbq but the caller's
  * flush scan order prevents any chains from being lost.  A child can be
- * executes more than once (update_xlo is used to prevent infinite recursions).
+ * executed more than once.
  *
  * WARNING! If we do not call hammer2_flush_core() we must update
  *         bref.mirror_tid ourselves to indicate that the flush has
@@ -1084,38 +896,23 @@ done:
  *         not cross a pfs-root boundary.
  */
 static int
-hammer2_flush_pass1(hammer2_chain_t *child, void *data)
+hammer2_flush_recurse(hammer2_chain_t *child, void *data)
 {
        hammer2_flush_info_t *info = data;
-       hammer2_trans_t *trans = info->trans;
+       /*hammer2_trans_t *trans = info->trans;*/
        hammer2_chain_t *parent = info->parent;
 
        /*
-        * Child modified in a later transactions, nothing to flush in this
-        * transaction.
-        *
-        * Remember that modifications generally delete-duplicate so if the
-        * sub-tree is dirty another child will get us there.  But not this
-        * one.
-        *
         * (child can never be fchain or vchain so a special check isn't
         *  needed).
-        */
-       if (child->modify_xid > trans->sync_xid) {
-               KKASSERT(child->delete_xid >= child->modify_xid);
-               /*child->update_xlo = info->sync_xid;*/
-               /* do not update mirror_tid, pass2 will ignore chain */
-               return (0);
-       }
-
-       /*
+        *
         * We must ref the child before unlocking the spinlock.
         *
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.
         */
        hammer2_chain_ref(child);
-       spin_unlock(&parent->core->cst.spin);
+       spin_unlock(&parent->core.cst.spin);
 
        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
@@ -1123,54 +920,17 @@ hammer2_flush_pass1(hammer2_chain_t *child, void *data)
        /*
         * Never recurse across a mounted PFS boundary.
         *
-        * Recurse and collect deferral data.  We only recursively sync
-        * (basically) if update_xlo has not been updated, indicating that
-        * the child has not already been processed.
+        * Recurse and collect deferral data.
         */
        if ((child->flags & HAMMER2_CHAIN_PFSBOUNDARY) == 0 ||
            child->pmp == NULL) {
-               if ((child->flags & HAMMER2_CHAIN_MODIFIED) ||
-                   (child->update_xlo < info->sync_xid &&
-                    child->update_xlo < child->update_xhi)) {
+               if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
                        ++info->depth;
-                       hammer2_flush_core(info, &child, 0); /* XXX deleting */
+                       hammer2_flush_core(info, child, 0); /* XXX deleting */
                        --info->depth;
                }
        }
 
-       /*
-        * Determine if domodify should be set.  Do not otherwise adjust
-        * the child or pass2 will get confused.
-        *
-        * Insertion:
-        *      - child is flagged as possibly needing block table insertion.
-        *      - child not deleted or deletion is beyond transaction id
-        *      - child created beyond parent synchronization point
-        *      - parent not deleted as-of this transaction
-        */
-       if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) &&
-           child->delete_xid > trans->sync_xid &&
-           child->modify_xid > parent->update_xlo &&
-           parent->delete_xid > trans->sync_xid) {
-               info->domodify = 1;
-       }
-
-       /*
-        * Removal:
-        *      - child is flagged as possibly needing block table removal.
-        *      - child deleted before or during this transaction
-        *      - child created prior or during parent synchronization point
-        *      - parent not yet synchronized to child deletion
-        *      - parent not deleted as-of this transaction
-        */
-       if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) &&
-           child->delete_xid <= trans->sync_xid &&
-           child->modify_xid <= parent->update_xlo &&
-           child->delete_xid > parent->update_xlo &&
-           parent->delete_xid > trans->sync_xid) {
-               info->domodify = 1;
-       }
-
        /*
         * Relock to continue the loop
         */
@@ -1178,435 +938,13 @@ hammer2_flush_pass1(hammer2_chain_t *child, void *data)
        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
        hammer2_chain_drop(child);
        KKASSERT(info->parent == parent);
+       spin_lock(&parent->core.cst.spin);
 
-       spin_lock(&parent->core->cst.spin);
        return (0);
 }
 
-/*
- * PASS2 - BLOCKTABLE DELETIONS
- */
-static int
-hammer2_flush_pass2(hammer2_chain_t *child, void *data)
-{
-       hammer2_flush_info_t *info = data;
-       hammer2_chain_t *parent = info->parent;
-       hammer2_mount_t *hmp = child->hmp;
-       hammer2_trans_t *trans = info->trans;
-       hammer2_blockref_t *base;
-       int count;
-
-       /*
-        * Prefilter - Ignore children not flagged as needing a parent
-        *             blocktable update.
-        */
-       if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0)
-               return (0);
-
-       /*
-        * Prefilter - Ignore children created after our flush_tid (not
-        *             visible to our flush).
-        */
-       if (child->modify_xid > trans->sync_xid) {
-               KKASSERT(child->delete_xid >= child->modify_xid);
-               return 0;
-       }
-
-       /*
-        * Prefilter - Don't bother updating the blockrefs for a deleted
-        *             parent (from the flush's perspective).  Otherwise,
-        *             we need to be COUNTEDBREFS synchronized for the
-        *             hammer2_base_*() functions.
-        *
-        * NOTE: This test must match the similar one in flush_core.
-        */
-       if (h2ignore_deleted(info, parent))
-               return 0;
-
-       /*
-        * Calculate blockmap pointer
-        */
-       switch(parent->bref.type) {
-       case HAMMER2_BREF_TYPE_INODE:
-               /*
-                * Access the inode's block array.  However, there is no
-                * block array if the inode is flagged DIRECTDATA.  The
-                * DIRECTDATA case typicaly only occurs when a hardlink has
-                * been shifted up the tree and the original inode gets
-                * replaced with an OBJTYPE_HARDLINK placeholding inode.
-                */
-               if (parent->data &&
-                   (parent->data->ipdata.op_flags &
-                    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
-                       base = &parent->data->ipdata.u.blockset.blockref[0];
-               } else {
-                       base = NULL;
-               }
-               count = HAMMER2_SET_COUNT;
-               break;
-       case HAMMER2_BREF_TYPE_INDIRECT:
-       case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-               if (parent->data)
-                       base = &parent->data->npdata[0];
-               else
-                       base = NULL;
-               count = parent->bytes / sizeof(hammer2_blockref_t);
-               break;
-       case HAMMER2_BREF_TYPE_VOLUME:
-               base = &hmp->voldata.sroot_blockset.blockref[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       case HAMMER2_BREF_TYPE_FREEMAP:
-               base = &parent->data->npdata[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       default:
-               base = NULL;
-               count = 0;
-               panic("hammer2_flush_pass2: unrecognized blockref type: %d",
-                     parent->bref.type);
-       }
-
-       /*
-        * Removal
-        *      - child is flagged for removal
-        *      - child deleted before or during this transaction
-        *      - child created prior or during parent synchronization point
-        *      - parent not yet synchronized to child's deletion
-        */
-       if (child->delete_xid <= trans->sync_xid &&
-           child->modify_xid <= parent->update_xlo &&
-           child->delete_xid > parent->update_xlo) {
-               /* can't assert BMAPPED because state adjustment may occur
-                * before we are done, and BMAPPED only applies to the live
-                * parent.
-                *KKASSERT(child->flags & HAMMER2_CHAIN_BMAPPED);*/
-               if (base) {
-                       hammer2_rollup_stats(parent, child, -1);
-                       hammer2_base_delete(trans, parent, base, count,
-                                           &info->cache_index, child);
-               }
-       }
-
-       return 0;
-}
-
-/*
- * PASS3 - BLOCKTABLE INSERTIONS
- */
-static int
-hammer2_flush_pass3(hammer2_chain_t *child, void *data)
-{
-       hammer2_flush_info_t *info = data;
-       hammer2_chain_t *parent = info->parent;
-       hammer2_mount_t *hmp = child->hmp;
-       hammer2_trans_t *trans = info->trans;
-       hammer2_blockref_t *base;
-       int count;
-
-       /*
-        * Prefilter - Ignore children not flagged as needing a parent
-        *             blocktable update.
-        */
-       if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0)
-               return (0);
-
-       /*
-        * Prefilter - Ignore children created after our flush_tid (not
-        *             visible to our flush).
-        */
-       if (child->modify_xid > trans->sync_xid) {
-               KKASSERT(child->delete_xid >= child->modify_xid);
-               return 0;
-       }
-
-       /*
-        * Prefilter - Don't bother updating the blockrefs for a deleted
-        *             parent (from the flush's perspective).  Otherwise,
-        *             we need to be COUNTEDBREFS synchronized for the
-        *             hammer2_base_*() functions.
-        *
-        * NOTE: This test must match the similar one in flush_core.
-        */
-       if (h2ignore_deleted(info, parent))
-               return 0;
-
-       /*
-        * Calculate blockmap pointer
-        */
-       switch(parent->bref.type) {
-       case HAMMER2_BREF_TYPE_INODE:
-               /*
-                * Access the inode's block array.  However, there is no
-                * block array if the inode is flagged DIRECTDATA.  The
-                * DIRECTDATA case typicaly only occurs when a hardlink has
-                * been shifted up the tree and the original inode gets
-                * replaced with an OBJTYPE_HARDLINK placeholding inode.
-                */
-               if (parent->data &&
-                   (parent->data->ipdata.op_flags &
-                    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
-                       base = &parent->data->ipdata.u.blockset.blockref[0];
-               } else {
-                       base = NULL;
-               }
-               count = HAMMER2_SET_COUNT;
-               break;
-       case HAMMER2_BREF_TYPE_INDIRECT:
-       case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-               if (parent->data)
-                       base = &parent->data->npdata[0];
-               else
-                       base = NULL;
-               count = parent->bytes / sizeof(hammer2_blockref_t);
-               break;
-       case HAMMER2_BREF_TYPE_VOLUME:
-               base = &hmp->voldata.sroot_blockset.blockref[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       case HAMMER2_BREF_TYPE_FREEMAP:
-               base = &parent->data->npdata[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       default:
-               base = NULL;
-               count = 0;
-               panic("hammer2_flush_pass3: "
-                     "unrecognized blockref type: %d",
-                     parent->bref.type);
-       }
-
-       /*
-        * Insertion
-        *      - child is flagged as possibly needing block table insertion.
-        *      - child not deleted or deletion is beyond transaction id
-        *      - child created beyond parent synchronization point
-        */
-       if (child->delete_xid > trans->sync_xid &&
-           child->modify_xid > parent->update_xlo) {
-               if (base) {
-                       hammer2_rollup_stats(parent, child, 1);
-                       hammer2_base_insert(trans, parent, base, count,
-                                           &info->cache_index, child);
-               }
-       }
-
-       return 0;
-}
-
-/*
- * PASS4 - CLEANUP CHILDREN (non-recursive, but CAN be re-entrant)
- *
- * Adjust queues and set or clear BMAPPED appropriately if processing
- * the live parent.  pass4 handles deletions, pass5 handles insertions.
- * Separate passes are required because both deletions and insertions can
- * occur on dbtree.
- *
- * Cleanup FLUSH_CREATE/FLUSH_DELETE on the last parent.
- */
-static int
-hammer2_flush_pass4(hammer2_chain_t *child, void *data)
-{
-       hammer2_flush_info_t *info = data;
-       hammer2_chain_t *parent = info->parent;
-       hammer2_chain_core_t *above = child->above;
-       hammer2_trans_t *trans = info->trans;
-
-       /*
-        * Prefilter - Ignore children created after our flush_tid (not
-        *             visible to our flush).
-        */
-       if (child->modify_xid > trans->sync_xid) {
-               KKASSERT(child->delete_xid >= child->modify_xid);
-               return 0;
-       }
-
-       /*
-        * Ref and lock child for operation, spinlock must be temporarily
-        * Make sure child is referenced before we unlock.
-        */
-       hammer2_chain_ref(child);
-       spin_unlock(&above->cst.spin);
-       hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
-       KKASSERT(child->above == above);
-       KKASSERT(parent->core == above);
-
-       /*
-        * Adjust BMAPPED state and rbtree/queue only when we hit the
-        * actual live parent.
-        */
-       if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
-               spin_lock(&above->cst.spin);
-
-               /*
-                * Deleting from blockmap, move child out of dbtree
-                * and clear BMAPPED.  Child should not be on RBTREE.
-                */
-               if (child->delete_xid <= trans->sync_xid &&
-                   child->modify_xid <= parent->update_xlo &&
-                   child->delete_xid > parent->update_xlo &&
-                   (child->flags & HAMMER2_CHAIN_BMAPPED)) {
-                       KKASSERT(child->flags & HAMMER2_CHAIN_ONDBTREE);
-                       RB_REMOVE(hammer2_chain_tree, &above->dbtree, child);
-                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_ONDBTREE);
-                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_BMAPPED);
-               }
-
-               /*
-                * Not on any list, place child on DBQ
-                */
-               if ((child->flags & (HAMMER2_CHAIN_ONRBTREE |
-                                    HAMMER2_CHAIN_ONDBTREE |
-                                    HAMMER2_CHAIN_ONDBQ)) == 0) {
-                       KKASSERT((child->flags & HAMMER2_CHAIN_BMAPPED) == 0);
-                       TAILQ_INSERT_TAIL(&above->dbq, child, db_entry);
-                       atomic_set_int(&child->flags, HAMMER2_CHAIN_ONDBQ);
-               }
-               spin_unlock(&above->cst.spin);
-       }
-
-       /*
-        * Unlock the child.  This can wind up dropping the child's
-        * last ref, removing it from the parent's RB tree, and deallocating
-        * the structure.  The RB_SCAN() our caller is doing handles the
-        * situation.
-        */
-       hammer2_chain_unlock(child);
-       hammer2_chain_drop(child);
-       spin_lock(&above->cst.spin);
-
-       /*
-        * The parent may have been delete-duplicated.
-        */
-       return (0);
-}
-
-static int
-hammer2_flush_pass5(hammer2_chain_t *child, void *data)
-{
-       hammer2_flush_info_t *info = data;
-       hammer2_chain_t *parent = info->parent;
-       hammer2_chain_t *xchain;
-       hammer2_chain_core_t *above = child->above;
-       hammer2_trans_t *trans = info->trans;
-
-       /*
-        * Prefilter - Ignore children created after our flush_tid (not
-        *             visible to our flush).
-        */
-       if (child->modify_xid > trans->sync_xid) {
-               KKASSERT(child->delete_xid >= child->modify_xid);
-               return 0;
-       }
-
-       /*
-        * Ref and lock child for operation, spinlock must be temporarily
-        * Make sure child is referenced before we unlock.
-        */
-       hammer2_chain_ref(child);
-       spin_unlock(&above->cst.spin);
-       hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
-       KKASSERT(child->above == above);
-       KKASSERT(parent->core == above);
-
-       /*
-        * Adjust BMAPPED state and rbtree/queue only when we hit the
-        * actual live parent.
-        */
-       if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
-               spin_lock(&above->cst.spin);
-
-               /*
-                * Inserting into blockmap, place child in rbtree or dbtree.
-                */
-               if (child->delete_xid > trans->sync_xid &&
-                   child->modify_xid > parent->update_xlo &&
-                   (child->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
-                       if (child->flags & HAMMER2_CHAIN_ONDBQ) {
-                               TAILQ_REMOVE(&above->dbq, child, db_entry);
-                               atomic_clear_int(&child->flags,
-                                                HAMMER2_CHAIN_ONDBQ);
-                       }
-                       if ((child->flags & HAMMER2_CHAIN_DELETED) == 0 &&
-                           (child->flags & HAMMER2_CHAIN_ONRBTREE) == 0) {
-                               KKASSERT((child->flags &
-                                         (HAMMER2_CHAIN_ONDBTREE |
-                                          HAMMER2_CHAIN_ONDBQ)) == 0);
-                               xchain = RB_INSERT(hammer2_chain_tree,
-                                                  &above->rbtree, child);
-                               KKASSERT(xchain == NULL);
-                               atomic_set_int(&child->flags,
-                                              HAMMER2_CHAIN_ONRBTREE);
-                       } else
-                       if ((child->flags & HAMMER2_CHAIN_DELETED) &&
-                           (child->flags & HAMMER2_CHAIN_ONDBTREE) == 0) {
-                               KKASSERT((child->flags &
-                                         (HAMMER2_CHAIN_ONRBTREE |
-                                          HAMMER2_CHAIN_ONDBQ)) == 0);
-                               xchain = RB_INSERT(hammer2_chain_tree,
-                                                  &above->dbtree, child);
-                               KKASSERT(xchain == NULL);
-                               atomic_set_int(&child->flags,
-                                              HAMMER2_CHAIN_ONDBTREE);
-                       }
-                       atomic_set_int(&child->flags, HAMMER2_CHAIN_BMAPPED);
-                       KKASSERT(child->flags &
-                                (HAMMER2_CHAIN_ONRBTREE |
-                                 HAMMER2_CHAIN_ONDBTREE |
-                                 HAMMER2_CHAIN_ONDBQ));
-               }
-
-               /*
-                * Not on any list, place child on DBQ
-                */
-               if ((child->flags & (HAMMER2_CHAIN_ONRBTREE |
-                                    HAMMER2_CHAIN_ONDBTREE |
-                                    HAMMER2_CHAIN_ONDBQ)) == 0) {
-                       KKASSERT((child->flags & HAMMER2_CHAIN_BMAPPED) == 0);
-                       TAILQ_INSERT_TAIL(&above->dbq, child, db_entry);
-                       atomic_set_int(&child->flags, HAMMER2_CHAIN_ONDBQ);
-               }
-               spin_unlock(&above->cst.spin);
-       }
-
-       /*
-        * Cleanup flags on last parent iterated for flush.
-        */
-       if (info->domodify & 2) {
-               if (child->flags & HAMMER2_CHAIN_FLUSH_CREATE) {
-                       atomic_clear_int(&child->flags,
-                                        HAMMER2_CHAIN_FLUSH_CREATE);
-                       hammer2_chain_drop(child);
-               }
-               if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) &&
-                   child->delete_xid <= trans->sync_xid) {
-                       KKASSERT((parent->flags & HAMMER2_CHAIN_DELETED) ||
-                                (child->flags & HAMMER2_CHAIN_ONDBTREE) == 0);
-                       /* XXX delete-duplicate chain insertion mech wrong */
-                       KKASSERT((parent->flags & HAMMER2_CHAIN_DELETED) ||
-                                (child->flags & HAMMER2_CHAIN_BMAPPED) == 0);
-                       atomic_clear_int(&child->flags,
-                                        HAMMER2_CHAIN_FLUSH_DELETE);
-                       hammer2_chain_drop(child);
-               }
-       }
-
-       /*
-        * Unlock the child.  This can wind up dropping the child's
-        * last ref, removing it from the parent's RB tree, and deallocating
-        * the structure.  The RB_SCAN() our caller is doing handles the
-        * situation.
-        */
-       hammer2_chain_unlock(child);
-       hammer2_chain_drop(child);
-       spin_lock(&above->cst.spin);
-
-       /*
-        * The parent may have been delete-duplicated.
-        */
-       return (0);
-}
 
+#if 0
 void
 hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
 {
@@ -1654,3 +992,4 @@ hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
                parent->inode_count = 0;
        }
 }
+#endif
index 2ca555a..020b52a 100644 (file)
@@ -349,7 +349,7 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                                     HAMMER2_FREEMAP_LEVELN_PSIZE);
                KKASSERT(error == 0);
                if (error == 0) {
-                       hammer2_chain_modify(trans, &chain, 0);
+                       hammer2_chain_modify(trans, chain, 0);
                        bzero(&chain->data->bmdata[0],
                              HAMMER2_FREEMAP_LEVELN_PSIZE);
                        chain->bref.check.freemap.bigmask = (uint32_t)-1;
@@ -367,7 +367,7 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                /*
                 * Modify existing chain to setup for adjustment.
                 */
-               hammer2_chain_modify(trans, &chain, 0);
+               hammer2_chain_modify(trans, chain, 0);
        }
 
        /*
@@ -384,7 +384,7 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                start = (int)((iter->bnext - key) >>
                              HAMMER2_FREEMAP_LEVEL0_RADIX);
                KKASSERT(start >= 0 && start < HAMMER2_FREEMAP_COUNT);
-               hammer2_chain_modify(trans, &chain, 0);
+               hammer2_chain_modify(trans, chain, 0);
 
                error = ENOSPC;
                for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
@@ -815,7 +815,7 @@ hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                }
 
                if (error == 0) {
-                       hammer2_chain_modify(trans, &chain, 0);
+                       hammer2_chain_modify(trans, chain, 0);
                        bzero(&chain->data->bmdata[0],
                              HAMMER2_FREEMAP_LEVELN_PSIZE);
                        chain->bref.check.freemap.bigmask = (uint32_t)-1;
@@ -874,7 +874,7 @@ again:
                         */
                        if ((*bitmap & bmmask11) != bmmask11) {
                                if (modified == 0) {
-                                       hammer2_chain_modify(trans, &chain, 0);
+                                       hammer2_chain_modify(trans, chain, 0);
                                        modified = 1;
                                        goto again;
                                }
@@ -902,7 +902,7 @@ again:
                         * marked as being fully allocated.
                         */
                        if (!modified) {
-                               hammer2_chain_modify(trans, &chain, 0);
+                               hammer2_chain_modify(trans, chain, 0);
                                modified = 1;
                                goto again;
                        }
@@ -917,7 +917,7 @@ again:
                         */
                        if (how == HAMMER2_FREEMAP_DOREALFREE) {
                                if (!modified) {
-                                       hammer2_chain_modify(trans, &chain, 0);
+                                       hammer2_chain_modify(trans, chain, 0);
                                        modified = 1;
                                        goto again;
                                }
index 96090f3..1a69b49 100644 (file)
@@ -44,6 +44,7 @@
 #define INODE_DEBUG    0
 
 static void hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
+                                        hammer2_cluster_t **cparentp,
                                         hammer2_cluster_t **clusterp,
                                         hammer2_tid_t inum);
 
@@ -84,8 +85,6 @@ hammer2_inode_lock_ex(hammer2_inode_t *ip)
        const hammer2_inode_data_t *ipdata;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *chain;
-       hammer2_chain_t *ochain;
-       hammer2_chain_core_t *core;
        int error;
        int i;
 
@@ -103,25 +102,8 @@ hammer2_inode_lock_ex(hammer2_inode_t *ip)
                        kprintf("inode_lock: %p: missing chain\n", ip);
                        continue;
                }
-               core = chain->core;
-               for (;;) {
-                       if (chain->flags & HAMMER2_CHAIN_DUPLICATED) {
-                               spin_lock(&core->cst.spin);
-                               while (chain->flags & HAMMER2_CHAIN_DUPLICATED)
-                                       chain = TAILQ_NEXT(chain, core_entry);
-                               hammer2_chain_ref(chain);
-                               spin_unlock(&core->cst.spin);
-                               ochain = ip->cluster.array[i];
-                               ip->cluster.array[i] = chain;
-                               if (ip->cluster.focus == NULL)
-                                       ip->cluster.focus = chain;
-                               hammer2_chain_drop(ochain);
-                       }
-                       hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
-                       if ((chain->flags & HAMMER2_CHAIN_DUPLICATED) == 0)
-                               break;
-                       hammer2_chain_unlock(chain);
-               }
+
+               hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                cluster->array[i] = chain;
                if (cluster->focus == NULL)
                        cluster->focus = chain;
@@ -135,9 +117,7 @@ hammer2_inode_lock_ex(hammer2_inode_t *ip)
        ipdata = &hammer2_cluster_data(cluster)->ipdata;
        if (ipdata->type == HAMMER2_OBJTYPE_HARDLINK &&
            (cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0) {
-               error = hammer2_hardlink_find(ip->pip, cluster);
-               KKASSERT((cluster->focus->flags &
-                         HAMMER2_CHAIN_DUPLICATED) == 0);
+               error = hammer2_hardlink_find(ip->pip, NULL, cluster);
                KKASSERT(error == 0);
        }
 
@@ -167,7 +147,6 @@ hammer2_inode_lock_sh(hammer2_inode_t *ip)
 {
        const hammer2_inode_data_t *ipdata;
        hammer2_cluster_t *cluster;
-       hammer2_chain_core_t *core;
        hammer2_chain_t *chain;
        int error = 0;
        int i;
@@ -187,31 +166,8 @@ hammer2_inode_lock_sh(hammer2_inode_t *ip)
                        continue;
                }
 
-               core = chain->core;
-
-               if (chain->flags & HAMMER2_CHAIN_DUPLICATED)
-                       goto cycle_excl;
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS |
                                          HAMMER2_RESOLVE_SHARED);
-               if (chain->flags & HAMMER2_CHAIN_DUPLICATED) {
-                       hammer2_chain_unlock(chain);
-
-                       /*
-                        * Cycle exclusive inode lock and start the loop
-                        * over again.
-                        */
-cycle_excl:
-                       while (--i >= 0) {
-                               chain = cluster->array[i];
-                               cluster->array[i] = NULL;
-                               hammer2_chain_unlock(chain);
-                       }
-                       ccms_thread_unlock(&ip->topo_cst);
-                       hammer2_inode_unlock_ex(ip, hammer2_inode_lock_ex(ip));
-                       ccms_thread_lock(&ip->topo_cst, CCMS_STATE_SHARED);
-                       cluster->focus = NULL;
-                       continue;       /* restart at i=-1 -> i=0 on loop */
-               }
                cluster->array[i] = chain;
                if (cluster->focus == NULL)
                        cluster->focus = chain;
@@ -223,9 +179,7 @@ cycle_excl:
        ipdata = &hammer2_cluster_data(cluster)->ipdata;
        if (ipdata->type == HAMMER2_OBJTYPE_HARDLINK &&
            (cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0) {
-               error = hammer2_hardlink_find(ip->pip, cluster);
-               KKASSERT((cluster->focus->flags &
-                         HAMMER2_CHAIN_DUPLICATED) == 0);
+               error = hammer2_hardlink_find(ip->pip, NULL, cluster);
                KKASSERT(error == 0);
        }
 
@@ -808,11 +762,12 @@ retry:
 }
 
 /*
- * Shift *chainp up to the specified directory, change the filename
- * to "0xINODENUMBER", and adjust the key.  The chain becomes the
- * invisible hardlink target.
+ * The cluster has been removed from the original directory and replaced
+ * with a hardlink pointer.  Move the cluster to the specified parent
+ * directory, change the filename to "0xINODENUMBER", and adjust the key.
+ * The cluster becomes our invisible hardlink target.
  *
- * The original *chainp has already been marked deleted.
+ * The original cluster must be deleted on entry.
  */
 static
 void
@@ -842,7 +797,6 @@ hammer2_hardlink_shiftup(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
         * WARNING! Must use inode_lock_ex() on dip to handle a stale
         *          dip->cluster cache.
         */
-retry:
        *errorp = 0;
        xcluster = hammer2_cluster_lookup(dcluster, &key_dummy,
                                      lhc, lhc, 0, &ddflag);
@@ -858,35 +812,6 @@ retry:
 #endif
        }
 
-       /*
-        * Create entry in common parent directory using the seek position
-        * calculated above.
-        *
-        * We must refactor cluster because it might have been shifted into
-        * an indirect cluster by the create.
-        */
-       if (*errorp == 0) {
-               KKASSERT(xcluster == NULL);
-#if 0
-               *errorp = hammer2_cluster_create(trans, dcluster, &xcluster,
-                                              lhc, 0,
-                                              HAMMER2_BREF_TYPE_INODE,/* n/a */
-                                              HAMMER2_INODE_BYTES);   /* n/a */
-#endif
-               /*XXX this somehow isn't working on cluster XXX*/
-               /*KKASSERT(xxx)*/
-       }
-
-       /*
-        * Cleanup and handle retries.
-        */
-       if (*errorp == EAGAIN) {
-               kprintf("R");
-               hammer2_cluster_wait(dcluster);
-               hammer2_cluster_drop(dcluster);
-               goto retry;
-       }
-
        /*
         * Handle the error case
         */
@@ -910,7 +835,7 @@ retry:
        hammer2_cluster_bref(cluster, &bref);
        bref.key = lhc;                 /* invisible dir entry key */
        bref.keybits = 0;
-       hammer2_cluster_duplicate(trans, dcluster, cluster, &bref, 0, 2);
+       hammer2_cluster_rename(trans, &bref, dcluster, cluster);
 
        /*
         * cluster is now 'live' again.. adjust the filename.
@@ -930,17 +855,15 @@ retry:
 }
 
 /*
- * Connect the target inode represented by (*chainp) to the media topology
+ * Connect the target inode represented by (cluster) to the media topology
  * at (dip, name, len).  The caller can pass a rough *chainp, this function
  * will issue lookup()s to position the parent chain properly for the
  * chain insertion.
  *
  * If hlink is TRUE this function creates an OBJTYPE_HARDLINK directory
- * entry instead of connecting (*chainp).
+ * entry instead of connecting (cluster).
  *
- * If hlink is FALSE this function uses chain_duplicate() to make a copy
- * if (*chainp) in the directory entry.  (*chainp) is likely to be deleted
- * by the caller in this case (e.g. rename).
+ * If hlink is FALSE this function expects (cluster) to be unparented.
  */
 int
 hammer2_inode_connect(hammer2_trans_t *trans,
@@ -964,13 +887,12 @@ hammer2_inode_connect(hammer2_trans_t *trans,
         *
         * WARNING! Must use inode_lock_ex() on dip to handle a stale
         *          dip->cluster.
-        */
-       ocluster = *clusterp;
-
-       /*
+        *
         * If name is non-NULL we calculate lhc, else we use the passed-in
         * lhc.
         */
+       ocluster = *clusterp;
+
        if (name) {
                lhc = hammer2_dirhash(name, name_len);
 
@@ -1021,17 +943,11 @@ hammer2_inode_connect(hammer2_trans_t *trans,
                                                       lhc, 0,
                                                       HAMMER2_BREF_TYPE_INODE,
                                                       HAMMER2_INODE_BYTES);
-                       hammer2_cluster_refactor(ocluster);
                } else {
                        /*
-                        * Reconnect the original cluster and rename.  Use
-                        * cluster_duplicate().  The caller will likely delete
-                        * or has already deleted the original chain in
-                        * this case.
-                        *
-                        * NOTE: cluster_duplicate() generates a new cluster
-                        *       with CHAIN_DELETED cleared (ocluster typically
-                        *       has it set from the file unlink).
+                        * Reconnect the original cluster under the new name.
+                        * Original cluster must have already been deleted by
+                        * the caller.
                         *
                         * WARNING! Can cause held-over clusters to require a
                         *          refactor.  Fortunately we have none (our
@@ -1040,8 +956,6 @@ hammer2_inode_connect(hammer2_trans_t *trans,
                         */
                        ncluster = ocluster;
                        ocluster = NULL;
-                       hammer2_cluster_duplicate(trans, NULL, ncluster, NULL,
-                                                 0, 3);
                        error = hammer2_cluster_create(trans,
                                                       dcluster, &ncluster,
                                                       lhc, 0,
@@ -1098,8 +1012,8 @@ hammer2_inode_connect(hammer2_trans_t *trans,
        } else {
                /*
                 * ncluster is a duplicate of ocluster at the new location.
-                * We must fixup the name stored in oip.  The bref key
-                * has already been set up.
+                * We must fixup the name stored in the inode data.
+                * The bref key has already been adjusted by inode_connect().
                 */
                hammer2_cluster_modify(trans, ncluster, 0);
                wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
@@ -1216,16 +1130,21 @@ hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
  *
  *       The caller is responsible for fixing up ip->chain if e.g. a
  *       rename occurs (see chain_duplicate()).
+ *
+ * NOTE!  The chain is not deleted if it is moved to the hidden directory,
+ *       but otherwise will be deleted.
  */
 int
 hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
                    const uint8_t *name, size_t name_len,
-                   int isdir, int *hlinkp, struct nchandle *nch)
+                   int isdir, int *hlinkp, struct nchandle *nch,
+                   int nlinks)
 {
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        hammer2_cluster_t *cparent;
-       hammer2_cluster_t *ocluster;
+       hammer2_cluster_t *hcluster;
+       hammer2_cluster_t *hparent;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *dparent;
        hammer2_cluster_t *dcluster;
@@ -1237,9 +1156,11 @@ hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
        uint8_t type;
 
        error = 0;
-       ocluster = NULL;
+       hcluster = NULL;
+       hparent = NULL;
        lhc = hammer2_dirhash(name, name_len);
 
+again:
        /*
         * Search for the filename in the directory
         */
@@ -1291,23 +1212,24 @@ hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
 
        /*
         * Hardlink must be resolved.  We can't hold the parent locked
-        * while we do this or we could deadlock.
+        * while we do this or we could deadlock.  The physical file will
+        * be located at or above the current directory.
         *
-        * On success cluster will be adjusted to point at the hardlink target
-        * and ocluster will point to the hardlink pointer in the original
-        * directory.  Otherwise cluster remains pointing to the original.
+        * We loop to reacquire the hardlink origination.
         *
-        * Lock ownership is transfered to cluster.  ocluster is merely
-        * referenced.
+        * NOTE: hammer2_hardlink_find() will locate the hardlink target,
+        *       returning a modified hparent and hcluster.
         */
        if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK) {
-               hammer2_cluster_unlock(cparent);
-               cparent = NULL;
-
-               ocluster = cluster;
-               cluster = hammer2_cluster_copy(ocluster, 0);
-               error = hammer2_hardlink_find(dip, cluster);
-               KKASSERT(error == 0);
+               if (hcluster == NULL) {
+                       hcluster = cluster;
+                       hammer2_cluster_unlock(cparent);
+                       cparent = NULL; /* safety */
+                       error = hammer2_hardlink_find(dip, &hparent, hcluster);
+                       cluster = NULL; /* safety */
+                       KKASSERT(error == 0);
+                       goto again;
+               }
        }
 
        /*
@@ -1339,35 +1261,27 @@ hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
        }
 
        /*
-        * Ok, we can now unlink the cluster.  We always decrement nlinks even
-        * if the entry can be deleted in case someone has the file open and
-        * does an fstat().
-        *
-        * The cluster itself will no longer be in the on-media topology but
-        * can still be flushed to the media (e.g. if an open descriptor
-        * remains).  When the last vnode/ip ref goes away the cluster will
-        * be marked unmodified, avoiding any further (now unnecesary) I/O.
-        *
-        * A non-NULL ocluster indicates a hardlink.
+        * If this was a hardlink (cparent, cluster) is the hardlink
+        * pointer, which we can simply destroy outright.  Discard the
+        * clusters and replace with the hardlink target.
         */
-       if (ocluster) {
-               /*
-                * Delete the original hardlink pointer unconditionally.
-                * (any open descriptors will migrate to the hardlink
-                * target and have no affect on this operation).
-                *
-                * NOTE: parent from above is NULL when ocluster != NULL
-                *       so we can reuse it.
-                */
-               hammer2_cluster_lock(ocluster, HAMMER2_RESOLVE_ALWAYS);
-               hammer2_cluster_delete(trans, ocluster, 0);
-               hammer2_cluster_unlock(ocluster);
+       if (hcluster) {
+               hammer2_cluster_delete(trans, cparent, cluster,
+                                      HAMMER2_DELETE_PERMANENT);
+               hammer2_cluster_unlock(cparent);
+               hammer2_cluster_unlock(cluster);
+               cparent = hparent;
+               cluster = hcluster;
+               hparent = NULL;
+               hcluster = NULL;
        }
 
        /*
-        * Decrement nlinks on the hardlink target (or original file if
-        * there it was not hardlinked).  Delete the target when nlinks
-        * reaches 0 with special handling if (isopen) is set.
+        * This leaves us with the hardlink target or non-hardlinked file
+        * or directory in (cparent, cluster).
+        *
+        * Delete the target when nlinks reaches 0 with special handling
+        * if (isopen) is set.
         *
         * NOTE! In DragonFly the vnops function calls cache_unlink() after
         *       calling us here to clean out the namecache association,
@@ -1383,7 +1297,7 @@ hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
        hammer2_cluster_modify(trans, cluster, 0);
        wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
        ripdata = wipdata;
-       --wipdata->nlinks;
+       wipdata->nlinks += nlinks;
        if ((int64_t)wipdata->nlinks < 0) {     /* XXX debugging */
                wipdata->nlinks = 0;
        }
@@ -1399,24 +1313,31 @@ hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
                        goto done;
                }
                if (nch && cache_isopen(nch)) {
-                       kprintf("WARNING: unlinking open file\n");
-                       hammer2_cluster_set_chainflags(cluster,
-                                                       HAMMER2_CHAIN_UNLINKED);
-                       hammer2_inode_move_to_hidden(trans, &cluster,
+                       hammer2_inode_move_to_hidden(trans, &cparent, &cluster,
                                                     wipdata->inum);
                } else {
-                       hammer2_cluster_delete(trans, cluster, 0);
+                       hammer2_cluster_delete(trans, cparent, cluster, 0);
                }
+       } else if (*hlinkp == 0) {
+               /*
+                * If this wasn't a hardlinked file and wipdata->nlinks is
+                * still non-zero, the adjustment should be 0 (i.e. a rename),
+                * in which case we delete the object so the rename code can
+                * reconnect it elsewhere.
+                */
+               KKASSERT(nlinks == 0);
+               hammer2_cluster_delete(trans, cparent, cluster, 0);
        }
        error = 0;
 done:
+       if (cparent)
+               hammer2_cluster_unlock(cparent);
        if (cluster)
                hammer2_cluster_unlock(cluster);
-       if (cparent)
-               hammer2_cluster_lookup_done(cparent);
-       if (ocluster) {
-               hammer2_cluster_drop(ocluster);
-       }
+       if (hparent)
+               hammer2_cluster_unlock(hparent);
+       if (hcluster)
+               hammer2_cluster_unlock(hcluster);
 
        return error;
 }
@@ -1467,7 +1388,8 @@ hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp)
                while (scan) {
                        if (hammer2_cluster_type(scan) ==
                            HAMMER2_BREF_TYPE_INODE) {
-                               hammer2_cluster_delete(&trans, scan, 0);
+                               hammer2_cluster_delete(&trans, cluster, scan,
+                                                  HAMMER2_DELETE_PERMANENT);
                                ++count;
                        }
                        scan = hammer2_cluster_next(cluster, scan, &key_next,
@@ -1517,7 +1439,9 @@ hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp)
 static
 void
 hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
-                            hammer2_cluster_t **clusterp, hammer2_tid_t inum)
+                            hammer2_cluster_t **cparentp,
+                            hammer2_cluster_t **clusterp,
+                            hammer2_tid_t inum)
 {
        hammer2_cluster_t *dcluster;
        hammer2_pfsmount_t *pmp;
@@ -1527,7 +1451,7 @@ hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
        KKASSERT(pmp != NULL);
        KKASSERT(pmp->ihidden != NULL);
 
-       hammer2_cluster_delete(trans, *clusterp, 0);
+       hammer2_cluster_delete(trans, *cparentp, *clusterp, 0);
        dcluster = hammer2_inode_lock_ex(pmp->ihidden);
        error = hammer2_inode_connect(trans, clusterp, 0,
                                      pmp->ihidden, dcluster,
@@ -1537,9 +1461,11 @@ hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
 }
 
 /*
- * Given an exclusively locked inode and cluster we consolidate its cluster
+ * Given an exclusively locked inode and cluster we consolidate the cluster
  * for hardlink creation, adding (nlinks) to the file's link count and
- * potentially relocating the inode to a directory common to ip->pip and tdip.
+ * potentially relocating the inode to (cdip) which is a parent directory
+ * common to both the current location of the inode and the intended new
+ * hardlink.
  *
  * Replaces (*clusterp) if consolidation occurred, unlocking the old cluster
  * and returning a new locked cluster.
@@ -1557,7 +1483,7 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans,
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        hammer2_cluster_t *cluster;
-       hammer2_cluster_t *ncluster;
+       hammer2_cluster_t *cparent;
        int error;
 
        cluster = *clusterp;
@@ -1573,6 +1499,8 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans,
                return (ENOTSUP);
        }
 
+       cparent = NULL;
+
        /*
         * If no change in the hardlink's target directory is required and
         * this is already a hardlink target, all we need to do is adjust
@@ -1592,33 +1520,46 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans,
                goto done;
        }
 
-
        /*
-        * cluster is the real inode.  If it's visible we have to convert it
-        * to a hardlink pointer.  If it is not visible then it is already
-        * a hardlink target and only needs to be deleted.
+        * Cluster is the real inode.  The originating directory is locked
+        * by the caller so we can manipulate it without worrying about races
+        * against other lookups.
+        *
+        * If cluster is visible we need to delete it from the current
+        * location and create a hardlink pointer in its place.  If it is
+        * not visible we need only delete it.  Then later cluster will be
+        * renamed to a parent directory and converted (if necessary) to
+        * a hidden inode (via shiftup).
+        *
+        * NOTE! We must hold cparent locked through the delete/create/rename
+        *       operation to ensure that other threads block resolving to
+        *       the same hardlink, otherwise the other threads may not see
+        *       the hardlink.
         */
        KKASSERT((cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0);
+       cparent = hammer2_cluster_parent(cluster);
+
+       hammer2_cluster_delete(trans, cparent, cluster, 0);
+
        ripdata = &hammer2_cluster_data(cluster)->ipdata;
        KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
        if (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) {
-               /*
-                * We are going to duplicate cluster later, causing its
-                * media block to be shifted to the duplicate.  Even though
-                * we are delete-duplicating ncluster here it might decide not
-                * to reallocate the block.  Set FORCECOW to force it to.
-                */
-               ncluster = hammer2_cluster_copy(cluster,
-                                               HAMMER2_CLUSTER_COPY_NOREF);
-               hammer2_cluster_lock(ncluster, HAMMER2_RESOLVE_ALWAYS);
-               hammer2_cluster_set_chainflags(ncluster,
-                                              HAMMER2_CHAIN_FORCECOW);
-               hammer2_cluster_delete_duplicate(trans, ncluster,
-                                                HAMMER2_DELDUP_RECORE);
-               KKASSERT((ncluster->focus->flags &
-                        HAMMER2_CHAIN_DUPLICATED) == 0);
+               hammer2_cluster_t *ncluster;
+               hammer2_key_t lhc;
+
+               ncluster = NULL;
+               lhc = cluster->focus->bref.key;
+               error = hammer2_cluster_create(trans, cparent, &ncluster,
+                                            lhc, 0,
+                                            HAMMER2_BREF_TYPE_INODE,
+                                            HAMMER2_INODE_BYTES);
+               hammer2_cluster_modify(trans, ncluster, 0);
                wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
-               wipdata->target_type = wipdata->type;
+
+               wipdata->comp_algo = ripdata->comp_algo;
+               wipdata->version = HAMMER2_INODE_VERSION_ONE;
+               wipdata->inum = ripdata->inum;
+               wipdata->target_type = ripdata->type;
                wipdata->type = HAMMER2_OBJTYPE_HARDLINK;
                wipdata->uflags = 0;
                wipdata->rmajor = 0;
@@ -1646,11 +1587,12 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans,
                wipdata->attr_tid = 0;
                wipdata->dirent_tid = 0;
                bzero(&wipdata->u, sizeof(wipdata->u));
+               bcopy(ripdata->filename, wipdata->filename, ripdata->name_len);
+               wipdata->name_key = ncluster->focus->bref.key;
+               wipdata->name_len = ripdata->name_len;
                /* XXX transaction ids */
                hammer2_cluster_modsync(ncluster);
-       } else {
-               hammer2_cluster_delete(trans, cluster, 0);
-               ncluster = NULL;
+               hammer2_cluster_unlock(ncluster);
        }
        ripdata = wipdata;
 
@@ -1669,17 +1611,14 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans,
        if (error == 0)
                hammer2_inode_repoint(ip, cdip, cluster);
 
-       /*
-        * Unlock and destroy ncluster.
-        * Return the shifted cluster in *clusterp.
-        */
-       if (ncluster)
-               hammer2_cluster_unlock(ncluster);
-
 done:
        /*
         * Cleanup, cluster/ncluster already dealt with.
+        *
+        * Return the shifted cluster in *clusterp.
         */
+       if (cparent)
+               hammer2_cluster_unlock(cparent);
        *clusterp = cluster;
        hammer2_inode_drop(cdip);
 
@@ -1706,20 +1645,18 @@ hammer2_hardlink_deconsolidate(hammer2_trans_t *trans,
 }
 
 /*
- * The caller presents a locked *chainp pointing to a HAMMER2_BREF_TYPE_INODE
- * with an obj_type of HAMMER2_OBJTYPE_HARDLINK.  This routine will gobble
- * the *chainp and return a new locked *chainp representing the file target
- * (the original *chainp will be unlocked).
- *
- * When a match is found the chain representing the original HARDLINK
- * will be returned in *ochainp with a ref, but not locked.
+ * The caller presents a locked cluster with an obj_type of
+ * HAMMER2_OBJTYPE_HARDLINK.  This routine will replace the cluster with
+ * the target hardlink (which typically exists in some parent directory as
+ * a hidden file).  If cparentp is not NULL a locked cluster representing
+ * the hardlink's parent is also returned.
  *
- * When no match is found *chainp is set to NULL and EIO is returned.
- * (*ochainp) will still be set to the original chain with a ref but not
- * locked.
+ * If no match is found EIO is returned, *cparentp will be set to NULL,
+ * and the cluster will be unlocked and eaten up.
  */
 int
-hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_cluster_t *cluster)
+hammer2_hardlink_find(hammer2_inode_t *dip,
+                     hammer2_cluster_t **cparentp, hammer2_cluster_t *cluster)
 {
        const hammer2_inode_data_t *ipdata;
        hammer2_cluster_t *cparent;
@@ -1750,6 +1687,7 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_cluster_t *cluster)
        cluster->nchains = 0;                   /* hack */
 
        rcluster = NULL;
+       cparent = NULL;
 
        while ((ip = pip) != NULL) {
                cparent = hammer2_inode_lock_ex(ip);
@@ -1758,9 +1696,9 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_cluster_t *cluster)
                         HAMMER2_BREF_TYPE_INODE);
                rcluster = hammer2_cluster_lookup(cparent, &key_dummy,
                                             lhc, lhc, 0, &ddflag);
-               hammer2_cluster_lookup_done(cparent);   /* discard parent */
                if (rcluster)
                        break;
+               hammer2_cluster_lookup_done(cparent);   /* discard parent */
                pip = ip->pip;          /* safe, ip held locked */
                if (pip)
                        hammer2_inode_ref(pip);         /* loop */
@@ -1773,14 +1711,19 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_cluster_t *cluster)
         *
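         * (cparent is still held if rcluster was found, because the loop
         * breaks out before discarding it; it is then either handed back
         * via *cparentp or released below).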
         */
-       if (ip)
-               hammer2_inode_unlock_ex(ip, NULL);
-
        if (rcluster) {
                hammer2_cluster_replace(cluster, rcluster);
                hammer2_cluster_drop(rcluster);
+               if (cparentp)
+                       *cparentp = cparent;
+               else
+                       hammer2_inode_unlock_ex(ip, cparent);
                return (0);
        } else {
+               if (cparentp)
+                       *cparentp = NULL;
+               if (ip)
+                       hammer2_inode_unlock_ex(ip, cparent);
                return (EIO);
        }
 }
@@ -1888,7 +1831,8 @@ hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip,
                                cluster = NULL;
                                break;
                        case HAMMER2_BREF_TYPE_DATA:
-                               hammer2_cluster_delete(trans, cluster, 0);
+                               hammer2_cluster_delete(trans, dparent, cluster,
+                                                  HAMMER2_DELETE_PERMANENT);
                                /* fall through */
                        default:
                                cluster = hammer2_cluster_next(dparent, cluster,
index 288bf36..5f9ab11 100644 (file)
@@ -575,7 +575,7 @@ hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data)
        hammer2_trans_init(&trans, ip->pmp, 0);
        error = hammer2_unlink_file(&trans, hmp->spmp->iroot,
                                    pfs->name, strlen(pfs->name),
-                                   2, NULL, NULL);
+                                   2, NULL, NULL, -1);
        hammer2_trans_done(&trans);
 
        return (error);
index 0f2772f..664a29e 100644 (file)
 void
 hammer2_mount_exlock(hammer2_mount_t *hmp)
 {
-       ccms_thread_lock(&hmp->vchain.core->cst, CCMS_STATE_EXCLUSIVE);
+       ccms_thread_lock(&hmp->vchain.core.cst, CCMS_STATE_EXCLUSIVE);
 }
 
 void
 hammer2_mount_shlock(hammer2_mount_t *hmp)
 {
-       ccms_thread_lock(&hmp->vchain.core->cst, CCMS_STATE_SHARED);
+       ccms_thread_lock(&hmp->vchain.core.cst, CCMS_STATE_SHARED);
 }
 
 void
 hammer2_mount_unlock(hammer2_mount_t *hmp)
 {
-       ccms_thread_unlock(&hmp->vchain.core->cst);
+       ccms_thread_unlock(&hmp->vchain.core.cst);
 }
 
 /*
index c2fdb42..bae81b0 100644 (file)
@@ -334,7 +334,7 @@ hammer2_pfsalloc(const hammer2_inode_data_t *ipdata, hammer2_tid_t alloc_tid)
        spin_init(&pmp->inum_spin);
        RB_INIT(&pmp->inum_tree);
        TAILQ_INIT(&pmp->unlinkq);
-       spin_init(&pmp->unlinkq_spin);
+       spin_init(&pmp->list_spin);
 
        pmp->alloc_tid = alloc_tid + 1;   /* our first media transaction id */
        pmp->flush_tid = pmp->alloc_tid;
@@ -525,6 +525,9 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
+               spin_init(&hmp->io_spin);
+               spin_init(&hmp->list_spin);
+               TAILQ_INIT(&hmp->flushq);
 
                lockinit(&hmp->vollk, "h2vol", 0, 0);
 
@@ -540,9 +543,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
-               hmp->vchain.delete_xid = HAMMER2_XID_MAX;
 
-               hammer2_chain_core_alloc(NULL, &hmp->vchain, NULL);
+               hammer2_chain_core_alloc(NULL, &hmp->vchain);
                /* hmp->vchain.u.xxx is left NULL */
 
                /*
@@ -564,9 +566,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
-               hmp->fchain.delete_xid = HAMMER2_XID_MAX;
 
-               hammer2_chain_core_alloc(NULL, &hmp->fchain, NULL);
+               hammer2_chain_core_alloc(NULL, &hmp->fchain);
                /* hmp->fchain.u.xxx is left NULL */
 
                /*
@@ -596,15 +597,9 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                xid = 0;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
-               hmp->vchain.modify_xid = xid;
-               hmp->vchain.update_xlo = xid;
-               hmp->vchain.update_xhi = xid;
                hmp->vchain.pmp = spmp;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
-               hmp->fchain.modify_xid = xid;
-               hmp->fchain.update_xlo = xid;
-               hmp->fchain.update_xhi = xid;
                hmp->fchain.pmp = spmp;
 
                /*
@@ -664,8 +659,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                 */
                kdmsg_iocom_init(&hmp->iocom, hmp,
                                 KDMSG_IOCOMF_AUTOCONN |
-                                KDMSG_IOCOMF_AUTORXSPAN |
-                                KDMSG_IOCOMF_AUTORXCIRC,
+                                KDMSG_IOCOMF_AUTORXSPAN,
                                 hmp->mchain, hammer2_rcvdmsg);
 
                /*
@@ -717,7 +711,6 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 
        for (i = 0; i < cluster->nchains; ++i) {
                rchain = cluster->array[i];
-               KKASSERT(rchain->pmp == NULL);
                if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
                        kprintf("hammer2_mount: PFS label already mounted!\n");
                        hammer2_cluster_unlock(cluster);
@@ -727,6 +720,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EBUSY;
                }
+               KKASSERT(rchain->pmp == NULL);
 #if 0
                if (rchain->flags & HAMMER2_CHAIN_RECYCLE) {
                        kprintf("hammer2_mount: PFS label is recycling\n");
@@ -1334,8 +1328,10 @@ hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
                        }
 
                        /*
-                        * Device buffer is now valid, chain is no
-                        * longer in the initial state.
+                        * Device buffer is now valid, chain is no longer in
+                        * the initial state.
+                        *
+                        * (No blockref table worries with file data)
                         */
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
 
@@ -1417,9 +1413,9 @@ test_block_zeros(const char *buf, size_t bytes)
 static
 void
 zero_write(struct buf *bp, hammer2_trans_t *trans,
-       hammer2_inode_t *ip, const hammer2_inode_data_t *ipdata,
-       hammer2_cluster_t *cparent,
-       hammer2_key_t lbase, int *errorp __unused)
+          hammer2_inode_t *ip, const hammer2_inode_data_t *ipdata,
+          hammer2_cluster_t *cparent,
+          hammer2_key_t lbase, int *errorp __unused)
 {
        hammer2_cluster_t *cluster;
        hammer2_media_data_t *data;
@@ -1438,7 +1434,8 @@ zero_write(struct buf *bp, hammer2_trans_t *trans,
                        bzero(data->ipdata.u.data, HAMMER2_EMBEDDED_BYTES);
                        hammer2_cluster_modsync(cluster);
                } else {
-                       hammer2_cluster_delete(trans, cluster, 0);
+                       hammer2_cluster_delete(trans, cparent, cluster,
+                                              HAMMER2_DELETE_PERMANENT);
                }
                hammer2_cluster_unlock(cluster);
        }
@@ -1498,8 +1495,10 @@ hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
                        bcopy(bp->b_data, bdata, chain->bytes);
 
                        /*
-                        * Device buffer is now valid, chain is no
-                        * longer in the initial state.
+                        * Device buffer is now valid, chain is no longer in
+                        * the initial state.
+                        *
+                        * (No blockref table worries with file data)
                         */
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
 
@@ -1679,31 +1678,21 @@ hammer2_vfs_unmount_hmp1(struct mount *mp, hammer2_mount_t *hmp)
         * recovering from a crash).
         */
        hammer2_voldata_lock(hmp);
-       if (((hmp->vchain.flags | hmp->fchain.flags) &
-            HAMMER2_CHAIN_MODIFIED) ||
-           hmp->vchain.update_xhi > hmp->vchain.update_xlo ||
-           hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
+       if ((hmp->vchain.flags | hmp->fchain.flags) &
+           HAMMER2_CHAIN_FLUSH_MASK) {
                hammer2_voldata_unlock(hmp);
                hammer2_vfs_sync(mp, MNT_WAIT);
-               /*hammer2_vfs_sync(mp, MNT_WAIT);*/
+               hammer2_vfs_sync(mp, MNT_WAIT);
        } else {
                hammer2_voldata_unlock(hmp);
        }
        if (hmp->pmp_count == 0) {
-               if (((hmp->vchain.flags | hmp->fchain.flags) &
-                    HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->vchain.update_xhi > hmp->vchain.update_xlo ||
-                   hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
+               if ((hmp->vchain.flags | hmp->fchain.flags) &
+                   HAMMER2_CHAIN_FLUSH_MASK) {
                        kprintf("hammer2_unmount: chains left over "
                                "after final sync\n");
-                       kprintf("    vchain %08x update_xlo/hi %08x/%08x\n",
-                               hmp->vchain.flags,
-                               hmp->vchain.update_xlo,
-                               hmp->vchain.update_xhi);
-                       kprintf("    fchain %08x update_xhi/hi %08x/%08x\n",
-                               hmp->fchain.flags,
-                               hmp->fchain.update_xlo,
-                               hmp->fchain.update_xhi);
+                       kprintf("    vchain %08x\n", hmp->vchain.flags);
+                       kprintf("    fchain %08x\n", hmp->fchain.flags);
 
                        if (hammer2_debug & 0x0010)
                                Debugger("entered debugger");
@@ -1760,32 +1749,24 @@ hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp)
                if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
                        atomic_clear_int(&hmp->vchain.flags,
                                         HAMMER2_CHAIN_MODIFIED);
+                       hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
                        hammer2_chain_drop(&hmp->vchain);
                }
-               if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_CREATE) {
+               if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
                        atomic_clear_int(&hmp->vchain.flags,
-                                        HAMMER2_CHAIN_FLUSH_CREATE);
-                       hammer2_chain_drop(&hmp->vchain);
-               }
-               if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_DELETE) {
-                       atomic_clear_int(&hmp->vchain.flags,
-                                        HAMMER2_CHAIN_FLUSH_DELETE);
+                                        HAMMER2_CHAIN_UPDATE);
                        hammer2_chain_drop(&hmp->vchain);
                }
 
                if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
                        atomic_clear_int(&hmp->fchain.flags,
                                         HAMMER2_CHAIN_MODIFIED);
+                       hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
                        hammer2_chain_drop(&hmp->fchain);
                }
-               if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_CREATE) {
-                       atomic_clear_int(&hmp->fchain.flags,
-                                        HAMMER2_CHAIN_FLUSH_CREATE);
-                       hammer2_chain_drop(&hmp->fchain);
-               }
-               if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_DELETE) {
+               if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
                        atomic_clear_int(&hmp->fchain.flags,
-                                        HAMMER2_CHAIN_FLUSH_DELETE);
+                                        HAMMER2_CHAIN_UPDATE);
                        hammer2_chain_drop(&hmp->fchain);
                }
 
@@ -2191,25 +2172,9 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                chain = iroot->cluster.array[i];
                if (chain) {
                        hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
-                       hammer2_flush(&info.trans, &chain);
+                       hammer2_flush(&info.trans, chain);
                        hammer2_chain_unlock(chain);
                }
-               if (chain) {
-                       hammer2_chain_t *nchain;
-                       chain = TAILQ_FIRST(&chain->core->ownerq);
-                       hammer2_chain_ref(chain);
-                       while (chain) {
-                               hammer2_chain_lock(chain,
-                                                  HAMMER2_RESOLVE_ALWAYS);
-                               hammer2_flush(&info.trans, &chain);
-                               hammer2_chain_unlock(chain);
-                               nchain = TAILQ_NEXT(chain, core_entry);
-                               if (nchain)
-                                       hammer2_chain_ref(nchain);
-                               hammer2_chain_drop(chain);
-                               chain = nchain;
-                       }
-               }
        }
 #if 0
        hammer2_trans_done(&info.trans);
@@ -2249,9 +2214,9 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                 * allow the flush code to find the transition point and
                 * then update on the way back up.
                 */
-               parent = TAILQ_LAST(&chain->above->ownerq, h2_core_list);
+               parent = chain->parent;
                KKASSERT(chain->pmp != parent->pmp);
-               hammer2_chain_setsubmod(&info.trans, parent);
+               hammer2_chain_setflush(&info.trans, parent);
 
                /*
                 * Media mounts have two 'roots', vchain for the topology
@@ -2264,25 +2229,23 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                 */
                hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
-               if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
+               if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
                        /*
                         * This will also modify vchain as a side effect,
                         * so mark vchain as modified now.
                         */
                        hammer2_voldata_modify(hmp);
                        chain = &hmp->fchain;
-                       hammer2_flush(&info.trans, &chain);
+                       hammer2_flush(&info.trans, chain);
                        KKASSERT(chain == &hmp->fchain);
                }
                hammer2_chain_unlock(&hmp->fchain);
                hammer2_chain_unlock(&hmp->vchain);
 
                hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
-               if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->vchain.update_xhi > hmp->vchain.update_xlo) {
+               if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
                        chain = &hmp->vchain;
-                       hammer2_flush(&info.trans, &chain);
+                       hammer2_flush(&info.trans, chain);
                        KKASSERT(chain == &hmp->vchain);
                        force_fchain = 1;
                } else {
@@ -2292,12 +2255,11 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
 
 #if 0
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
-               if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->fchain.update_xhi > hmp->fchain.update_xlo ||
+               if ((hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) ||
                    force_fchain) {
                        /* this will also modify vchain as a side effect */
                        chain = &hmp->fchain;
-                       hammer2_flush(&info.trans, &chain);
+                       hammer2_flush(&info.trans, chain);
                        KKASSERT(chain&nb