hammer2 - Major restructuring, part 1/several
author Matthew Dillon <dillon@apollo.backplane.com>
Sun, 28 Apr 2013 05:38:20 +0000 (22:38 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sun, 28 Apr 2013 07:37:59 +0000 (00:37 -0700)
* This breaks a lot of things.  The next few commits will get it all
  working again.

* Significantly rework the data structures.  Instead of embedding the
  RBTREE for a chain's children in the chain, the chain instead points
  to a secondary structure containing the RBTREE.

  Chains can no longer be moved within the in-memory topology.  That is,
  if a file is renamed or a block is resized or a block is moved into or
  out of an indirect block, the in-memory chain representing that block
  is NOT moved.  Instead, the in-memory chain is marked deleted and a
  copy is created at the new location.  Both the old and the new chain
  reference the same secondary structure and thus share the same RBTREE,
  and reference the same media storage.

  In addition, chain->duplink points from the deleted chain to its relocated
  copy and maintains a reference on the target until the deleted chain is
  deallocated.  It is possible for the linked list to span more than one
  element.

  This link will soon be used to retarget inode->chain pointers (which can
  wind up pointing to stale data) and also eventually affect chain->parent
  traversals (real parent becomes chain->parent->[duplink*]).  A rethink
  might be needed down the line.

* This will allow the flush code to run 100% asynchronous from the
  frontend and still be able to flush to a synchronization point no
  matter how complex a set of changes have occurred to the filesystem
  concurrent to the flush (but after its synchronization point).

* The change also stabilizes chain->parent, which simplifies quite a bit
  of code.

* Simplify nearly all the hammer2_chain_*() API functions, and other
  functions.

* Add a hammer2_trans (transaction) structure to keep track of modifying
  transactions.  This will be fleshed out later and used to detect flush
  synchronization points.  It currently contains the transaction id.

* Start adding API infrastructure and start reworking the flush and other
  tree-modifying code to work under the new abstraction.

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_ccms.c
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_msgops.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index d1bc94f..86ea530 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -79,48 +79,78 @@ struct hammer2_state;
 struct hammer2_msg;
 
 /*
- * The chain structure tracks blockref recursions all the way to
- * the root volume.  These consist of indirect blocks, inodes,
- * and eventually the volume header.
+ * The chain structure tracks blockref recursions all the way to the root
+ * volume.  These consist of indirect blocks, inodes, and eventually the
+ * volume header itself.
  *
- * The chain structure is embedded in the hammer2_mount, hammer2_inode,
- * and other system memory structures.  The chain structure typically
- * implements the reference count and busy flag for the larger structure.
+ * In situations where a duplicate is needed to represent different snapshots
+ * or flush points a new chain will be allocated but associated with the
+ * same shared chain_core.  The RBTREE is contained in the shared chain_core
+ * and entries in the RBTREE are versioned.
  *
- * It is always possible to track a chain element all the way back to the
- * root by following the (parent) links.  (index) is a type-dependent index
- * in the parent indicating where in the parent the chain element resides.
+ * Duplication can occur whenever a chain must be modified.  Note that
+ * a deletion is not considered a modification.
  *
- * When a blockref is added or deleted the related chain element is marked
- * modified and all of its parents are marked SUBMODIFIED (the parent
- * recursion can stop once we hit a node that is already marked SUBMODIFIED).
- * A deleted chain element must remain intact until synchronized against
- * its parent.
+ *     (a) General modifications at data leafs
+ *     (b) When a chain is resized
+ *     (c) When a chain's blockref array is updated
+ *     (d) When a chain is renamed
+ *     (e) When a chain is moved (when an indirect block is split)
  *
- * The blockref at (parent, index) is not adjusted until the modified chain
- * element is flushed and unmarked.  Until then the child's blockref may
- * not match the blockref at (parent, index).
+ * Advantages:
+ *
+ *     (1) Fully coherent snapshots can be taken without requiring
+ *         a pre-flush, resulting in extremely fast (sub-millisecond)
+ *         snapshots.
+ *
+ *     (2) Multiple synchronization points can be in-flight at the same
+ *         time, representing multiple snapshots or flushes.
+ *
+ *     (3) The algorithms needed to keep track of everything are actually
+ *         not that complex.
+ *
+ * Special Considerations:
+ *
+ *     A chain is ref-counted on a per-chain basis, but the chain's lock
+ *     is associated with the shared chain_core and is not per-chain.
+ *
+ *     Each chain is representative of a filesystem topology.  Even
+ *     though the shared chain_core's are effectively multi-homed, the
+ *     chain structure is not.
+ *
+ *     chain->parent is a stable pointer and can be iterated without locking
+ *     as long as either the chain or *any* deep child under the chain
+ *     is held.
  */
 RB_HEAD(hammer2_chain_tree, hammer2_chain);
 TAILQ_HEAD(flush_deferral_list, hammer2_chain);
 
+struct hammer2_chain_core {
+       struct ccms_cst cst;
+       u_int           sharecnt;
+       struct hammer2_chain_tree rbtree;
+};
+
+typedef struct hammer2_chain_core hammer2_chain_core_t;
+
 struct hammer2_chain {
-       ccms_cst_t      cst;                    /* attr or data cst */
-       struct hammer2_blockref bref;
-       struct hammer2_blockref bref_flush;     /* synchronized w/MOVED bit */
-       struct hammer2_chain    *parent;        /* return chain to root */
-       struct hammer2_chain_tree rbhead;
-       struct hammer2_state    *state;         /* if active cache msg */
        RB_ENTRY(hammer2_chain) rbnode;
-       TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush deferral list */
-
-       struct buf      *bp;            /* buffer cache (ro) */
-       hammer2_media_data_t *data;     /* modified copy of data (rw) */
-       u_int           bytes;          /* physical size of data */
-       int             index;          /* index in parent */
-       u_int           flushing;       /* element undergoing flush (count) */
-       u_int           refs;
+       hammer2_blockref_t      bref;
+       hammer2_chain_core_t    *core;
+       struct hammer2_chain    *parent;
+       struct hammer2_state    *state;         /* if active cache msg */
+       struct hammer2_mount    *hmp;
+       struct hammer2_chain    *duplink;       /* duplication link */
+
+       hammer2_tid_t   create_tid;             /* snapshot/flush filter */
+       hammer2_tid_t   delete_tid;
+       struct buf      *bp;                    /* physical data buffer */
+       u_int           bytes;                  /* physical data size */
+       int             index;                  /* blockref index in parent */
        u_int           flags;
+       u_int           refs;
+       hammer2_media_data_t *data;             /* data pointer shortcut */
+       TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush deferral list */
 };
 
 typedef struct hammer2_chain hammer2_chain_t;
@@ -128,17 +158,8 @@ typedef struct hammer2_chain hammer2_chain_t;
 int hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2);
 RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 
-/*
- * MOVED - This bit is set during the flush when the MODIFIED bit is cleared,
- *        indicating that the parent's blocktable must inherit a change to
- *        the bref (typically a block reallocation)
- *
- *        It must also be set in situations where a chain is not MODIFIED
- *        but whos bref has changed (typically due to fields other than
- *        a block reallocation).
- */
-#define HAMMER2_CHAIN_MODIFIED         0x00000001      /* active mods */
-#define HAMMER2_CHAIN_UNUSED0002       0x00000002
+#define HAMMER2_CHAIN_MODIFIED         0x00000001      /* dirty chain data */
+#define HAMMER2_CHAIN_ALLOCATED                0x00000002      /* kmalloc'd chain */
 #define HAMMER2_CHAIN_DIRTYBP          0x00000004      /* dirty on unlock */
 #define HAMMER2_CHAIN_SUBMODIFIED      0x00000008      /* 1+ subs modified */
 #define HAMMER2_CHAIN_DELETED          0x00000010      /* deleted chain */
@@ -146,10 +167,10 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_FLUSHED          0x00000040      /* flush on unlock */
 #define HAMMER2_CHAIN_MOVED            0x00000080      /* bref changed */
 #define HAMMER2_CHAIN_IOFLUSH          0x00000100      /* bawrite on put */
-#define HAMMER2_CHAIN_DEFERRED         0x00000200      /* on a deferral list*/
+#define HAMMER2_CHAIN_DEFERRED         0x00000200      /* on a deferral list */
 #define HAMMER2_CHAIN_DESTROYED                0x00000400      /* destroying inode */
-#define HAMMER2_CHAIN_MODIFIED_AUX     0x00000800      /* hmp->vchain only */
-#define HAMMER2_CHAIN_MODIFY_TID       0x00001000      /* mod updates field */
+#define HAMMER2_CHAIN_VOLUMESYNC       0x00000800      /* needs volume sync */
+#define HAMMER2_CHAIN_UNUSED1000       0x00001000
 #define HAMMER2_CHAIN_MOUNTED          0x00002000      /* PFS is mounted */
 #define HAMMER2_CHAIN_ONRBTREE         0x00004000      /* on parent RB tree */
 
@@ -165,10 +186,6 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
  *
  * NOTE: OPTDATA allows us to avoid instantiating buffers for INDIRECT
  *      blocks in the INITIAL-create state.
- *
- * NOTE: NO_MODIFY_TID tells the function to not set HAMMER2_CHAIN_MODIFY_TID
- *      when marking the chain modified (used when a sub-chain modification
- *      propagates upward).
  */
 #define HAMMER2_MODIFY_NOSUB           0x00000001      /* do not set SUBMOD */
 #define HAMMER2_MODIFY_OPTDATA         0x00000002      /* data can be NULL */
@@ -183,6 +200,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_RESOLVE_MASK           0x0F
 
 #define HAMMER2_RESOLVE_SHARED         0x10
+#define HAMMER2_RESOLVE_NOREF          0x20
 
 /*
  * Cluster different types of storage together for allocations
@@ -237,7 +255,7 @@ struct hammer2_inode {
        struct hammer2_pfsmount *pmp;           /* PFS mount */
        struct hammer2_inode    *pip;           /* parent inode */
        struct vnode            *vp;
-       hammer2_chain_t         *chain;
+       hammer2_chain_t         *chain;         /* NOTE: rehomed on rename */
        struct lockf            advlock;
        u_int                   flags;
        u_int                   refs;           /* +vpref, +flushref */
@@ -250,6 +268,22 @@ typedef struct hammer2_inode hammer2_inode_t;
 #define HAMMER2_INODE_RENAME_INPROG    0x0004
 
 /*
+ * A hammer2 transaction placeholder.
+ *
+ * This structure is required for all modifying operations, including
+ * flushes.  It holds the transaction id allocated for the modifying
+ * operation and is also used to interlock flushes and snapshots.
+ */
+struct hammer2_trans {
+       struct hammer2_mount    *hmp;
+       hammer2_tid_t           sync_tid;
+       uint8_t                 inodes_created;
+       uint8_t                 dummy[7];
+};
+
+typedef struct hammer2_trans hammer2_trans_t;
+
+/*
  * XXX
  */
 struct hammer2_freecache {
@@ -359,12 +393,12 @@ extern long hammer2_ioa_volu_write;
 #define hammer2_icrc32(buf, size)      iscsi_crc32((buf), (size))
 #define hammer2_icrc32c(buf, size, crc)        iscsi_crc32_ext((buf), (size), (crc))
 
-hammer2_chain_t *hammer2_inode_lock_ex(hammer2_inode_t *ip);
-hammer2_chain_t *hammer2_inode_lock_sh(hammer2_inode_t *ip);
-void hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_chain_t *chain);
-void hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_chain_t *chain);
+void hammer2_inode_lock_ex(hammer2_inode_t *ip);
+void hammer2_inode_lock_sh(hammer2_inode_t *ip);
+void hammer2_inode_unlock_ex(hammer2_inode_t *ip);
+void hammer2_inode_unlock_sh(hammer2_inode_t *ip);
 void hammer2_voldata_lock(hammer2_mount_t *hmp);
-void hammer2_voldata_unlock(hammer2_mount_t *hmp);
+void hammer2_voldata_unlock(hammer2_mount_t *hmp, int modify);
 ccms_state_t hammer2_inode_lock_temp_release(hammer2_inode_t *ip);
 ccms_state_t hammer2_inode_lock_upgrade(hammer2_inode_t *ip);
 void hammer2_inode_lock_restore(hammer2_inode_t *ip, ccms_state_t ostate);
@@ -398,30 +432,34 @@ void hammer2_inode_unlock_nlinks(hammer2_inode_t *ip);
 hammer2_inode_t *hammer2_inode_get(hammer2_mount_t *hmp,
                        hammer2_pfsmount_t *pmp, hammer2_inode_t *dip,
                        hammer2_chain_t *chain);
-void hammer2_inode_put(hammer2_inode_t *ip, hammer2_chain_t *passed_chain);
+void hammer2_inode_put(hammer2_inode_t *ip);
 void hammer2_inode_free(hammer2_inode_t *ip);
 void hammer2_inode_ref(hammer2_inode_t *ip);
 void hammer2_inode_drop(hammer2_inode_t *ip);
 int hammer2_inode_calc_alloc(hammer2_key_t filesize);
 
-int hammer2_inode_create(hammer2_inode_t *dip,
+hammer2_inode_t *hammer2_inode_create(hammer2_trans_t *trans,
+                       hammer2_inode_t *dip,
                        struct vattr *vap, struct ucred *cred,
                        const uint8_t *name, size_t name_len,
-                       hammer2_inode_t **nipp, hammer2_chain_t **nchainp);
-
-int hammer2_inode_duplicate(hammer2_inode_t *dip,
-                       hammer2_chain_t *ochain, hammer2_chain_t **nchainp);
-int hammer2_inode_connect(hammer2_inode_t *dip, hammer2_chain_t **chainp,
+                       int *errorp);
+hammer2_chain_t *hammer2_inode_duplicate(hammer2_trans_t *trans,
+                       hammer2_chain_t *ochain,
+                        hammer2_inode_t *dip, int *errorp);
+int hammer2_inode_connect(hammer2_trans_t *trans,
+                       hammer2_inode_t *ip,
+                       hammer2_inode_t *dip,
+                       hammer2_chain_t **chainp,
                        const uint8_t *name, size_t name_len);
-hammer2_inode_t *hammer2_inode_common_parent(hammer2_mount_t *hmp,
-                       hammer2_inode_t *fdip, hammer2_inode_t *tdip);
+hammer2_inode_t *hammer2_inode_common_parent(hammer2_inode_t *fdip,
+                       hammer2_inode_t *tdip);
 
-int hammer2_unlink_file(hammer2_inode_t *dip,
-                       const uint8_t *name, size_t name_len,
-                       int isdir, hammer2_chain_t *retain_chain);
-int hammer2_hardlink_consolidate(hammer2_inode_t *ip, hammer2_chain_t **chainp,
+int hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
+                       const uint8_t *name, size_t name_len, int isdir);
+int hammer2_hardlink_consolidate(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                       hammer2_chain_t **chainp,
                        hammer2_inode_t *tdip, int linkcnt);
-int hammer2_hardlink_deconsolidate(hammer2_inode_t *dip,
+int hammer2_hardlink_deconsolidate(hammer2_trans_t *trans, hammer2_inode_t *dip,
                        hammer2_chain_t **chainp, hammer2_chain_t **ochainp);
 int hammer2_hardlink_find(hammer2_inode_t *dip,
                        hammer2_chain_t **chainp, hammer2_chain_t **ochainp);
@@ -432,44 +470,53 @@ int hammer2_hardlink_find(hammer2_inode_t *dip,
 void hammer2_modify_volume(hammer2_mount_t *hmp);
 hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
                                hammer2_blockref_t *bref);
-void hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-void hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-void hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-int hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how);
-void hammer2_chain_moved(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-void hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                               int flags);
-void hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
-                               int nradix, int flags);
-void hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-void hammer2_chain_wait(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-hammer2_chain_t *hammer2_chain_find(hammer2_mount_t *hmp,
-                               hammer2_chain_t *parent, int index);
-hammer2_chain_t *hammer2_chain_get(hammer2_mount_t *hmp,
+void hammer2_chain_core_alloc(hammer2_chain_t *chain,
+                               hammer2_chain_core_t *core);
+void hammer2_chain_free(hammer2_chain_t *chain);
+void hammer2_chain_ref(hammer2_chain_t *chain);
+void hammer2_chain_drop(hammer2_chain_t *chain);
+int hammer2_chain_lock(hammer2_chain_t *chain, int how);
+void hammer2_chain_moved(hammer2_chain_t *chain);
+void hammer2_chain_modify(hammer2_trans_t *trans,
+                               hammer2_chain_t *chain, int flags);
+void hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                               struct buf *bp,
                                hammer2_chain_t *parent,
-                               int index, int flags);
-hammer2_chain_t *hammer2_chain_lookup(hammer2_mount_t *hmp,
-                               hammer2_chain_t **parentp,
+                               hammer2_chain_t **chainp,
+                               int nradix, int flags);
+void hammer2_chain_unlock(hammer2_chain_t *chain);
+void hammer2_chain_wait(hammer2_chain_t *chain);
+hammer2_chain_t *hammer2_chain_find(hammer2_chain_t *parent, int index);
+hammer2_chain_t *hammer2_chain_get(hammer2_chain_t *parent, int index,
+                               int flags);
+hammer2_chain_t *hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags);
+void hammer2_chain_lookup_done(hammer2_chain_t *parent);
+hammer2_chain_t *hammer2_chain_lookup(hammer2_chain_t **parentp,
                                hammer2_key_t key_beg, hammer2_key_t key_end,
                                int flags);
-hammer2_chain_t *hammer2_chain_next(hammer2_mount_t *hmp,
-                               hammer2_chain_t **parentp,
+hammer2_chain_t *hammer2_chain_next(hammer2_chain_t **parentp,
                                hammer2_chain_t *chain,
                                hammer2_key_t key_beg, hammer2_key_t key_end,
                                int flags);
-hammer2_chain_t *hammer2_chain_create(hammer2_mount_t *hmp,
+int hammer2_chain_create(hammer2_trans_t *trans,
                                hammer2_chain_t *parent,
-                               hammer2_chain_t *chain,
+                               hammer2_chain_t **chainp,
                                hammer2_key_t key, int keybits,
-                               int type, size_t bytes,
-                               int *errorp);
-void hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
-                               hammer2_chain_t *chain, int retain);
-void hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                               hammer2_tid_t modify_tid);
-void hammer2_chain_commit(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-void hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp,
+                               int type, size_t bytes);
+void hammer2_chain_duplicate(hammer2_trans_t *trans,
+                               hammer2_chain_t *parent, int i,
+                               hammer2_chain_t **chainp);
+void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
                                hammer2_chain_t *chain);
+void hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain);
+void hammer2_chain_commit(hammer2_trans_t *trans, hammer2_chain_t *chain);
+void hammer2_chain_parent_setsubmod(hammer2_chain_t *chain);
+
+/*
+ * hammer2_trans.c
+ */
+void hammer2_trans_init(hammer2_trans_t *trans, hammer2_mount_t *hmp);
+void hammer2_trans_done(hammer2_trans_t *trans);
 
 /*
  * hammer2_ioctl.c
index ab21d84..f861713 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006,2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2006,2012-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
index 490278a..1bff81b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
  * SUCH DAMAGE.
  */
 /*
- * This subsystem handles direct and indirect block searches, recursions,
- * creation, and deletion.  Chains of blockrefs are tracked and modifications
- * are flagged for propagation... eventually all the way back to the volume
- * header.  Any chain except the volume header can be flushed to disk at
- * any time... none of it matters until the volume header is dealt with
- * (which is not here, see hammer2_vfsops.c for the volume header disk
- * sequencing).
- *
- * Serialized flushes are not handled here, see hammer2_flush.c.  This module
- * can essentially work on the current version of data, which can be in memory
- * as well as on-disk due to the above.  However, we are responsible for
- * making a copy of the state when a modified chain is part of a flush
- * and we attempt to modify it again before the flush gets to it.  In that
- * situation we create an allocated copy of the state that the flush can
- * deal with.  If a chain undergoing deletion is part of a flush it is
- * marked DELETED and its bref index is kept intact for the flush, but the
- * chain is thereafter ignored by this module's because it is no longer
- * current.
+ * This subsystem implements most of the core support functions for
+ * the hammer2_chain and hammer2_chain_core structures.
+ *
+ * Chains represent the filesystem media topology in-memory.  Any given
+ * chain can represent an inode, indirect block, data, or other types
+ * of blocks.
+ *
+ * This module provides APIs for direct and indirect block searches,
+ * iterations, recursions, creation, deletion, replication, and snapshot
+ * views (used by the flush and snapshot code).
+ *
+ * Generally speaking any modification made to a chain must propagate all
+ * the way back to the volume header, issuing copy-on-write updates to the
+ * blockref tables all the way up.  Any chain except the volume header itself
+ * can be flushed to disk at any time, in any order.  None of it matters
+ * until we get to the point where we want to synchronize the volume header
+ * (see the flush code).
+ *
+ * The chain structure supports snapshot views in time, which are primarily
+ * used until the related data and meta-data is flushed to allow the
+ * filesystem to make snapshots without requiring it to first flush,
+ * and to allow the filesystem to flush and modify the filesystem concurrently
+ * with minimal or no stalls.
  */
 #include <sys/cdefs.h>
 #include <sys/param.h>
 static int hammer2_indirect_optimize;  /* XXX SYSCTL */
 
 static hammer2_chain_t *hammer2_chain_create_indirect(
-                       hammer2_mount_t *hmp, hammer2_chain_t *parent,
-                       hammer2_key_t key, int keybits,
-                       int *errorp);
+               hammer2_trans_t *trans, hammer2_chain_t *parent,
+               hammer2_key_t key, int keybits, int *errorp);
 
 /*
  * We use a red-black tree to guarantee safe lookups under shared locks.
+ *
+ * Chains can be overloaded onto the same index, creating a different
+ * view of a blockref table based on a transaction id.  The RBTREE
+ * deconflicts the view by sub-sorting on delete_tid.
+ *
+ * NOTE: Any 'current' chain which is not yet deleted will have a
+ *      delete_tid of HAMMER2_MAX_TID (0xFFF....FFFLLU).
  */
 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 
 int
 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
 {
-       return(chain2->index - chain1->index);
+       if (chain1->index < chain2->index)
+               return(-1);
+       if (chain1->index > chain2->index)
+               return(1);
+       if (chain1->delete_tid < chain2->delete_tid)
+               return(-1);
+       if (chain1->delete_tid > chain2->delete_tid)
+               return(1);
+       return(0);
 }
 
 /*
- * Recursively mark the parent chain elements so flushes can find
- * modified elements.  Stop when we hit a chain already flagged
- * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
- * in chain itself.
+ * Flag chain->parent SUBMODIFIED recursively up to the root.  The
+ * recursion can terminate when a parent is encountered with SUBMODIFIED
+ * already set.  The flag is NOT set on the passed-in chain.
+ *
+ * This can be confusing because even though chains are multi-homed,
+ * each chain has a specific idea of its parent (chain->parent) which
+ * is singly-homed.
  *
- * SUBMODIFIED is not set on the chain passed in.
+ * This flag is used by the flusher's downward recursion to detect
+ * modifications and can only be cleared bottom-up.
  *
- * The chain->cst.spin lock can be held to stabilize the chain->parent
- * pointer.  The first parent is stabilized by virtue of chain being
- * fully locked.
+ * The parent pointer is protected by all the modified children below it
+ * and cannot be changed until they have all been flushed.  However, setsubmod
+ * operations on new modifications can race flushes in progress, so we use
+ * the chain->core->cst.spin lock to handle collisions.
  */
 void
-hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_parent_setsubmod(hammer2_chain_t *chain)
 {
        hammer2_chain_t *parent;
+       hammer2_chain_core_t *core;
 
-       parent = chain->parent;
-       if (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
-               spin_lock(&parent->cst.spin);
-               for (;;) {
-                       atomic_set_int(&parent->flags,
-                                      HAMMER2_CHAIN_SUBMODIFIED);
-                       if ((chain = parent->parent) == NULL)
-                               break;
-                       spin_lock(&chain->cst.spin);    /* upward interlock */
-                       spin_unlock(&parent->cst.spin);
-                       parent = chain;
+       while ((parent = chain->parent) != NULL) {
+               core = parent->core;
+               spin_lock(&core->cst.spin);
+               if (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) {
+                       spin_unlock(&core->cst.spin);
+                       break;
                }
-               spin_unlock(&parent->cst.spin);
+               atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
+               spin_unlock(&core->cst.spin);
+               chain = parent;
        }
 }
 
 /*
  * Allocate a new disconnected chain element representing the specified
- * bref.  The chain element is locked exclusively and refs is set to 1.
- * Media data (data) and meta-structure (u) pointers are left NULL.
+ * bref.  chain->refs is set to 1 and the passed bref is copied to
+ * chain->bref.  chain->bytes is derived from the bref.
+ *
+ * chain->core is NOT allocated and the media data and bp pointers are left
+ * NULL.  The caller must call chain_core_alloc() to allocate or associate
+ * a core with the chain.
  *
- * This essentially allocates a system memory structure representing one
- * of the media structure types, including inodes.
+ * NOTE: Returns a referenced but unlocked (because there is no core) chain.
  */
 hammer2_chain_t *
 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
@@ -147,81 +172,100 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
                      bref->type);
        }
 
-       /*
-        * Only set bref_flush if the bref has a real media offset, otherwise
-        * the caller has to wait for the chain to be modified/block-allocated
-        * before a blockref can be synchronized with its (future) parent.
-        */
+       chain->hmp = hmp;
        chain->bref = *bref;
-       if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
-               chain->bref_flush = *bref;
        chain->index = -1;              /* not yet assigned */
-       chain->refs = 1;
        chain->bytes = bytes;
-       ccms_cst_init(&chain->cst, chain);
-       ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
+       chain->refs = 1;
+       chain->flags = HAMMER2_CHAIN_ALLOCATED;
+       chain->delete_tid = HAMMER2_MAX_TID;
 
        return (chain);
 }
 
 /*
- * Deallocate a chain (the step before freeing it).  Remove the chain from
- * its parent's tree.
+ * Associate an existing core with the chain or allocate a new core.
  *
- * Caller must hold the parent and the chain exclusively locked, and
- * chain->refs must be 0.
- *
- * This function unlocks, removes, and destroys chain, and will recursively
- * destroy any sub-chains under chain (whos refs must also be 0 at this
- * point).
+ * The core is not locked.  No additional refs on the chain are made.
+ */
+void
+hammer2_chain_core_alloc(hammer2_chain_t *chain, hammer2_chain_core_t *core)
+{
+       KKASSERT(chain->core == NULL);
+
+       if (core == NULL) {
+               core = kmalloc(sizeof(*core), chain->hmp->mchain,
+                              M_WAITOK | M_ZERO);
+               RB_INIT(&core->rbtree);
+               core->sharecnt = 1;
+               chain->core = core;
+               ccms_cst_init(&core->cst, chain);
+       } else {
+               atomic_add_int(&core->sharecnt, 1);
+               chain->core = core;
+       }
+}
+
+/*
+ * Deallocate a chain after the caller has transitioned its refs to 0
+ * and disassociated it from its parent.
  *
- * parent can be NULL.
+ * We must drop sharecnt on the core (if any) and handle its 1->0 transition
+ * too.
  */
 static void
-hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_dealloc(hammer2_chain_t *chain)
 {
-       hammer2_chain_t *parent;
-       hammer2_chain_t *child;
+       hammer2_chain_core_t *core;
 
-       KKASSERT(chain->refs == 0);
-       KKASSERT(chain->flushing == 0);
-       KKASSERT((chain->flags &
-                 (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);
+       /*
+        * Chain's flags are expected to be sane.
+        */
+       KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
+                                 HAMMER2_CHAIN_MODIFIED |
+                                 HAMMER2_CHAIN_ONRBTREE)) == 0);
+       KKASSERT(chain->duplink == NULL);
 
        /*
-        * If the sub-tree is not empty all the elements on it must have
-        * 0 refs and be deallocatable.
+        * Disconnect chain->core from chain and free core if it was the
+        * last core.  If any children are present in the core's rbtree
+        * they cannot have a pointer to our chain by definition because
+        * our chain's refs have dropped to 0.  If this is the last sharecnt
+        * on core, then core's rbtree must be empty by definition.
         */
-       while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
-               ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
-               hammer2_chain_dealloc(hmp, child);
+       if ((core = chain->core) != NULL) {
+               /*
+                * Other chains may reference the same core so the core's
+                * spinlock is needed to safely disconnect it.
+                */
+               spin_lock(&core->cst.spin);
+               chain->core = NULL;
+               if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
+                       spin_unlock(&core->cst.spin);
+                       KKASSERT(RB_EMPTY(&core->rbtree));
+                       KKASSERT(core->cst.count == 0);
+                       KKASSERT(core->cst.upgrade == 0);
+                       kfree(core, chain->hmp->mchain);
+               } else {
+                       spin_unlock(&core->cst.spin);
+               }
+               core = NULL;            /* safety */
        }
 
        /*
-        * If the DELETED flag is not set the chain must be removed from
-        * its parent's tree.
-        *
-        * WARNING! chain->cst.spin must be held when chain->parent is
-        *          modified, even though we own the full blown lock,
-        *          to deal with setsubmod and rename races.
-        */
-       if (chain->flags & HAMMER2_CHAIN_ONRBTREE) {
-               spin_lock(&chain->cst.spin);    /* shouldn't be needed */
-               parent = chain->parent;
-               RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
-               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
-               chain->parent = NULL;
-               spin_unlock(&chain->cst.spin);
-       }
-       hammer2_chain_free(hmp, chain);
+        * Finally free the structure and return for possible recursion.
+        */
+       hammer2_chain_free(chain);
 }
 
 /*
- * Free a disconnected chain element
+ * Free a disconnected chain element.
  */
 void
-hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_free(hammer2_chain_t *chain)
 {
+       hammer2_mount_t *hmp = chain->hmp;
+
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_VOLUME:
                chain->data = NULL;
@@ -237,70 +281,37 @@ hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                break;
        }
 
+       KKASSERT(chain->core == NULL);
        KKASSERT(chain->bp == NULL);
+       chain->hmp = NULL;
 
-       ccms_thread_unlock(&chain->cst);
-       KKASSERT(chain->cst.count == 0);
-       KKASSERT(chain->cst.upgrade == 0);
-
-       kfree(chain, hmp->mchain);
+       if (chain->flags & HAMMER2_CHAIN_ALLOCATED)
+               kfree(chain, hmp->mchain);
 }
 
 /*
  * Add a reference to a chain element, preventing its destruction.
- *
- * The parent chain must be locked shared or exclusive or otherwise be
- * stable and already have a reference.
  */
 void
-hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_ref(hammer2_chain_t *chain)
 {
-       u_int refs;
-
-       while (chain) {
-               refs = chain->refs;
-               KKASSERT(chain->refs >= 0);
-               cpu_ccfence();
-               if (refs == 0) {
-                       /*
-                        * 0 -> 1 transition must bump the refs on the parent
-                        * too.  The caller has stabilized the parent.
-                        */
-                       if (atomic_cmpset_int(&chain->refs, 0, 1)) {
-                               chain = chain->parent;
-                               KKASSERT(chain == NULL || chain->refs > 0);
-                       }
-                       /* retry or continue along the parent chain */
-               } else {
-                       /*
-                        * N -> N+1
-                        */
-                       if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
-                               break;
-                       /* retry */
-               }
-       }
+       atomic_add_int(&chain->refs, 1);
 }
 
 /*
- * Drop the callers reference to the chain element.  If the ref count
- * reaches zero we attempt to recursively drop the parent.
- *
- * MOVED and MODIFIED elements hold additional references so it should not
- * be possible for the count on a modified element to drop to 0.
- *
- * The chain element must NOT be locked by the caller on the 1->0 transition.
- *
- * The parent might or might not be locked by the caller.  If we are unable
- * to lock the parent on the 1->0 transition the destruction of the chain
- * will be deferred but we still recurse upward and drop the ref on the
- * parent (see the lastdrop() function)
+ * Drop the caller's reference to the chain.  When the ref count drops to
+ * zero this function will disassociate the chain from its parent and
+ * deallocate it, then recursively drop the parent using the implied ref
+ * from the chain's chain->parent.
+ *
+ * WARNING! Just because we are able to deallocate a chain doesn't mean
+ *         that chain->core->rbtree is empty.  There can still be a sharecnt
+ *         on chain->core and RBTREE entries that refer to different parents.
  */
-static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
-                                               hammer2_chain_t *chain);
+static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_chain_t *chain);
 
 void
-hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_drop(hammer2_chain_t *chain)
 {
        u_int refs;
 
@@ -308,110 +319,61 @@ hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                refs = chain->refs;
                cpu_ccfence();
                KKASSERT(refs > 0);
+
                if (refs == 1) {
-                       /*
-                        * (1) lastdrop successfully drops the chain to 0
-                        *     refs and may may not have destroyed it.
-                        *     lastdrop will return the parent so we can
-                        *     recursively drop the implied ref from the
-                        *     1->0 transition.
-                        *
-                        * (2) lastdrop fails to transition refs from 1 to 0
-                        *     and returns the same chain, we retry.
-                        */
-                       chain = hammer2_chain_lastdrop(hmp, chain);
+                       if (chain->parent) {
+                               chain = hammer2_chain_lastdrop(chain);
+                               /* recursively drop parent or retry same */
+                       } else if (atomic_cmpset_int(&chain->refs, 1, 0)) {
+                               hammer2_chain_dealloc(chain);
+                               chain = NULL;
+                               /* no parent to recurse on */
+                       } else {
+                               /* retry the same chain */
+                       }
                } else {
-                       if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
-                               /*
-                                * Succeeded, count did not reach zero so
-                                * cut out of the loop.
-                                */
+                       if (atomic_cmpset_int(&chain->refs, refs, refs - 1))
                                break;
-                       }
                        /* retry the same chain */
                }
        }
 }
 
 /*
- * Handle SMP races during the last drop.  We must obtain a lock on
- * chain->parent to stabilize the last pointer reference to chain
- * (if applicable).  This reference does not have a parallel ref count,
- * that is idle chains in the topology can have a ref count of 0.
+ * Safe handling of the 1->0 transition on chain when the chain has a
+ * parent.
  *
- * The 1->0 transition implies a ref on the parent.
+ * NOTE: A chain can only be removed from its parent core's RBTREE on
+ *      the 1->0 transition by definition.  No other code is allowed
+ *      to remove chain from its RBTREE, so no race is possible.
  */
 static
 hammer2_chain_t *
-hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_lastdrop(hammer2_chain_t *chain)
 {
        hammer2_chain_t *parent;
+       hammer2_chain_core_t *parent_core;
 
-       /*
-        * Stablize chain->parent with the chain cst's spinlock.
-        * (parent can be NULL here).
-        *
-        * cst.spin locks are allowed to be nested bottom-up (the reverse
-        * of the normal top-down for full-blown cst locks), so this also
-        * allows us to attempt to obtain the parent's cst lock non-blocking
-        * (which must acquire the parent's spinlock unconditionally) while
-        * we are still holding the chain's spinlock.
-        */
-       spin_lock(&chain->cst.spin);
        parent = chain->parent;
+       parent_core = parent->core;
+       KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
 
-       /*
-        * If chain->flushing is non-zero we cannot deallocate the chain
-        * here.  The flushing field will be serialized for the inline
-        * unlock made by the flusher itself and we don't care about races
-        * in any other situation because the thread lock on the parent
-        * will fail in other situations.
-        *
-        * If we have a non-NULL parent but cannot acquire its thread
-        * lock, we also cannot deallocate the chain.
-        */
-       if (chain->flushing ||
-           (parent && ccms_thread_lock_nonblock(&parent->cst,
-                                                CCMS_STATE_EXCLUSIVE))) {
-               if (atomic_cmpset_int(&chain->refs, 1, 0)) {
-                       spin_unlock(&chain->cst.spin);  /* success */
-                       return(parent);
-               } else {
-                       spin_unlock(&chain->cst.spin);  /* failure */
-                       return(chain);
+       spin_lock(&parent_core->cst.spin);
+       if (atomic_cmpset_int(&chain->refs, 1, 0)) {
+               RB_REMOVE(hammer2_chain_tree, &parent_core->rbtree, chain);
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
+               chain->parent = NULL;   /* NULL field, must drop implied ref */
+               spin_unlock(&parent_core->cst.spin);
+               if (chain->duplink) {
+                       hammer2_chain_drop(chain->duplink);
+                       chain->duplink = NULL;
                }
+               hammer2_chain_dealloc(chain);
+               chain = parent;         /* recursively drop parent */
+       } else {
+               spin_unlock(&parent_core->cst.spin);
        }
-       spin_unlock(&chain->cst.spin);
-
-       /*
-        * With the parent now held we control the last pointer reference
-        * to chain ONLY IF this is the 1->0 drop.  If we fail to transition
-        * from 1->0 we raced a refs change and must retry at chain.
-        */
-       if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
-               /* failure */
-               if (parent)
-                       ccms_thread_unlock(&parent->cst);
-               return(chain);
-       }
-
-       /*
-        * Ok, we succeeded.  We now own the implied ref on the parent
-        * associated with the 1->0 transition of the child.  It should not
-        * be possible for ANYTHING to access the child now, as we own the
-        * lock on the parent, so we should be able to safely lock the
-        * child and destroy it.
-        */
-       ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
-       hammer2_chain_dealloc(hmp, chain);
-
-       /*
-        * We want to return parent with its implied ref to the caller
-        * to recurse and drop the parent.
-        */
-       if (parent)
-               ccms_thread_unlock(&parent->cst);
-       return (parent);
+       return (chain);
 }
 
 /*
@@ -452,10 +414,16 @@ hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
  *      so as not to instantiate a device buffer, which could alias against
  *      a logical file buffer.  However, if ALWAYS is specified the
  *      device buffer will be instantiated anyway.
+ *
+ * WARNING! If data must be fetched a shared lock will temporarily be
+ *         upgraded to exclusive.  However, a deadlock can occur if
+ *         the caller owns more than one shared lock.
  */
 int
-hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
+hammer2_chain_lock(hammer2_chain_t *chain, int how)
 {
+       hammer2_mount_t *hmp;
+       hammer2_chain_core_t *core;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
        hammer2_off_t peof;
@@ -468,11 +436,19 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
        /*
         * Ref and lock the element.  Recursive locks are allowed.
         */
-       hammer2_chain_ref(hmp, chain);
+       if ((how & HAMMER2_RESOLVE_NOREF) == 0)
+               hammer2_chain_ref(chain);
+       hmp = chain->hmp;
+       KKASSERT(hmp != NULL);
+
+       /*
+        * Get the appropriate lock.
+        */
+       core = chain->core;
        if (how & HAMMER2_RESOLVE_SHARED)
-               ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
+               ccms_thread_lock(&core->cst, CCMS_STATE_SHARED);
        else
-               ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
+               ccms_thread_lock(&core->cst, CCMS_STATE_EXCLUSIVE);
 
        /*
         * If we already have a valid data pointer no further action is
@@ -504,9 +480,9 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
         * buffer cache.  If another thread got to it before us we
         * can just return.
         */
-       ostate = ccms_thread_lock_upgrade(&chain->cst);
+       ostate = ccms_thread_lock_upgrade(&core->cst);
        if (chain->data) {
-               ccms_thread_lock_restore(&chain->cst, ostate);
+               ccms_thread_lock_restore(&core->cst, ostate);
                return (0);
        }
 
@@ -553,7 +529,7 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
                        (intmax_t)pbase, error);
                bqrelse(chain->bp);
                chain->bp = NULL;
-               ccms_thread_lock_restore(&chain->cst, ostate);
+               ccms_thread_lock_restore(&core->cst, ostate);
                return (error);
        }
 
@@ -624,7 +600,7 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
         */
        if (chain->bp)
                BUF_KERNPROC(chain->bp);
-       ccms_thread_lock_restore(&chain->cst, ostate);
+       ccms_thread_lock_restore(&core->cst, ostate);
        return (0);
 }
 
@@ -635,18 +611,20 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
  * retired.
  */
 void
-hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_unlock(hammer2_chain_t *chain)
 {
+       hammer2_chain_core_t *core = chain->core;
        long *counterp;
 
        /*
-        * Release the CST lock but with a special 1->0 transition case.
+        * Release the CST lock, with special handling of the 1->0 transition.
+        * If lock refs remain we just decrement chain->refs and return.
         *
         * Returns non-zero if lock references remain.  When zero is
         * returned the last lock reference is retained and any shared
         * lock is upgraded to an exclusive lock for final disposition.
         */
-       if (ccms_thread_unlock_zero(&chain->cst)) {
+       if (ccms_thread_unlock_zero(&core->cst)) {
                KKASSERT(chain->refs > 1);
                atomic_add_int(&chain->refs, -1);
                return;
@@ -663,8 +641,8 @@ hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
         */
        if (chain->bp == NULL) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
-               ccms_thread_unlock(&chain->cst);
-               hammer2_chain_drop(hmp, chain);
+               ccms_thread_unlock(&core->cst);
+               hammer2_chain_drop(chain);
                return;
        }
 
@@ -760,35 +738,43 @@ hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                }
        }
        chain->bp = NULL;
-       ccms_thread_unlock(&chain->cst);
-       hammer2_chain_drop(hmp, chain);
+       ccms_thread_unlock(&core->cst);
+       hammer2_chain_drop(chain);
 }
 
 /*
- * Resize the chain's physical storage allocation.  Chains can be resized
- * smaller without reallocating the storage.  Resizing larger will reallocate
- * the storage.
+ * Resize the chain's physical storage allocation in-place.  This may
+ * replace the passed-in chain with a new chain.
  *
- * Must be passed a locked chain.
+ * Chains can be resized smaller without reallocating the storage.
+ * Resizing larger will reallocate the storage.
+ *
+ * Must be passed an exclusively locked parent and chain, returns a new
+ * exclusively locked chain at the same index and unlocks the old chain.
+ * Flushes the buffer if necessary.
  *
  * If you want the resize code to copy the data to the new block then the
  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
  *
  * If the caller already holds a logical buffer containing the data and
  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
- * operation will then not copy the data.
+ * operation will then not copy the (stale) data from the media.
  *
  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
  * to avoid instantiating a device buffer that conflicts with the vnode
  * data buffer.
  *
  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
+ * XXX return error if cannot resize.
  */
 void
-hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
+hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                    struct buf *bp,
+                    hammer2_chain_t *parent, hammer2_chain_t **chainp,
                     int nradix, int flags)
 {
-       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_mount_t *hmp = trans->hmp;
+       hammer2_chain_t *chain = *chainp;
        struct buf *nbp;
        hammer2_off_t pbase;
        size_t obytes;
@@ -815,8 +801,22 @@ hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
                return;
 
        /*
+        * Delete the old chain and duplicate it at the same (parent, index),
+        * returning a new chain.  This allows the old chain to still be
+        * used by the flush code.  Duplication occurs in-place.
+        *
+        * NOTE: If we are not crossing a synchronization point the
+        *       duplication code will simply reuse the existing chain
+        *       structure.
+        */
+       hammer2_chain_delete(trans, parent, chain);
+       hammer2_chain_duplicate(trans, parent, chain->index, &chain);
+
+       /*
         * Set MODIFIED and add a chain ref to prevent destruction.  Both
-        * modified flags share the same ref.
+        * modified flags share the same ref.  (Duplicated chains do not
+        * start out MODIFIED, except possibly when the duplication code
+        * decided to reuse the existing chain as-is.)
         *
         * If the chain is already marked MODIFIED then we can safely
         * return the previous allocation to the pool without having to
@@ -824,9 +824,8 @@ hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
-                                             HAMMER2_CHAIN_MODIFY_TID);
-               hammer2_chain_ref(hmp, chain);
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+               hammer2_chain_ref(chain);
        } else {
                hammer2_freemap_free(hmp, chain->bref.data_off,
                                     chain->bref.type);
@@ -871,6 +870,13 @@ hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
                }
                bdata = (char *)nbp->b_data + boff;
 
+               /*
+                * chain->bp and chain->data represent the on-disk version
+                * of the data, whereas the passed-in bp is usually a
+                * more up-to-date logical buffer.  However, there is no
+                * need to synchronize the more up-to-date data in (bp)
+                * as it will do that on its own when it flushes.
+                */
                if (nbytes < obytes) {
                        bcopy(chain->data, bdata, nbytes);
                } else {
@@ -892,7 +898,7 @@ hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
                brelse(chain->bp);
                chain->bp = nbp;
                chain->data = (void *)bdata;
-               hammer2_chain_modify(hmp, chain, 0);
+               hammer2_chain_modify(trans, chain, 0);
        }
 
        /*
@@ -900,10 +906,10 @@ hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
         * parent(s) so the adjustments are picked up by flush.
         */
        if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-               hammer2_chain_ref(hmp, chain);
+               hammer2_chain_ref(chain);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }
-       hammer2_chain_parent_setsubmod(hmp, chain);
+       hammer2_chain_parent_setsubmod(chain);
 }
 
 /*
@@ -912,28 +918,37 @@ hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
  * If not already marked modified a new physical block will be allocated
  * and assigned to the bref.
  *
+ * If already modified and the new modification crosses a synchronization
+ * point the chain is duplicated in order to allow the flush to synchronize
+ * the old chain.  The new chain replaces the old.
+ *
  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
  *                  level or the COW operation will not work.
  *
  * Data blocks    - The chain is usually locked RESOLVE_NEVER so as not to
  *                  run the data through the device buffers.
+ *
+ * This function may return a different chain than was passed, in which case
+ * the old chain will be unlocked and the new chain will be locked.
  */
 void
-hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
+hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
 {
+       hammer2_mount_t *hmp = trans->hmp;
+       hammer2_off_t pbase;
        struct buf *nbp;
        int error;
-       hammer2_off_t pbase;
        size_t bbytes;
        size_t boff;
        void *bdata;
 
        /*
-        * Tells flush that modify_tid must be updated, otherwise only
-        * mirror_tid is updated.  This is the default.
+        * modify_tid is only updated for primary modifications, not for
+        * propagated brefs.  mirror_tid will be updated regardless during
+        * the flush, so there is no need to set it here.
         */
        if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
+               chain->bref.modify_tid = trans->sync_tid;
 
        /*
         * If the chain is already marked MODIFIED we can just return.
@@ -958,7 +973,7 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
         * modified flags share the same ref.
         */
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
-       hammer2_chain_ref(hmp, chain);
+       hammer2_chain_ref(chain);
 
        /*
         * We must allocate the copy-on-write block.
@@ -1079,53 +1094,68 @@ skip1:
        }
 skip2:
        if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
-               hammer2_chain_parent_setsubmod(hmp, chain);
+               hammer2_chain_parent_setsubmod(chain);
 }
 
 /*
  * Mark the volume as having been modified.  This short-cut version
  * does not have to lock the volume's chain, which allows the ioctl
- * code to make adjustments to connections without deadlocking.
+ * code to make adjustments to connections without deadlocking.  XXX
+ *
+ * No ref is made on vchain when flagging it MODIFIED.
  */
 void
 hammer2_modify_volume(hammer2_mount_t *hmp)
 {
        hammer2_voldata_lock(hmp);
-       atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
-       hammer2_voldata_unlock(hmp);
+       hammer2_voldata_unlock(hmp, 1);
 }
 
 /*
  * Locate an in-memory chain.  The parent must be locked.  The in-memory
- * chain is returned or NULL if no in-memory chain is present.
+ * chain is returned with a reference and without a lock, or NULL
+ * if not found.
  *
  * NOTE: A chain on-media might exist for this index when NULL is returned.
+ *
+ * NOTE: Can only be used to locate chains which have not been deleted.
  */
 hammer2_chain_t *
-hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
+hammer2_chain_find(hammer2_chain_t *parent, int index)
 {
        hammer2_chain_t dummy;
        hammer2_chain_t *chain;
 
+       dummy.flags = 0;
        dummy.index = index;
-       chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
+       dummy.delete_tid = HAMMER2_MAX_TID;
+       spin_lock(&parent->core->cst.spin);
+       chain = RB_FIND(hammer2_chain_tree, &parent->core->rbtree, &dummy);
+       if (chain)
+               hammer2_chain_ref(chain);
+       spin_unlock(&parent->core->cst.spin);
+
        return (chain);
 }
 
 /*
  * Return a locked chain structure with all associated data acquired.
+ * (if LOOKUP_NOLOCK is requested the returned chain is only referenced).
  *
- * Caller must lock the parent on call, the returned child will be locked.
+ * Caller must hold the parent locked shared or exclusive since we may
+ * need the parent's bref array to find our block.
+ *
+ * The returned child is locked as requested.  If NOLOCK, the returned
+ * child is still at least referenced.
  */
 hammer2_chain_t *
-hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
-                 int index, int flags)
+hammer2_chain_get(hammer2_chain_t *parent, int index, int flags)
 {
        hammer2_blockref_t *bref;
+       hammer2_mount_t *hmp = parent->hmp;
        hammer2_chain_t *chain;
        hammer2_chain_t dummy;
        int how;
-       ccms_state_t ostate;
 
        /*
         * Figure out how to lock.  MAYBE can be used to optimized
@@ -1138,6 +1168,7 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
                how |= HAMMER2_RESOLVE_SHARED;
 
+retry:
        /*
         * First see if we have a (possibly modified) chain element cached
         * for this (parent, index).  Acquire the data if necessary.
@@ -1145,45 +1176,31 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * If chain->data is non-NULL the chain should already be marked
         * modified.
         */
+       dummy.flags = 0;
        dummy.index = index;
-       chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
+       dummy.delete_tid = HAMMER2_MAX_TID;
+       spin_lock(&parent->core->cst.spin);
+       chain = RB_FIND(hammer2_chain_tree, &parent->core->rbtree, &dummy);
        if (chain) {
-               if (flags & HAMMER2_LOOKUP_NOLOCK)
-                       hammer2_chain_ref(hmp, chain);
-               else
-                       hammer2_chain_lock(hmp, chain, how);
+               hammer2_chain_ref(chain);
+               spin_unlock(&parent->core->cst.spin);
+               if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
+                       hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF);
                return(chain);
        }
+       spin_unlock(&parent->core->cst.spin);
 
        /*
-        * Upgrade our thread lock and handle any race that may have
-        * occurred.  Leave the lock upgraded for the rest of the get.
-        * We have to do this because we will be modifying the chain
-        * structure.
-        */
-       ostate = ccms_thread_lock_upgrade(&parent->cst);
-       chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
-       if (chain) {
-               if (flags & HAMMER2_LOOKUP_NOLOCK)
-                       hammer2_chain_ref(hmp, chain);
-               else
-                       hammer2_chain_lock(hmp, chain, how);
-               ccms_thread_lock_restore(&parent->cst, ostate);
-               return(chain);
-       }
-
-       /*
-        * The get function must always succeed, panic if there's no
-        * data to index.
+        * The parent chain must not be in the INITIAL state.
         */
        if (parent->flags & HAMMER2_CHAIN_INITIAL) {
-               ccms_thread_lock_restore(&parent->cst, ostate);
                panic("hammer2_chain_get: Missing bref(1)");
                /* NOT REACHED */
        }
 
        /*
-        * Otherwise lookup the bref and issue I/O (switch on the parent)
+        * No RBTREE entry found, lookup the bref and issue I/O (switch on
+        * the parent's bref to determine where and how big the array is).
         */
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
@@ -1214,42 +1231,71 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 
        /*
         * Allocate a chain structure representing the existing media
-        * entry.
+        * entry.  Resulting chain has one ref and is not locked.
         *
         * The locking operation we do later will issue I/O to read it.
         */
        chain = hammer2_chain_alloc(hmp, bref);
+       hammer2_chain_core_alloc(chain, NULL);  /* ref'd chain returned */
 
        /*
-        * Link the chain into its parent.  Caller is expected to hold an
-        * exclusive lock on the parent.
+        * Link the chain into its parent.  A spinlock is required to safely
+        * access the RBTREE, and it is possible to collide with another
+        * hammer2_chain_get() operation because the caller might only hold
+        * a shared lock on the parent.
         */
+       KKASSERT(parent->refs > 0);
+       spin_lock(&parent->core->cst.spin);
        chain->parent = parent;
        chain->index = index;
-       if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
-               panic("hammer2_chain_link: collision");
+       if (RB_INSERT(hammer2_chain_tree, &parent->core->rbtree, chain)) {
+               chain->parent = NULL;
+               chain->index = -1;
+               spin_unlock(&parent->core->cst.spin);
+               hammer2_chain_drop(chain);
+               goto retry;
+       }
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
-       KKASSERT(parent->refs > 0);
-       atomic_add_int(&parent->refs, 1);       /* for red-black entry */
-       ccms_thread_lock_restore(&parent->cst, ostate);
+       hammer2_chain_ref(parent);              /* chain->parent ref */
+       spin_unlock(&parent->core->cst.spin);
 
        /*
-        * Our new chain structure has already been referenced and locked
-        * but the lock code handles the I/O so call it to resolve the data.
-        * Then release one of our two exclusive locks.
+        * Our new chain is referenced but NOT locked.  Lock the chain
+        * below.  The locking operation also resolves its data.
         *
         * If NOLOCK is set the release will release the one-and-only lock.
         */
        if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
-               hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
-               hammer2_chain_drop(hmp, chain);         /* excess ref */
+               hammer2_chain_lock(chain, how); /* recursive lock */
+               hammer2_chain_drop(chain);      /* excess ref */
        }
-       ccms_thread_unlock(&chain->cst);                        /* from alloc */
-
        return (chain);
 }
 
 /*
+ * Lookup initialization/completion API
+ */
+hammer2_chain_t *
+hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags)
+{
+       if (flags & HAMMER2_LOOKUP_SHARED) {
+               hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
+                                          HAMMER2_RESOLVE_SHARED);
+       } else {
+               hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
+       }
+       return (parent);
+}
+
+void
+hammer2_chain_lookup_done(hammer2_chain_t *parent)
+{
+       if (parent)
+               hammer2_chain_unlock(parent);
+}
+
+
+/*
  * Locate any key between key_beg and key_end inclusive.  (*parentp)
  * typically points to an inode but can also point to a related indirect
  * block and this function will recurse upwards and find the inode again.
@@ -1266,7 +1312,8 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
  * deletion.   The new (*parentp) will be locked and referenced and the old
  * will be unlocked and dereferenced (no change if they are both the same).
  *
- * The matching chain will be returned exclusively locked and referenced.
+ * The matching chain will be returned exclusively locked.  If NOLOCK is
+ * requested the chain will be returned only referenced.
  *
  * NULL is returned if no match was found, but (*parentp) will still
  * potentially be adjusted.
@@ -1276,10 +1323,11 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
  * can simply allow (*parentp) to float inside the loop.
  */
 hammer2_chain_t *
-hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
+hammer2_chain_lookup(hammer2_chain_t **parentp,
                     hammer2_key_t key_beg, hammer2_key_t key_end,
                     int flags)
 {
+       hammer2_mount_t *hmp;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_chain_t *tmp;
@@ -1302,6 +1350,8 @@ hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
         * encloses the key range or we hit the inode.
         */
        parent = *parentp;
+       hmp = parent->hmp;
+
        while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
               parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
                scan_beg = parent->bref.key;
@@ -1309,12 +1359,12 @@ hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
                           ((hammer2_key_t)1 << parent->bref.keybits) - 1;
                if (key_beg >= scan_beg && key_end <= scan_end)
                        break;
-               hammer2_chain_ref(hmp, parent);         /* ref old parent */
-               hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
+               hammer2_chain_ref(parent);              /* ref old parent */
+               hammer2_chain_unlock(parent);           /* unlock old parent */
                parent = parent->parent;
                                                        /* lock new parent */
-               hammer2_chain_lock(hmp, parent, how_maybe);
-               hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
+               hammer2_chain_lock(parent, how_maybe);
+               hammer2_chain_drop(*parentp);           /* drop old parent */
                *parentp = parent;                      /* new parent */
        }
 
@@ -1334,9 +1384,9 @@ again:
                 */
                if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
                        if (flags & HAMMER2_LOOKUP_NOLOCK)
-                               hammer2_chain_ref(hmp, parent);
+                               hammer2_chain_ref(parent);
                        else
-                               hammer2_chain_lock(hmp, parent, how_always);
+                               hammer2_chain_lock(parent, how_always);
                        return (parent);
                }
                base = &parent->data->ipdata.u.blockset.blockref[0];
@@ -1372,16 +1422,17 @@ again:
        /*
         * If the element and key overlap we use the element.
         *
-        * NOTE!  Deleted elements are effectively invisible.  A Deleted
-        *        elements covers (makes invisible) any original media
-        *        data.
+        * NOTE! Deleted elements are effectively invisible.  Deletions
+        *       proactively clear the parent bref to the deleted child
+        *       so we do not try to shadow here to avoid parent updates
+        *       (which would be difficult since multiple deleted elements
+        *       might represent different flush synchronization points).
         */
        bref = NULL;
        for (i = 0; i < count; ++i) {
-               tmp = hammer2_chain_find(hmp, parent, i);
+               tmp = hammer2_chain_find(parent, i);
                if (tmp) {
-                       if (tmp->flags & HAMMER2_CHAIN_DELETED)
-                               continue;
+                       KKASSERT((tmp->flags & HAMMER2_CHAIN_DELETED) == 0);
                        bref = &tmp->bref;
                        KKASSERT(bref->type != 0);
                } else if (base == NULL || base[i].type == 0) {
@@ -1391,21 +1442,28 @@ again:
                }
                scan_beg = bref->key;
                scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
+               if (tmp)
+                       hammer2_chain_drop(tmp);
                if (key_beg <= scan_end && key_end >= scan_beg)
                        break;
        }
        if (i == count) {
                if (key_beg == key_end)
                        return (NULL);
-               return (hammer2_chain_next(hmp, parentp, NULL,
+               return (hammer2_chain_next(parentp, NULL,
                                           key_beg, key_end, flags));
        }
 
        /*
         * Acquire the new chain element.  If the chain element is an
         * indirect block we must search recursively.
+        *
+        * It is possible for the tmp chain above to be removed from
+        * the RBTREE but the parent lock ensures it would not have been
+        * destroyed from the media, so the chain_get() code will simply
+        * reload it from the media in that case.
         */
-       chain = hammer2_chain_get(hmp, parent, i, flags);
+       chain = hammer2_chain_get(parent, i, flags);
        if (chain == NULL)
                return (NULL);
 
@@ -1413,25 +1471,30 @@ again:
         * If the chain element is an indirect block it becomes the new
         * parent and we loop on it.
         *
-        * The parent always has to be locked with at least RESOLVE_MAYBE,
-        * so it might need a fixup if the caller passed incompatible flags.
+        * The parent always has to be locked with at least RESOLVE_MAYBE
+        * so we can access its data.  It might need a fixup if the caller
+        * passed incompatible flags.  Be careful not to cause a deadlock
+        * as a data-load requires an exclusive lock.
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
            chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
-               hammer2_chain_unlock(hmp, parent);
+               hammer2_chain_unlock(parent);
                *parentp = parent = chain;
                if (flags & HAMMER2_LOOKUP_NOLOCK) {
-                       hammer2_chain_lock(hmp, chain, how_maybe);
-                       hammer2_chain_drop(hmp, chain); /* excess ref */
-               } else if (flags & HAMMER2_LOOKUP_NODATA) {
-                       hammer2_chain_lock(hmp, chain, how_maybe);
-                       hammer2_chain_unlock(hmp, chain);
+                       hammer2_chain_lock(chain, how_maybe);
+                       hammer2_chain_drop(chain);      /* excess ref */
+               } else if ((flags & HAMMER2_LOOKUP_NODATA) &&
+                          chain->data == NULL) {
+                       hammer2_chain_ref(chain);
+                       hammer2_chain_unlock(chain);
+                       hammer2_chain_lock(chain, how_maybe |
+                                                 HAMMER2_RESOLVE_NOREF);
                }
                goto again;
        }
 
        /*
-        * All done, return chain
+        * All done, return the chain
         */
        return (chain);
 }
@@ -1445,14 +1508,14 @@ again:
  * iteration at the next parent.
  *
  * parent must be locked on entry and remains locked throughout.  chain's
- * lock status must match flags.
+ * lock status must match flags.  Chain is always at least referenced.
  */
 hammer2_chain_t *
-hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
-                  hammer2_chain_t *chain,
+hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
                   hammer2_key_t key_beg, hammer2_key_t key_end,
                   int flags)
 {
+       hammer2_mount_t *hmp;
        hammer2_chain_t *parent;
        hammer2_chain_t *tmp;
        hammer2_blockref_t *base;
@@ -1467,6 +1530,7 @@ hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
                how_maybe |= HAMMER2_RESOLVE_SHARED;
 
        parent = *parentp;
+       hmp = parent->hmp;
 
 again:
        /*
@@ -1481,9 +1545,9 @@ again:
                 */
                i = chain->index + 1;
                if (flags & HAMMER2_LOOKUP_NOLOCK)
-                       hammer2_chain_drop(hmp, chain);
+                       hammer2_chain_drop(chain);
                else
-                       hammer2_chain_unlock(hmp, chain);
+                       hammer2_chain_unlock(chain);
 
                /*
                 * Any scan where the lookup returned degenerate data embedded
@@ -1513,11 +1577,11 @@ again:
 
                i = parent->index + 1;
                nparent = parent->parent;
-               hammer2_chain_ref(hmp, nparent);        /* ref new parent */
-               hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
-                                                       /* lock new parent */
-               hammer2_chain_lock(hmp, nparent, how_maybe);
-               hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
+               hammer2_chain_ref(nparent);     /* ref new parent */
+               hammer2_chain_unlock(parent);   /* unlock old parent */
+                                               /* lock new parent */
+               hammer2_chain_lock(nparent, how_maybe);
+               hammer2_chain_drop(nparent);    /* drop excess ref */
                *parentp = parent = nparent;
        }
 
@@ -1560,18 +1624,17 @@ again2:
         * match was requested we return NULL.  If a range was requested we
         * run hammer2_chain_next() to iterate.
         *
-        * NOTE!  Deleted elements are effectively invisible.  A Deleted
-        *        elements covers (makes invisible) any original media
-        *        data.
+        * NOTE! Deleted elements are effectively invisible.  Deletions
+        *       proactively clear the parent bref to the deleted child
+        *       so we do not try to shadow here to avoid parent updates
+        *       (which would be difficult since multiple deleted elements
+        *       might represent different flush synchronization points).
         */
        bref = NULL;
        while (i < count) {
-               tmp = hammer2_chain_find(hmp, parent, i);
+               tmp = hammer2_chain_find(parent, i);
                if (tmp) {
-                       if (tmp->flags & HAMMER2_CHAIN_DELETED) {
-                               ++i;
-                               continue;
-                       }
+                       KKASSERT((tmp->flags & HAMMER2_CHAIN_DELETED) == 0);
                        bref = &tmp->bref;
                } else if (base == NULL || base[i].type == 0) {
                        ++i;
@@ -1581,6 +1644,8 @@ again2:
                }
                scan_beg = bref->key;
                scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
+               if (tmp)
+                       hammer2_chain_drop(tmp);
                if (key_beg <= scan_end && key_end >= scan_beg)
                        break;
                ++i;
@@ -1597,7 +1662,7 @@ again2:
         * Acquire the new chain element.  If the chain element is an
         * indirect block we must search recursively.
         */
-       chain = hammer2_chain_get(hmp, parent, i, flags);
+       chain = hammer2_chain_get(parent, i, flags);
        if (chain == NULL)
                return (NULL);
 
@@ -1605,20 +1670,25 @@ again2:
         * If the chain element is an indirect block it becomes the new
         * parent and we loop on it.
         *
-        * The parent always has to be locked with at least RESOLVE_MAYBE,
-        * so it might need a fixup if the caller passed incompatible flags.
+        * The parent always has to be locked with at least RESOLVE_MAYBE
+        * so we can access its data.  It might need a fixup if the caller
+        * passed incompatible flags.  Be careful not to cause a deadlock
+        * as a data-load requires an exclusive lock.
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
            chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
-               hammer2_chain_unlock(hmp, parent);
+               hammer2_chain_unlock(parent);
                *parentp = parent = chain;
                chain = NULL;
                if (flags & HAMMER2_LOOKUP_NOLOCK) {
-                       hammer2_chain_lock(hmp, parent, how_maybe);
-                       hammer2_chain_drop(hmp, parent);        /* excess ref */
-               } else if (flags & HAMMER2_LOOKUP_NODATA) {
-                       hammer2_chain_lock(hmp, parent, how_maybe);
-                       hammer2_chain_unlock(hmp, parent);
+                       hammer2_chain_lock(parent, how_maybe);
+                       hammer2_chain_drop(parent);     /* excess ref */
+               } else if ((flags & HAMMER2_LOOKUP_NODATA) &&
+                          parent->data == NULL) {
+                       hammer2_chain_ref(parent);
+                       hammer2_chain_unlock(parent);
+                       hammer2_chain_lock(parent, how_maybe |
+                                                  HAMMER2_RESOLVE_NOREF);
                }
                i = 0;
                goto again2;
@@ -1639,6 +1709,11 @@ again2:
  * must be locked and held.  Do not pass the inode chain to this function
  * unless that is the chain returned by the failed lookup.
  *
+ * (chain) is either NULL, a newly allocated chain, or a chain allocated
+ * via hammer2_chain_duplicate().  When not NULL, the passed-in chain must
+ * NOT be attached to any parent, and will be attached by this function.
+ * This mechanic is used by the rename code.
+ *
  * Non-indirect types will automatically allocate indirect blocks as required
  * if the new item does not fit in the current (parent).
  *
@@ -1646,8 +1721,8 @@ again2:
  * (parent) into the new indirect type and then use one of the free slots
  * to emplace the new indirect type.
  *
- * A new locked, referenced chain element is returned of the specified type.
- * The element may or may not have a data area associated with it:
+ * A new locked chain element is returned of the specified type.  The
+ * element may or may not have a data area associated with it:
  *
  *     VOLUME          not allowed here
  *     INODE           kmalloc()'d data area is set up
@@ -1658,22 +1733,25 @@ again2:
  *
  * Requires an exclusively locked parent.
  */
-hammer2_chain_t *
-hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
-                    hammer2_chain_t *chain,
-                    hammer2_key_t key, int keybits, int type, size_t bytes,
-                    int *errorp)
+int
+hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t *parent,
+                    hammer2_chain_t **chainp,
+                    hammer2_key_t key, int keybits, int type, size_t bytes)
 {
+       hammer2_mount_t *hmp;
+       hammer2_chain_t *chain;
        hammer2_blockref_t dummy;
        hammer2_blockref_t *base;
        hammer2_chain_t dummy_chain;
        int unlock_parent = 0;
        int allocated = 0;
+       int error = 0;
        int count;
        int i;
 
-       KKASSERT(ccms_thread_lock_owned(&parent->cst));
-       *errorp = 0;
+       KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
+       hmp = parent->hmp;
+       chain = *chainp;
 
        if (chain == NULL) {
                /*
@@ -1687,6 +1765,8 @@ hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                dummy.data_off = hammer2_allocsize(bytes);
                dummy.methods = parent->bref.methods;
                chain = hammer2_chain_alloc(hmp, &dummy);
+               hammer2_chain_core_alloc(chain, NULL);
+               ccms_thread_lock(&chain->core->cst, CCMS_STATE_EXCLUSIVE);
                allocated = 1;
 
                /*
@@ -1772,23 +1852,31 @@ again:
         * Scan for an unallocated bref, also skipping any slots occupied
         * by in-memory chain elements that may not yet have been updated
         * in the parent's bref array.
+        *
+        * We don't have to hold the spinlock to save an empty slot as
+        * new slots can only transition from empty if the parent is
+        * locked exclusively.
         */
        bzero(&dummy_chain, sizeof(dummy_chain));
+       dummy_chain.delete_tid = HAMMER2_MAX_TID;
+
+       spin_lock(&parent->core->cst.spin);
        for (i = 0; i < count; ++i) {
                if (base == NULL) {
                        dummy_chain.index = i;
                        if (RB_FIND(hammer2_chain_tree,
-                                   &parent->rbhead, &dummy_chain) == NULL) {
+                                   &parent->core->rbtree, &dummy_chain) == NULL) {
                                break;
                        }
                } else if (base[i].type == 0) {
                        dummy_chain.index = i;
                        if (RB_FIND(hammer2_chain_tree,
-                                   &parent->rbhead, &dummy_chain) == NULL) {
+                                   &parent->core->rbtree, &dummy_chain) == NULL) {
                                break;
                        }
                }
        }
+       spin_unlock(&parent->core->cst.spin);
 
        /*
         * If no free blockref could be found we must create an indirect
@@ -1797,24 +1885,23 @@ again:
         * causing a deadlock.
         *
         * This may return the new indirect block or the old parent depending
-        * on where the key falls.  NULL is returned on error.  The most
-        * typical error is EAGAIN (flush conflict during chain move).
+        * on where the key falls.  NULL is returned on error.
         */
        if (i == count) {
                hammer2_chain_t *nparent;
 
-               nparent = hammer2_chain_create_indirect(hmp, parent,
+               nparent = hammer2_chain_create_indirect(trans, parent,
                                                        key, keybits,
-                                                       errorp);
+                                                       &error);
                if (nparent == NULL) {
                        if (allocated)
-                               hammer2_chain_free(hmp, chain);
+                               hammer2_chain_free(chain);
                        chain = NULL;
                        goto done;
                }
                if (parent != nparent) {
                        if (unlock_parent)
-                               hammer2_chain_unlock(hmp, parent);
+                               hammer2_chain_unlock(parent);
                        parent = nparent;
                        unlock_parent = 1;
                }
@@ -1829,14 +1916,17 @@ again:
        if (chain->parent != NULL)
                panic("hammer2: hammer2_chain_create: chain already connected");
        KKASSERT(chain->parent == NULL);
+       KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
+
        chain->parent = parent;
        chain->index = i;
-       if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
+       KKASSERT(parent->refs > 0);
+       spin_lock(&parent->core->cst.spin);
+       if (RB_INSERT(hammer2_chain_tree, &parent->core->rbtree, chain))
                panic("hammer2_chain_link: collision");
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
-       KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
-       KKASSERT(parent->refs > 0);
-       atomic_add_int(&parent->refs, 1);
+       hammer2_chain_ref(parent);              /* chain->parent ref */
+       spin_unlock(&parent->core->cst.spin);
 
        /*
         * (allocated) indicates that this is a newly-created chain element
@@ -1859,7 +1949,7 @@ again:
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_DATA:
                case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
-                       hammer2_chain_modify(hmp, chain,
+                       hammer2_chain_modify(trans, chain,
                                             HAMMER2_MODIFY_OPTDATA);
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
@@ -1868,11 +1958,11 @@ again:
                        /* not supported in this function */
                        panic("hammer2_chain_create: bad type");
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
-                       hammer2_chain_modify(hmp, chain,
+                       hammer2_chain_modify(trans, chain,
                                             HAMMER2_MODIFY_OPTDATA);
                        break;
                default:
-                       hammer2_chain_modify(hmp, chain, 0);
+                       hammer2_chain_modify(trans, chain, 0);
                        break;
                }
        } else {
@@ -1889,16 +1979,169 @@ again:
                 * reallocations.
                 */
                if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-                       hammer2_chain_ref(hmp, chain);
+                       hammer2_chain_ref(chain);
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
                }
-               hammer2_chain_parent_setsubmod(hmp, chain);
+               hammer2_chain_parent_setsubmod(chain);
        }
 
 done:
+       *chainp = chain;
        if (unlock_parent)
-               hammer2_chain_unlock(hmp, parent);
-       return (chain);
+               hammer2_chain_unlock(parent);
+       return (error);
+}
+
+/*
+ * Replace (*chainp) with a duplicate.  The original *chainp is unlocked
+ * and the replacement will be returned locked.  Both the original and the
+ * new chain will share the same RBTREE (have the same chain->core), with
+ * the new chain becoming the 'current' chain (meaning it is the first in
+ * the linked list at core->chain_first).
+ *
+ * If (parent, i) then the new duplicated chain is inserted under the parent
+ * at the specified index (the parent must not have a ref at that index).
+ *
+ * If (NULL, -1) then the new duplicated chain is not inserted anywhere,
+ * similar to if it had just been chain_alloc()'d (suitable for passing into
+ * hammer2_chain_create() after this function returns).
+ *
+ * NOTE! Duplication is used in order to retain the original topology to
+ *      support flush synchronization points.  Both the original and the
+ *      new chain will have the same transaction id and thus the operation
+ *      appears atomic on the media.
+ */
+void
+hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent,
+                       int i, hammer2_chain_t **chainp)
+{
+       hammer2_mount_t *hmp = trans->hmp;
+       hammer2_blockref_t *base;
+       hammer2_chain_t *chain;
+       size_t bytes;
+       int count;
+
+       /*
+        * First create a duplicate of the chain structure, associating
+        * it with the same core, making it the same size, pointing it
+        * to the same bref (the same media block), and copying any inline
+        * data.
+        */
+       KKASSERT(((*chainp)->flags & HAMMER2_CHAIN_INITIAL) == 0);
+       chain = hammer2_chain_alloc(hmp, &(*chainp)->bref);
+       hammer2_chain_core_alloc(chain, (*chainp)->core);
+
+       bytes = (hammer2_off_t)1 <<
+               (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
+       chain->bytes = bytes;
+
+       switch(chain->bref.type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+               panic("hammer2_chain_duplicate: cannot be called w/volhdr");
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               KKASSERT(bytes == HAMMER2_INODE_BYTES);
+               if ((*chainp)->data) {
+                       chain->data = kmalloc(sizeof(chain->data->ipdata),
+                                             hmp->minode, M_WAITOK | M_ZERO);
+                       chain->data->ipdata = (*chainp)->data->ipdata;
+               }
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+#if 0
+               panic("hammer2_chain_duplicate: cannot be used to"
+                     "create an indirect block");
+#endif
+               break;
+       case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
+       case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+               panic("hammer2_chain_duplicate: cannot be used to"
+                     "create a freemap root or node");
+               break;
+       case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
+       case HAMMER2_BREF_TYPE_DATA:
+       default:
+               /* leave chain->data NULL */
+               KKASSERT(chain->data == NULL);
+               break;
+       }
+
+       /*
+        * Both chains must be locked for us to be able to set the
+        * duplink.  To avoid buffer cache deadlocks we do not try
+        * to resolve the new chain until after we've unlocked the
+        * old one.
+        */
+       hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
+       KKASSERT((*chainp)->duplink == NULL);
+       (*chainp)->duplink = chain;     /* inherits excess ref from alloc */
+       hammer2_chain_unlock(*chainp);
+       *chainp = chain;
+       hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
+       hammer2_chain_unlock(chain);
+
+
+       /*
+        * If parent is not NULL, insert into the parent at the requested
+        * index.  The newly duplicated chain must be marked MOVED and
+        * SUBMODIFIED set in its parent(s).
+        */
+       if (parent) {
+               /*
+                * Locate a free blockref in the parent's array
+                */
+               KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
+               switch(parent->bref.type) {
+               case HAMMER2_BREF_TYPE_INODE:
+                       KKASSERT((parent->data->ipdata.op_flags &
+                                 HAMMER2_OPFLAG_DIRECTDATA) == 0);
+                       KKASSERT(parent->data != NULL);
+                       base = &parent->data->ipdata.u.blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+               case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
+               case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+                       if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                               base = NULL;
+                       } else {
+                               KKASSERT(parent->data != NULL);
+                               base = &parent->data->npdata.blockref[0];
+                       }
+                       count = parent->bytes / sizeof(hammer2_blockref_t);
+                       break;
+               case HAMMER2_BREF_TYPE_VOLUME:
+                       KKASSERT(parent->data != NULL);
+                       base = &hmp->voldata.sroot_blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               default:
+                       panic("hammer2_chain_create: unrecognized "
+                             "blockref type: %d",
+                             parent->bref.type);
+                       count = 0;
+                       break;
+               }
+               KKASSERT(i >= 0 && i < count);
+               KKASSERT(base == NULL || base[i].type == 0);
+
+               chain->parent = parent;
+               chain->index = i;
+               KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
+               KKASSERT(parent->refs > 0);
+               spin_lock(&parent->core->cst.spin);
+               if (RB_INSERT(hammer2_chain_tree, &parent->core->rbtree, chain))
+                       panic("hammer2_chain_link: collision");
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
+               hammer2_chain_ref(parent);      /* chain->parent ref */
+               spin_unlock(&parent->core->cst.spin);
+
+               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+                       hammer2_chain_ref(chain);
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
+               }
+               hammer2_chain_parent_setsubmod(chain);
+       }
 }
 
 /*
@@ -1908,8 +2151,6 @@ done:
  * and leaving the original parent lock/ref intact as well.
  *
  * If an error occurs, NULL is returned and *errorp is set to the error.
- * EAGAIN can be returned to indicate a flush collision which requires the
- * caller to retry.
  *
  * The returned chain depends on where the specified key falls.
  *
@@ -1946,10 +2187,11 @@ done:
  */
 static
 hammer2_chain_t *
-hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                              hammer2_key_t create_key, int create_bits,
                              int *errorp)
 {
+       hammer2_mount_t *hmp = trans->hmp;
        hammer2_blockref_t *base;
        hammer2_blockref_t *bref;
        hammer2_chain_t *chain;
@@ -1968,10 +2210,10 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * is known to be empty.  We need to calculate the array count
         * for RB lookups either way.
         */
-       KKASSERT(ccms_thread_lock_owned(&parent->cst));
+       KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
        *errorp = 0;
 
-       hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
+       hammer2_chain_modify(trans, parent, HAMMER2_MODIFY_OPTDATA);
        if (parent->flags & HAMMER2_CHAIN_INITIAL) {
                base = NULL;
 
@@ -2023,18 +2265,21 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * Scan for an unallocated bref, also skipping any slots occupied
         * by in-memory chain elements which may not yet have been updated
         * in the parent's bref array.
+        *
+        * Deleted elements are ignored.
         */
        bzero(&dummy, sizeof(dummy));
+       dummy.delete_tid = HAMMER2_MAX_TID;
+
+       spin_lock(&parent->core->cst.spin);
        for (i = 0; i < count; ++i) {
                int nkeybits;
 
                dummy.index = i;
-               chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
+               chain = RB_FIND(hammer2_chain_tree, &parent->core->rbtree,
+                               &dummy);
                if (chain) {
-                       /*
-                        * NOTE! CHAIN_DELETED elements have to be adjusted
-                        *       too, they cannot be ignored.
-                        */
+                       KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
                        bref = &chain->bref;
                } else if (base && base[i].type) {
                        bref = &base[i];
@@ -2082,6 +2327,8 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                else
                        ++locount;
        }
+       spin_unlock(&parent->core->cst.spin);
+       bref = NULL;    /* now invalid (safety) */
 
        /*
         * Adjust keybits to represent half of the full range calculated
@@ -2145,13 +2392,20 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        dummy.bref.keybits = keybits;
        dummy.bref.data_off = hammer2_allocsize(nbytes);
        dummy.bref.methods = parent->bref.methods;
+
        ichain = hammer2_chain_alloc(hmp, &dummy.bref);
        atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
+       hammer2_chain_core_alloc(ichain, NULL);
+       hammer2_chain_lock(ichain, HAMMER2_RESOLVE_MAYBE);
+       hammer2_chain_drop(ichain);     /* excess ref from alloc */
 
        /*
         * Iterate the original parent and move the matching brefs into
         * the new indirect block.
+        *
+        * XXX handle flushes.
         */
+       spin_lock(&parent->core->cst.spin);
        for (i = 0; i < count; ++i) {
                /*
                 * For keying purposes access the bref from the media or
@@ -2161,12 +2415,10 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 * has a key.
                 */
                dummy.index = i;
-               chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
+               chain = RB_FIND(hammer2_chain_tree, &parent->core->rbtree,
+                               &dummy);
                if (chain) {
-                       /*
-                        * NOTE! CHAIN_DELETED elements have to be adjusted
-                        *       too, they cannot be ignored.
-                        */
+                       KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
                        bref = &chain->bref;
                } else if (base && base[i].type) {
                        bref = &base[i];
@@ -2195,86 +2447,35 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 
                /*
                 * Load the new indirect block by acquiring or allocating
-                * the related chain entries, then simply move them to the
-                * new parent (ichain).  We cannot move chains which are
-                * undergoing flushing and will break out of the loop in
-                * that case.
-                *
-                * When adjusting the parent/child relationship we must
-                * set the MOVED bit but we do NOT update bref_flush
-                * because otherwise we might synchronize a bref that has
-                * not yet been flushed.  We depend on chain's bref_flush
-                * either being correct or the chain being in a MODIFIED
-                * state.
-                *
-                * We do not want to set MODIFIED here as this would result
-                * in unnecessary reallocations.
+                * the related chain entries, then move them to the new
+                * parent (ichain) by deleting them from their old location
+                * and inserting a duplicate of the chain and any modified
+                * sub-chain in the new location.
                 *
-                * We must still set SUBMODIFIED in the parent but we do
-                * that after the loop.
+                * We must set MOVED in the chain being duplicated and
+                * SUBMODIFIED in the parent(s) so the flush code knows
+                * what is going on.  The latter is done after the loop.
                 *
                 * WARNING! chain->cst.spin must be held when chain->parent is
                 *          modified, even though we own the full blown lock,
                 *          to deal with setsubmod and rename races.
+                *          (XXX remove this req).
                 */
-               chain = hammer2_chain_get(hmp, parent, i,
-                                         HAMMER2_LOOKUP_NODATA);
-               if (chain->flushing) {
-                       hammer2_chain_unlock(hmp, chain);
-                       break;
-               }
-
-               spin_lock(&chain->cst.spin);
-               RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
-               if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
-                       panic("hammer2_chain_create_indirect: collision");
-               chain->parent = ichain;
-               spin_unlock(&chain->cst.spin);
+               spin_unlock(&parent->core->cst.spin);
+               chain = hammer2_chain_get(parent, i, HAMMER2_LOOKUP_NODATA);
+               hammer2_chain_delete(trans, parent, chain);
+               hammer2_chain_duplicate(trans, ichain, i, &chain);
 
+#if 0
                if (base)
                        bzero(&base[i], sizeof(base[i]));
-               atomic_add_int(&parent->refs, -1);
-               atomic_add_int(&ichain->refs, 1);
-               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-                       hammer2_chain_ref(hmp, chain);
-                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-               }
-               hammer2_chain_unlock(hmp, chain);
+#endif
+               hammer2_chain_unlock(chain);
                KKASSERT(parent->refs > 0);
                chain = NULL;
+               spin_lock(&parent->core->cst.spin);
        }
-
-       /*
-        * If we hit a chain that is undergoing flushing we're screwed and
-        * we have to undo the whole mess.  Since ichain has not been linked
-        * in yet, the moved chains are not reachable and will not have been
-        * disposed of.
-        *
-        * WARNING! This code is pretty hairy because the flusher is sitting
-        *          on the parent processing one of the children that we
-        *          haven't yet moved, and will do a RB_NEXT loop on that
-        *          child.  So the children we're moving back have to be
-        *          returned to the same place in the iteration that they
-        *          were removed from.
-        */
-       if (i != count) {
-               kprintf("hammer2_chain_create_indirect: EAGAIN\n");
-               *errorp = EAGAIN;
-               while ((chain = RB_ROOT(&ichain->rbhead)) != NULL) {
-                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_NEVER);
-                       KKASSERT(chain->flushing == 0);
-                       RB_REMOVE(hammer2_chain_tree, &ichain->rbhead, chain);
-                       if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
-                               panic("hammer2_chain_create_indirect: collision");
-                       chain->parent = parent;
-                       atomic_add_int(&parent->refs, 1);
-                       atomic_add_int(&ichain->refs, -1);
-                       /* MOVED bit might have been inherited, cannot undo */
-                       hammer2_chain_unlock(hmp, chain);
-               }
-               hammer2_chain_free(hmp, ichain);
-               return(NULL);
-       }
+       spin_unlock(&parent->core->cst.spin);
 
        /*
         * Insert the new indirect block into the parent now that we've
@@ -2284,13 +2485,19 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * We don't have to set MOVED here because we mark ichain modified
         * down below (so the normal modified -> flush -> set-moved sequence
         * applies).
+        *
+        * The insertion shouldn't race as this is a completely new block
+        * and the parent is locked.
         */
        KKASSERT(ichain->index >= 0);
-       if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain))
+       KKASSERT((ichain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
+       spin_lock(&parent->core->cst.spin);
+       if (RB_INSERT(hammer2_chain_tree, &parent->core->rbtree, ichain))
                panic("hammer2_chain_create_indirect: ichain insertion");
        atomic_set_int(&ichain->flags, HAMMER2_CHAIN_ONRBTREE);
        ichain->parent = parent;
-       atomic_add_int(&parent->refs, 1);
+       hammer2_chain_ref(parent);      /* ichain->parent ref */
+       spin_unlock(&parent->core->cst.spin);
 
        /*
         * Mark the new indirect block modified after insertion, which
@@ -2304,8 +2511,8 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * our moved blocks, then call setsubmod() to set the bit
         * recursively.
         */
-       hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
-       hammer2_chain_parent_setsubmod(hmp, ichain);
+       hammer2_chain_modify(trans, ichain, HAMMER2_MODIFY_OPTDATA);
+       hammer2_chain_parent_setsubmod(ichain);
        atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
 
        /*
@@ -2316,14 +2523,14 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 * Key being created is way outside the key range,
                 * return the original parent.
                 */
-               hammer2_chain_unlock(hmp, ichain);
+               hammer2_chain_unlock(ichain);
        } else if (~(((hammer2_key_t)1 << keybits) - 1) &
                   (create_key ^ key)) {
                /*
                 * Key being created is outside the key range,
                 * return the original parent.
                 */
-               hammer2_chain_unlock(hmp, ichain);
+               hammer2_chain_unlock(ichain);
        } else {
                /*
                 * Otherwise its in the range, return the new parent.
@@ -2336,54 +2543,70 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 }
 
 /*
- * Physically delete the specified chain element.  Note that inodes with
- * open descriptors should not be deleted (as with other filesystems) until
- * the last open descriptor is closed.
- *
- * This routine will remove the chain element from its parent and potentially
- * also recurse upward and delete indirect blocks which become empty as a
- * side effect.
- *
- * The caller must pass a pointer to the chain's parent, also locked and
- * referenced.  (*parentp) will be modified in a manner similar to a lookup
- * or iteration when indirect blocks are also deleted as a side effect.
- *
- * Must be called with an exclusively locked parent and chain.  parent and
- * chain are both left locked on return.
- *
- * XXX This currently does not adhere to the MOVED flag protocol in that
- *     the removal is immediately indicated in the parent's blockref[]
- *     array.
+ * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and
+ * removes the parent's bref reference to chain, generating a modification
+ * on the parent.
+ *
+ * We do not attempt to defer adjustment of the parent bref to the chain
+ * as this could become quite complex with multiple deletions / replacements.
+ * Instead, a modification is generated in the parent which can cause it to
+ * be duplicated if the current parent's data is required for a flush in
+ * progress.
+ *
+ * NOTE: We can trivially adjust the parent if it is in the INITIAL state.
+ *
+ * NOTE: The flush code handles the actual removal of the chain from
+ *      the BTREE (also, depending on synchronization points, the
+ *      chain may still be relevant to the flush).
+ *
+ * NOTE: chain->delete_tid distinguishes deleted chains from live chains.
+ *      By setting it to something less than HAMMER2_MAX_TID, the
+ *      chain_lookup(), chain_next(), and chain_get() functions will
+ *      not have visibility.
+ *
+ * This function is NOT recursive.  Any entity already pushed into the
+ * chain (such as an inode) may still need visibility into its contents,
+ * as well as the ability to read and modify the contents.  For example,
+ * for an unlinked file which is still open.
  */
 void
-hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
-                    hammer2_chain_t *chain, int retain)
+hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
+                    hammer2_chain_t *chain)
 {
+       hammer2_mount_t *hmp = trans->hmp;
        hammer2_blockref_t *base;
        int count;
 
        if (chain->parent != parent)
                panic("hammer2_chain_delete: parent mismatch");
-       KKASSERT(ccms_thread_lock_owned(&parent->cst));
+       KKASSERT(ccms_thread_lock_owned(&parent->core->cst));
+
+       /*
+        * Nothing to do if already marked.
+        */
+       if (chain->flags & HAMMER2_CHAIN_DELETED)
+               return;
 
        /*
         * Mark the parent modified so our base[] pointer remains valid
         * while we move entries.  For the optimized indirect block
         * case mark the parent moved instead.
         *
-        * Calculate the blockref reference in the parent
+        * Calculate the blockref reference in the parent and zero it out.
         */
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
-               hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
+               hammer2_chain_modify(trans, parent,
+                                    HAMMER2_MODIFY_NO_MODIFY_TID);
                base = &parent->data->ipdata.u.blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-               hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
-                                                 HAMMER2_MODIFY_NO_MODIFY_TID);
+               hammer2_chain_modify(trans, parent,
+                                    HAMMER2_MODIFY_OPTDATA |
+                                    HAMMER2_MODIFY_NO_MODIFY_TID);
                if (parent->flags & HAMMER2_CHAIN_INITIAL)
                        base = NULL;
                else
@@ -2391,89 +2614,45 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
-               hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
+               hammer2_chain_modify(trans, parent,
+                                    HAMMER2_MODIFY_NO_MODIFY_TID);
                base = &hmp->voldata.sroot_blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        default:
                panic("hammer2_chain_delete: unrecognized blockref type: %d",
                      parent->bref.type);
-               count = 0;
-               break;
+               base = NULL;    /* NOT REACHED */
+               count = 0;      /* NOT REACHED */
+               break;          /* NOT REACHED */
        }
        KKASSERT(chain->index >= 0 && chain->index < count);
 
        /*
-        * We may not be able to immediately disconnect the chain if a
-        * flush is in progress.  If retain is non-zero we MUST disconnect
-        * the chain now and callers are responsible for making sure that
-        * flushing is zero.
+        * Clean out the blockref immediately.
         */
-       spin_lock(&chain->cst.spin);
-       if ((retain || chain->flushing == 0) &&
-           (chain->flags & HAMMER2_CHAIN_ONRBTREE)) {
-               if (base)
-                       bzero(&base[chain->index], sizeof(*base));
-               KKASSERT(chain->flushing == 0);
-               RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
-               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
-               atomic_add_int(&parent->refs, -1);   /* for red-black entry */
-               chain->index = -1;
-               chain->parent = NULL;
-       }
-       spin_unlock(&chain->cst.spin);
+       if (base)
+               bzero(&base[chain->index], sizeof(*base));
 
        /*
-        * Cumulative adjustments must be propagated to the parent inode
-        * when deleting and synchronized to ip.  This occurs even if we
-        * cannot detach the chain from its parent.
+        * Must set MOVED along with DELETED for the flush code to recognize
+        * the operation and properly disconnect the chain in-memory.
         *
-        * NOTE:  We do not propagate ip->delta_*count to the parent because
-        *        these represent adjustments that have not yet been
-        *        propagated upward, so we don't need to remove them from
-        *        the parent.
-        *
-        * Clear the pointer to the parent inode.
+        * The setting of DELETED causes finds, lookups, and _next iterations
+        * to no longer recognize the chain.  RB_SCAN()s will still have
+        * visibility (needed for flush serialization points).
         */
-       if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
-           chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
-               /* XXX */
-       }
-
-       /*
-        * If retain is 0 the deletion is permanent.  Because the chain is
-        * no longer connected to the topology a flush will have no
-        * visibility into it.  We must dispose of the references related
-        * to the MODIFIED and MOVED flags, otherwise the ref count will
-        * never transition to 0.
-        *
-        * If retain is non-zero the deleted element is likely an inode
-        * which the vnops frontend will mark DESTROYED and flush.  In that
-        * situation we must retain the flags for any open file descriptors
-        * on the (removed) inode.  The final close will destroy the
-        * disconnected chain.
-        */
-       if (retain == 0) {
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
-               if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
-                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
-                       hammer2_chain_drop(hmp, chain);
-               }
-               if (chain->flags & HAMMER2_CHAIN_MOVED) {
-                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-                       hammer2_chain_drop(hmp, chain);
-               }
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
+       if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+               hammer2_chain_ref(chain);
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }
-
-       /*
-        * The chain is still likely referenced, possibly even by a vnode
-        * (if an inode), so defer further action until the chain gets
-        * dropped.
-        */
+       chain->delete_tid = trans->sync_tid;
+       hammer2_chain_parent_setsubmod(chain);
 }
 
 void
-hammer2_chain_wait(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_wait(hammer2_chain_t *chain)
 {
        tsleep(chain, 0, "chnflw", 1);
 }
index 487355b..ea750a1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
  * processes.
  */
 struct hammer2_flush_info {
-       struct flush_deferral_list flush_list;
+       hammer2_mount_t *hmp;
+       hammer2_chain_t *parent;
+       hammer2_trans_t *trans;
        int             depth;
-       hammer2_tid_t   modify_tid;
+       int             diddeferral;
+       struct flush_deferral_list flush_list;
+       hammer2_tid_t   sync_tid;       /* flush synchronization point */
+       hammer2_tid_t   mirror_tid;     /* collect mirror TID updates */
 };
 
 typedef struct hammer2_flush_info hammer2_flush_info_t;
 
-static void hammer2_chain_flush_pass1(hammer2_mount_t *hmp,
-                       hammer2_chain_t *chain, hammer2_flush_info_t *info);
-static void hammer2_saved_child_cleanup(hammer2_mount_t *hmp,
-                       hammer2_chain_t *parent, hammer2_chain_t *child);
+static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
+                               hammer2_chain_t *chain);
+static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
+static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
+
+/*
+ * Transaction support functions for writing to the filesystem.
+ *
+ * Initializing a new transaction allocates a transaction ID.  We
+ * don't bother marking the volume header MODIFIED.  Instead, the volume
+ * header will be updated only if the operation actually makes modifications
+ * (which then propagate to the root).
+ *
+ * WARNING! Modifications to the root volume cannot dup the root volume
+ *         header to handle synchronization points, so alloc_tid can
+ *         wind up (harmlessly) more advanced on flush.
+ */
+void
+hammer2_trans_init(hammer2_trans_t *trans, hammer2_mount_t *hmp)
+{
+       bzero(trans, sizeof(*trans));
+       trans->hmp = hmp;
+       hammer2_voldata_lock(hmp);
+       trans->sync_tid = hmp->voldata.alloc_tid++;
+       hammer2_voldata_unlock(hmp, 0); /* don't immediately mark modified */
+}
+
+void
+hammer2_trans_done(hammer2_trans_t *trans)
+{
+       trans->hmp = NULL;
+}
 
 /*
- * Stand-alone flush.  If the chain is unable to completely flush we have
- * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
- * clear the MOVED bit after flushing in this situation or our desynchronized
- * bref will not properly update in the parent.
+ * Flush the chain and all modified sub-chains through the specified
+ * synchronization point (sync_tid), propagating parent chain modifications
+ * and mirror_tid updates back up as needed.  Since we are recursing downward
+ * we do not have to deal with the complexities of multi-homed chains (chains
+ * with multiple parents).
+ *
+ * Caller must have interlocked against any non-flush-related modifying
+ * operations in progress whose modify_tid values are less than or equal
+ * to the passed sync_tid.
+ *
+ * Caller must have already vetted synchronization points to ensure they
+ * are properly flushed.  Only snapshots and cluster flushes can create
+ * these sorts of synchronization points.
+ *
+ * SUBMODIFIED is not cleared if modified elements with higher modify_tid
+ * values (thus not flushed) are still present after the flush.
+ *
+ * If a chain is unable to completely flush we have to be sure that
+ * SUBMODIFIED remains set up the parent chain, and that MOVED is not
+ * cleared or our desynchronized bref will not properly update in the
+ * parent.  The parent's indirect block is copied-on-write and adjusted
+ * as needed so it no longer needs to be placemarked by the subchains,
+ * allowing the sub-chains to be cleaned out.
  *
  * This routine can be called from several places but the most important
  * is from the hammer2_vop_reclaim() function.  We want to try to completely
  * clean out the inode structure to prevent disconnected inodes from
- * building up and blowing out the kmalloc pool.
+ * building up and blowing out the kmalloc pool.  However, it is not actually
+ * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
+ * capability.
  *
- * If modify_tid is 0 (usual case), a new modify_tid is allocated and
- * applied to the flush.  The depth-limit handling code is the only
- * code which passes a non-zero modify_tid to hammer2_chain_flush().
- *
- * chain is locked on call and will remain locked on return.
+ * chain is locked on call and will remain locked on return.  If a flush
+ * occurred, the chain's MOVED bit will be set indicating that its parent
+ * (which is not part of the flush) should be updated.
  */
 void
-hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                   hammer2_tid_t modify_tid)
+hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
 {
-       hammer2_chain_t *parent;
        hammer2_chain_t *scan;
-       hammer2_blockref_t *base;
        hammer2_flush_info_t info;
-       int count;
-       int reflush;
 
        /*
         * Execute the recursive flush and handle deferrals.
@@ -100,30 +147,18 @@ hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flush_list);
+       info.hmp = trans->hmp;
+       info.trans = trans;
+       info.sync_tid = trans->sync_tid;
+       info.mirror_tid = 0;
 
-       if (modify_tid == 0) {
-               hammer2_voldata_lock(hmp);
-               info.modify_tid = hmp->voldata.alloc_tid++;
-               atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
-               hammer2_voldata_unlock(hmp);
-       } else {
-               info.modify_tid = modify_tid;
-       }
-       reflush = 1;
-
-       while (reflush) {
+       for (;;) {
                /*
-                * Primary recursion
+                * Unwind deep recursions which had been deferred.  This
+                * can leave MOVED set for these chains, which will be
+                * handled when we [re]flush chain after the unwind.
                 */
-               hammer2_chain_flush_pass1(hmp, chain, &info);
-               reflush = 0;
-
                while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
-                       /*
-                        * Secondary recursion.  Note that a reference is
-                        * retained from the element's presence on the
-                        * deferral list.
-                        */
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flush_list, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
@@ -134,134 +169,63 @@ hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("defered flush %p\n", scan);
-                       hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
-                       hammer2_chain_flush(hmp, scan, info.modify_tid);
-                       hammer2_chain_unlock(hmp, scan);
-
-                       /*
-                        * Only flag a reflush if SUBMODIFIED is no longer
-                        * set.  If SUBMODIFIED is set the element will just
-                        * wind up on our flush_list again.
-                        */
-                       if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
-                                           HAMMER2_CHAIN_MODIFIED |
-                                           HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
-                               reflush = 1;
-                       }
-                       hammer2_chain_drop(hmp, scan);
+                       hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_flush(trans, scan);
+                       hammer2_chain_unlock(scan);
+                       hammer2_chain_drop(scan);       /* ref from deferral */
                }
-               if ((hammer2_debug & 0x0040) && reflush)
-                       kprintf("reflush %p\n", chain);
-       }
 
-       /*
-        * The SUBMODIFIED bit must propagate upward if the chain could not
-        * be completely flushed.
-        */
-       if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
-                           HAMMER2_CHAIN_MODIFIED |
-                           HAMMER2_CHAIN_MODIFIED_AUX |
-                           HAMMER2_CHAIN_MOVED)) {
-               hammer2_chain_parent_setsubmod(hmp, chain);
-       }
-
-       /*
-        * If the only thing left is a simple bref update try to
-        * pro-actively update the parent, otherwise return early.
-        */
-       parent = chain->parent;
-       if (parent == NULL) {
-               return;
-       }
-       if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
-           (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
-                            HAMMER2_CHAIN_MODIFIED |
-                            HAMMER2_CHAIN_MODIFIED_AUX |
-                            HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
-               return;
-       }
-
-       /*
-        * We are locking backwards so allow the lock to fail.
-        */
-       if (ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE))
-               return;
-
-       /*
-        * We are updating brefs but we have to call chain_modify()
-        * because our caller is not being run from a recursive flush.
-        *
-        * This will also chain up the parent list and set the SUBMODIFIED
-        * flag.
-        *
-        * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
-        * modification is only related to updating a bref in the parent.
-        *
-        * When updating the blockset embedded in the volume header we must
-        * also update voldata.mirror_tid.
-        */
-       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
-       hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
+               /*
+                * Flush pass1 on root.  SUBMODIFIED can remain set after
+                * this call for numerous reasons, including write failures,
+                * but most likely due to only a partial flush being
+                * requested.
+                */
+               info.diddeferral = 0;
+               hammer2_chain_flush_core(&info, chain);
 
-       switch(parent->bref.type) {
-       case HAMMER2_BREF_TYPE_INODE:
-               base = &parent->data->ipdata.u.blockset.
-                       blockref[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       case HAMMER2_BREF_TYPE_INDIRECT:
-               base = &parent->data->npdata.blockref[0];
-               count = parent->bytes /
-                       sizeof(hammer2_blockref_t);
-               break;
-       case HAMMER2_BREF_TYPE_VOLUME:
-               base = &hmp->voldata.sroot_blockset.blockref[0];
-               count = HAMMER2_SET_COUNT;
-               if (chain->flags & HAMMER2_CHAIN_MOVED) {
-                       if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
-                               hmp->voldata.mirror_tid =
-                                       chain->bref.mirror_tid;
-                       }
-               }
-               break;
-       default:
-               base = NULL;
-               panic("hammer2_chain_flush: "
-                     "unrecognized blockref type: %d",
-                     parent->bref.type);
+               /*
+                * Only loop if deep recursions have been deferred.
+                */
+               if (TAILQ_EMPTY(&info.flush_list))
+                       break;
        }
 
        /*
-        * Update the blockref in the parent.  We do not have to set
-        * MOVED in the parent because the parent has been marked modified,
-        * so the flush sequence will pick up the bref change.
-        *
-        * We do have to propagate mirror_tid upward.
+        * SUBMODIFIED can be temporarily cleared and then re-set, which
+        * can prevent concurrent setsubmods from reaching all the way to
+        * the root.  If after the flush we find the node is still in need
+        * of flushing (though possibly due to modifications made outside
+        * the requested synchronization zone), we must call setsubmod again
+        * to cover the race.
         */
-       KKASSERT(chain->index >= 0 &&
-                chain->index < count);
-       KKASSERT(chain->parent == parent);
-       if (chain->flags & HAMMER2_CHAIN_MOVED) {
-               base[chain->index] = chain->bref_flush;
-               if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
-                       parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
-               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-               hammer2_chain_drop(hmp, chain);
-       } else if (bcmp(&base[chain->index], &chain->bref_flush,
-                  sizeof(chain->bref)) != 0) {
-               panic("hammer2: unflagged bref update(2)");
+       if (chain->flags & (HAMMER2_CHAIN_MOVED |
+                           HAMMER2_CHAIN_DELETED |
+                           HAMMER2_CHAIN_MODIFIED |
+                           HAMMER2_CHAIN_SUBMODIFIED)) {
+               hammer2_chain_parent_setsubmod(chain);
        }
-       ccms_thread_unlock(&parent->cst);               /* release manual op */
-       hammer2_chain_unlock(hmp, parent);
 }
 
 /*
- * chain is locked by the caller and remains locked on return.
+ * (chain) is locked by the caller and remains locked on return.
+ * This function is keyed off of SUBMODIFIED but must make fine-grained
+ * choices based on the synchronization point we are flushing to.
+ *
+ * If the flush accomplished any work chain will be flagged MOVED
+ * indicating a copy-on-write propagation back up is required.
+ * Deep sub-nodes may also have been entered onto the deferral list.
+ * MOVED is never set on the volume root.
+ *
+ * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
+ *      only when a chain is specifically modified, and not updated
+ *      for copy-on-write propagations.  MODIFIED is set on any modification
+ *      including copy-on-write propagations.
  */
 static void
-hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                         hammer2_flush_info_t *info)
+hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
 {
+       hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
        size_t bbytes;
@@ -270,33 +234,9 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
        struct buf *bp;
        int error;
        int wasmodified;
+       int diddeferral = 0;
 
-       /*
-        * If we hit the stack recursion depth limit defer the operation.
-        * The controller of the info structure will execute the deferral
-        * list and then retry.
-        *
-        * This is only applicable if SUBMODIFIED is set.  After a reflush
-        * SUBMODIFIED will probably be cleared and we want to drop through
-        * to finish processing the current element so our direct parent
-        * can process the results.
-        */
-       if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
-           (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
-               if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
-                       hammer2_chain_ref(hmp, chain);
-                       TAILQ_INSERT_TAIL(&info->flush_list,
-                                         chain, flush_node);
-                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
-               }
-               return;
-       }
-
-       if (hammer2_debug & 0x0008)
-               kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
-                       info->depth, info->depth, "",
-                       chain->bref.type, chain->bref.data_off,
-                       chain, chain->refs, chain->flags);
+       hmp = info->hmp;
 
        /*
         * If SUBMODIFIED is set we recurse the flush and adjust the
@@ -306,13 +246,13 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
         *       finishing in the face of filesystem activity.
         */
        if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
-               hammer2_chain_t *child;
-               hammer2_chain_t *saved;
-               hammer2_blockref_t *base;
-               int count;
+               hammer2_chain_t *saved_parent;
 
                /*
-                * Clear SUBMODIFIED to catch races.  Note that if any
+                * Clear SUBMODIFIED to catch races.
+                *
+                * Scan2 re-sets it when a child is still flagged MOVED,
+                * DELETED, MODIFIED, or SUBMODIFIED afterwards.  That is, if a
                 * child has to be flushed SUBMODIFIED will wind up being
                 * set again (for next time), but this does not stop us from
                 * synchronizing block updates which occurred.
@@ -322,290 +262,183 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                 * We need an extra ref on chain because we are going to
                 * release its lock temporarily in our child loop.
                 */
-               /* XXX SUBMODIFIED not interlocked, can race */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
-               hammer2_chain_ref(hmp, chain);
+               hammer2_chain_ref(chain);
 
                /*
-                * Flush the children and update the blockrefs in the chain.
-                * Be careful of ripouts during the loop.
+                * Run two passes.  The first pass handles MODIFIED and
+                * SUBMODIFIED chains and recurses while the second pass
+                * handles MOVED chains on the way back up.
+                *
+                * If the stack gets too deep we defer scan1, but must
+                * be sure to still run scan2 if on the next loop the
+                * deferred chain has been flushed and now needs MOVED
+                * handling on the way back up.
+                *
+                * Scan1 is recursive.
+                *
+                * NOTE: The act of handling a modified/submodified chain can
+                *       cause the MOVED flag to be set.  It can also be set
+                *       via hammer2_chain_delete() and in other situations.
                 *
-                * The flushing counter prevents ripouts on lastdrop and
-                * also prevents moves (causes renames to sleep/retry).
-                * Be very careful with it.
+                * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
+                *       because children can be physically removed during
+                *       the scan.
                 */
-               RB_FOREACH(child, hammer2_chain_tree, &chain->rbhead) {
-                       KASSERT(child->parent == chain,
-                               ("hammer2_flush: child->parent mismatch %p/%p",
-                                child->parent, chain));
-
-                       /*
-                        * We only recurse if SUBMODIFIED (internal node)
-                        * or MODIFIED (internal node or leaf) is set.
-                        * However, we must still track whether any MOVED
-                        * entries are present to determine if the chain's
-                        * blockref's need updating or not.
-                        */
-                       if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
-                                            HAMMER2_CHAIN_MODIFIED |
-                                           HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
-                               continue;
-                       }
-
-                       /*
-                        * flushing can only be adjusted while its parent
-                        * is locked, and prevent the destruction/removal
-                        * of the child from the parent's B-Tree.  This allows
-                        * us to temporarily unlock the parent.
-                        *
-                        * To unwind, we must hold the parent locked before
-                        * decrementing flushing to prevent child corruption
-                        * during our loop.
-                        */
-                       atomic_add_int(&child->flushing, 1);
-                       hammer2_chain_unlock(hmp, chain);
-                       hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
-                       KASSERT(child->parent == chain,
-                               ("hammer2_flush: child->parent mismatch %p/%p",
-                                child->parent, chain));
-                       if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
-                                            HAMMER2_CHAIN_MODIFIED |
-                                           HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
-                               hammer2_chain_unlock(hmp, child);
-                               hammer2_chain_lock(hmp, chain,
-                                                  HAMMER2_RESOLVE_ALWAYS);
-                               KKASSERT(child->parent == chain);
-                               atomic_add_int(&child->flushing, -1);
-                               continue;
-                       }
-
-                       /*
-                        * Propagate the DESTROYED flag if found set, then
-                        * recurse the flush.
-                        */
-                       if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
-                           (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
-                               atomic_set_int(&child->flags,
-                                              HAMMER2_CHAIN_DESTROYED |
-                                              HAMMER2_CHAIN_SUBMODIFIED);
+               saved_parent = info->parent;
+               info->parent = chain;
+
+               if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
+                       if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
+                               hammer2_chain_ref(chain);
+                               TAILQ_INSERT_TAIL(&info->flush_list,
+                                                 chain, flush_node);
+                               atomic_set_int(&chain->flags,
+                                              HAMMER2_CHAIN_DEFERRED);
                        }
-                       ++info->depth;
-                       hammer2_chain_flush_pass1(hmp, child, info);
-                       --info->depth;
-                       hammer2_chain_unlock(hmp, child);
-
-                       /*
-                        * Always resolve when relocking the parent.
-                        */
-                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
-                       KASSERT(child->parent == chain,
-                               ("hammer2_flush: child->parent mismatch %p/%p",
-                                child->parent, chain));
-                       atomic_add_int(&child->flushing, -1);
+                       diddeferral = 1;
+               } else {
+                       info->diddeferral = 0;
+                       spin_lock(&chain->core->cst.spin);
+                       RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
+                               NULL, hammer2_chain_flush_scan1, info);
+                       spin_unlock(&chain->core->cst.spin);
+                       diddeferral += info->diddeferral;
                }
 
                /*
-                * Now synchronize any block updates and handle any
-                * chains marked DELETED.
+                * Handle successfully flushed children who are in the MOVED
+                * state on the way back up the recursion.  This can have
+                * the side-effect of clearing MOVED.
                 *
-                * The flushing counter prevents ripouts on lastdrop and
-                * also prevents moves (causes renames to sleep/retry).
-                * Be very careful with it.
+                * We execute this even if there were deferrals to try to
+                * keep the chain topology cleaner.
+                *
+                * Scan2 is non-recursive.
                 */
-               saved = NULL;
-               RB_FOREACH(child, hammer2_chain_tree, &chain->rbhead) {
-                       if ((child->flags & (HAMMER2_CHAIN_MOVED |
-                                            HAMMER2_CHAIN_DELETED)) == 0) {
-                               continue;
-                       }
-                       atomic_add_int(&child->flushing, 1);
-                       if (saved) {
-                               hammer2_saved_child_cleanup(hmp, chain, saved);
-                               saved = NULL;
-                       }
-                       saved = child;
-                       hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
-                       KKASSERT(child->parent == chain);
-                       if ((child->flags & (HAMMER2_CHAIN_MOVED |
-                                            HAMMER2_CHAIN_DELETED)) == 0) {
-                               hammer2_chain_unlock(hmp, child);
-                               continue;
-                       }
-                       if (child->flags & HAMMER2_CHAIN_MOVED) {
-                               hammer2_chain_modify(hmp, chain,
-                                            HAMMER2_MODIFY_NO_MODIFY_TID);
-                       }
-
-                       switch(chain->bref.type) {
-                       case HAMMER2_BREF_TYPE_INODE:
-                               KKASSERT((chain->data->ipdata.op_flags &
-                                         HAMMER2_OPFLAG_DIRECTDATA) == 0);
-                               base = &chain->data->ipdata.u.blockset.
-                                       blockref[0];
-                               count = HAMMER2_SET_COUNT;
-                               break;
-                       case HAMMER2_BREF_TYPE_INDIRECT:
-                               if (chain->data) {
-                                       base = &chain->data->npdata.blockref[0];
-                               } else {
-                                       base = NULL;
-                                       KKASSERT(child->flags &
-                                                HAMMER2_CHAIN_DELETED);
-                               }
-                               count = chain->bytes /
-                                       sizeof(hammer2_blockref_t);
-                               break;
-                       case HAMMER2_BREF_TYPE_VOLUME:
-                               base = &hmp->voldata.sroot_blockset.blockref[0];
-                               count = HAMMER2_SET_COUNT;
-                               break;
-                       default:
-                               base = NULL;
-                               panic("hammer2_chain_get: "
-                                     "unrecognized blockref type: %d",
-                                     chain->bref.type);
-                       }
+               spin_lock(&chain->core->cst.spin);
+               RB_SCAN(hammer2_chain_tree, &chain->core->rbtree,
+                       NULL, hammer2_chain_flush_scan2, info);
+               spin_unlock(&chain->core->cst.spin);
+               info->parent = saved_parent;
+               hammer2_chain_drop(chain);
+       }
 
-                       KKASSERT(child->index >= 0);
+       /*
+        * Rollup diddeferral for caller.  Note direct assignment, not +=.
+        */
+       info->diddeferral = diddeferral;
 
-                       if (chain->bref.mirror_tid <
-                           child->bref_flush.mirror_tid) {
-                               chain->bref.mirror_tid =
-                                       child->bref_flush.mirror_tid;
-                       }
-                       if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
-                           hmp->voldata.mirror_tid <
-                           child->bref_flush.mirror_tid) {
-                               hmp->voldata.mirror_tid =
-                                       child->bref_flush.mirror_tid;
-                       }
-                       if (child->flags & HAMMER2_CHAIN_DELETED) {
-                               bzero(&child->bref_flush,
-                                     sizeof(child->bref_flush));
-                       }
-                       if (base)
-                               base[child->index] = child->bref_flush;
-                       if (child->flags & HAMMER2_CHAIN_MOVED) {
-                               atomic_clear_int(&child->flags,
-                                                HAMMER2_CHAIN_MOVED);
-                               hammer2_chain_drop(hmp, child); /* flag */
-                       }
-                       hammer2_chain_unlock(hmp, child);
-               }
-               if (saved) {
-                       hammer2_saved_child_cleanup(hmp, chain, saved);
-                       saved = NULL;
+       /*
+        * Do not flush chain if there were any deferrals.  It will be
+        * retried later after the deferrals are independently handled.
+        */
+       if (diddeferral) {
+               if (hammer2_debug & 0x0008) {
+                       kprintf("%*.*s} %p/%d %04x (deferred)",
+                               info->depth, info->depth, "",
+                               chain, chain->refs, chain->flags);
                }
-               hammer2_chain_drop(hmp, chain);
+               return;
        }
 
        /*
-        * If destroying the object we unconditonally clear the MODIFIED
-        * and MOVED bits, and we destroy the buffer without writing it
-        * out.
+        * Chain objects flagged for complete destruction recurse down from
+        * their inode.  The inode will have already been removed from
+        * its parent.  We have no need to disconnect the children from
+        * their parents or the inode in this situation (it would just
+        * waste time and storage with copy-on-write operations), so
+        * we can clear both the MODIFIED bit and the MOVED bit.
         *
-        * We don't bother updating the hash/crc or the chain bref.
+        * However, delete_tid must be within the synchronization zone
+        * for us to act on this bit.  Open-but-deleted files have to
+        * be managed by the cluster such that they are not subjected to
+        * reclamation.
         *
-        * NOTE: The destroy'd object's bref has already been updated.
-        *       so we can clear MOVED without propagating mirror_tid
-        *       or modify_tid upward.
-        *
-        * XXX allocations for unflushed data can be returned to the
-        *     free pool.
+        * DESTROYED chains stop processing here.
         */
-       if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
+       if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
+           (chain->delete_tid <= info->sync_tid)) {
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
-                       if (chain->bp) {
+                       if (chain->bp)
                                chain->bp->b_flags |= B_INVAL|B_RELBUF;
-                       }
-                       atomic_clear_int(&chain->flags,
-                                        HAMMER2_CHAIN_MODIFIED |
-                                        HAMMER2_CHAIN_MODIFY_TID);
-                       hammer2_chain_drop(hmp, chain);
-               }
-               if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
-                       atomic_clear_int(&chain->flags,
-                                        HAMMER2_CHAIN_MODIFIED_AUX);
+                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+                       hammer2_chain_drop(chain);
                }
                if (chain->flags & HAMMER2_CHAIN_MOVED) {
-                       atomic_clear_int(&chain->flags,
-                                        HAMMER2_CHAIN_MOVED);
-                       hammer2_chain_drop(hmp, chain);
+                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
+                       hammer2_chain_drop(chain);
+               }
+               if (hammer2_debug & 0x0008) {
+                       kprintf("%*.*s} %p/%d %04x (destroyed)",
+                               info->depth, info->depth, "",
+                               chain, chain->refs, chain->flags);
                }
                return;
        }
 
        /*
-        * Flush this chain entry only if it is marked modified.
-        */
-       if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
-                            HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
-               goto done;
-       }
-
-#if 0
-       /*
-        * Synchronize cumulative data and inode count adjustments to
-        * the inode and propagate the deltas upward to the parent.
+        * If MODIFIED is not set or modify_tid is > sync_tid we have
+        * nothing to do.
+        *
+        * Note that MOVED can be set without MODIFIED being set due to
+        * a deletion, in which case it is handled by Scan2 later on.
         *
-        * XXX removed atm
+        * Both bits can be set along with DELETED if the chain's data
+        * was modified within the synchronization zone and the chain
+        * was then deleted beyond the zone, in which case we still have
+        * to flush for synchronization point consistency.
         */
-       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
-               hammer2_inode_t *ip;
-
-               ip = chain->u.ip;
-               ip->ip_data.inode_count += ip->delta_icount;
-               ip->ip_data.data_count += ip->delta_dcount;
-               if (ip->pip) {
-                       ip->pip->delta_icount += ip->delta_icount;
-                       ip->pip->delta_dcount += ip->delta_dcount;
+       if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
+               return;
+       if (chain->bref.modify_tid > info->sync_tid) {
+               if (hammer2_debug & 0x0008) {
+                       kprintf("%*.*s} %p/%d %04x (skip - beyond sync_tid)",
+                               info->depth, info->depth, "",
+                               chain, chain->refs, chain->flags);
                }
-               ip->delta_icount = 0;
-               ip->delta_dcount = 0;
+               return;
        }
-#endif
 
        /*
-        * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
-        * used by the volume header (&hmp->vchain).
+        * Issue flush.
+        *
+        * A DESTROYED node that reaches this point must be flushed for
+        * synchronization point consistency.
         */
-       if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
-                            HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
-               goto done;
-       }
-       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
 
        /*
-        * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
-        * will re-test the MOVED bit.  We must also update the mirror_tid
-        * and modify_tid fields as appropriate.
+        * Update mirror_tid, clear MODIFIED, and set MOVED.
         *
-        * bits own a single chain ref and the MOVED bit owns its own
-        * chain ref.
+        * The caller will update the parent's reference to this chain
+        * by testing MOVED as long as the modification was in-bounds.
+        *
+        * MOVED is never set on the volume root as there is no parent
+        * to adjust.
         */
-       chain->bref.mirror_tid = info->modify_tid;
-       if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
-               chain->bref.modify_tid = info->modify_tid;
+       if (chain->bref.mirror_tid < info->sync_tid)
+               chain->bref.mirror_tid = info->sync_tid;
        wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
-       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
-                                       HAMMER2_CHAIN_MODIFY_TID);
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+       if (chain == &hmp->vchain)
+               kprintf("(FLUSHED VOLUME HEADER)\n");
 
-       if (chain->flags & HAMMER2_CHAIN_MOVED) {
+       if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
+           chain == &hmp->vchain) {
                /*
                 * Drop the ref from the MODIFIED bit we cleared.
                 */
                if (wasmodified)
-                       hammer2_chain_drop(hmp, chain);
+                       hammer2_chain_drop(chain);
        } else {
                /*
                 * If we were MODIFIED we inherit the ref from clearing
                 * that bit, otherwise we need another ref.
                 */
                if (wasmodified == 0)
-                       hammer2_chain_ref(hmp, chain);
+                       hammer2_chain_ref(chain);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }
-       chain->bref_flush = chain->bref;
 
        /*
         * If this is part of a recursive flush we can go ahead and write
@@ -640,6 +473,7 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                                 HAMMER2_VOLUME_ICRCVH_OFF,
                                HAMMER2_VOLUME_ICRCVH_SIZE);
                hmp->volsync = hmp->voldata;
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                /*
@@ -657,7 +491,6 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                if (bp) {
                        if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
                            (B_CACHE | B_DIRTY)) {
-                               kprintf("x");
                                cluster_awrite(bp);
                        } else {
                                bp->b_flags |= B_RELBUF;
@@ -675,7 +508,7 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                 * Only write the buffer out if it is dirty, it is possible
                 * the operating system had already written out the buffer.
                 */
-               hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
+               hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                KKASSERT(chain->bp != NULL);
 
                bp = chain->bp;
@@ -687,7 +520,7 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                }
                chain->bp = NULL;
                chain->data = NULL;
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
                break;
        default:
                /*
@@ -743,34 +576,271 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                                hammer2_icrc32(chain->data, chain->bytes);
                }
        }
-done:
        if (hammer2_debug & 0x0008) {
-               kprintf("%*.*s} %p/%d %04x ",
+               kprintf("%*.*s} %p/%d %04x (flushed)",
                        info->depth, info->depth, "",
                        chain, chain->refs, chain->flags);
        }
 }
 
-#if 0
 /*
- * PASS2 - not yet implemented (should be called only with the root chain?)
+ * Flush helper scan1 (recursive)
+ *
+ * Flushes the children of the caller's chain (parent) and updates
+ * the blockref.
+ *
+ * Ripouts during the loop should not cause any problems.  Because we are
+ * flushing to a synchronization point, modification races will occur after
+ * sync_tid and do not have to be flushed anyway.
  */
-static void
-hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+static int
+hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
 {
+       hammer2_flush_info_t *info = data;
+       hammer2_chain_t *parent = info->parent;
+       /*hammer2_mount_t *hmp = info->hmp;*/
+       int diddeferral;
+
+       /*
+        * We should only need to recurse if SUBMODIFIED is set, but as
+        * a safety also recurse if MODIFIED is set.  Return early
+        * if neither bit is set.
+        */
+       if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                            HAMMER2_CHAIN_MODIFIED)) == 0) {
+               return (0);
+       }
+       spin_unlock(&parent->core->cst.spin);
+
+       /*
+        * The spinlock held by the RB_SCAN caller was released above.  The
+        * caller also holds a ref on the parent so it can be unlocked while
+        * we lock the child.  Re-check the flags before continuing.
+        */
+       hammer2_chain_unlock(parent);
+       hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
+
+       if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                            HAMMER2_CHAIN_MODIFIED)) == 0) {
+               hammer2_chain_unlock(child);
+               hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
+               spin_lock(&parent->core->cst.spin);
+               return (0);
+       }
+
+       /*
+        * Propagate the DESTROYED flag if found set as well as deal with
+        * delete_tid.  This also causes SUBMODIFIED to be propagated
+        * downward to keep the recursion going.
+        *
+        * In the case of delete_tid, nothing need be done.  Destruction
+        * occurs after any deletions and destruction of internal chains
+        * where delete_tid may be 0 (since we don't bother to copy-on-write
+        * the propagation of a deletion) will pass the conditional just
+        * fine.
+        *
+        * This optimization allows the inode reclaim (destroy unlinked file
+        * on vnode reclamation after last close) to be flagged by just
+        * setting HAMMER2_CHAIN_DESTROYED at the top level.
+        */
+       if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
+           (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
+               atomic_set_int(&child->flags,
+                              HAMMER2_CHAIN_DESTROYED |
+                              HAMMER2_CHAIN_SUBMODIFIED);
+       }
+
+       /*
+        * Recurse.  The callee overwrites info->diddeferral; re-add ours.
+        */
+       diddeferral = info->diddeferral;
+       ++info->depth;
+       hammer2_chain_flush_core(info, child);
+       --info->depth;
+       info->diddeferral += diddeferral;
+
+       hammer2_chain_unlock(child);
+
+       /*
+        * Always resolve when relocking the parent meta-data so Scan2
+        * has the indirect block data in-hand to handle the MOVED bit.
+        */
+       hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
+
+       spin_lock(&parent->core->cst.spin);
+       return (0);
 }
-#endif
 
-static
-void
-hammer2_saved_child_cleanup(hammer2_mount_t *hmp,
-                           hammer2_chain_t *parent, hammer2_chain_t *child)
+/*
+ * Flush helper scan2 (non-recursive)
+ *
+ * This pass on a chain's children propagates any MOVED or DELETED
+ * elements back up the chain towards the root.  The bref's modify_tid
+ * must be within the synchronization zone for MOVED to be recognized
+ * and delete_tid must be within the synchronization zone for DELETED
+ * to be recognized.
+ *
+ * We must re-set SUBMODIFIED if appropriate.
+ */
+static int
+hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
 {
-       atomic_add_int(&child->flushing, -1);
-       if (child->flushing == 0 && (child->flags & HAMMER2_CHAIN_DELETED)) {
-               kprintf("hammer2: fixup deferred deleted child\n");
-               hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
-               hammer2_chain_delete(hmp, parent, child, 0);
-               hammer2_chain_unlock(hmp, child);
+       enum { HC_NONE, HC_DELETE, HC_UPDATE } action = HC_NONE;
+       hammer2_flush_info_t *info = data;
+       hammer2_chain_t *parent = info->parent;
+       hammer2_mount_t *hmp = info->hmp;
+       hammer2_blockref_t *base;
+       int count;
+
+       /*
+        * Check update conditions prior to locking child.
+        * We may not be able to safely test the 64-bit TIDs
+        * but we can certainly test the flags.
+        */
+       if ((child->flags & (HAMMER2_CHAIN_DELETED |
+                            HAMMER2_CHAIN_MOVED)) == 0) {
+               goto finalize;
        }
+       spin_unlock(&parent->core->cst.spin);
+
+       /*
+        * The MOVED bit implies an additional reference which prevents
+        * the child from being destroyed out from under our operation
+        * so we can lock the child safely without worrying about it
+        * getting ripped up (?).
+        */
+       hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
+
+       /*
+        * Full condition check.  We can only update and clear MOVED
+        * if the child is deleted or updated within our synchronization
+        * zone.
+        */
+       if ((child->flags & HAMMER2_CHAIN_DELETED) &&
+           child->delete_tid <= info->sync_tid) {
+               action = HC_DELETE;
+       } else if ((child->flags & HAMMER2_CHAIN_MOVED) &&
+                  child->bref.modify_tid <= info->sync_tid) {
+               action = HC_UPDATE;
+       } else {
+               hammer2_chain_unlock(child);
+               spin_lock(&parent->core->cst.spin);
+               goto finalize;
+       }
+
+       /*
+        * If the parent is to be deleted then we can clear MOVED
+        * in the child without updating the parent.  That is, it
+        * doesn't matter that the parent->child blockref is left intact
+        * because the parent is going to be deleted too.  This little
+        * bit of code will result in major optimizations of recursive
+        * file tree deletions and truncations.
+        */
+       if ((parent->flags & HAMMER2_CHAIN_DELETED) &&
+           parent->delete_tid <= info->sync_tid) {
+               goto cleanup;
+       }
+
+       /*
+        * The parent's blockref to the child must be deleted or updated.
+        *
+        * This point is not reached on successful DESTROYED optimizations
+        * but can be reached on recursive deletions.
+        */
+       hammer2_chain_modify(info->trans, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
+
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               KKASSERT((parent->data->ipdata.op_flags &
+                         HAMMER2_OPFLAG_DIRECTDATA) == 0);
+               base = &parent->data->ipdata.u.blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               if (parent->data) {
+                       base = &parent->data->npdata.blockref[0];
+               } else {
+                       base = NULL;
+                       KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
+               }
+               count = parent->bytes / sizeof(hammer2_blockref_t);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       default:
+               base = NULL;
+               count = 0;
+               panic("hammer2_chain_flush_scan2: "
+                     "unrecognized blockref type: %d",
+                     parent->bref.type);
+       }
+
+       /*
+        * Update the parent's blockref table and propagate mirror_tid.
+        * blockref updates do not touch modify_tid.  Instead, mirroring
+        * operations always reconcile the entire array during their
+        * mirror_tid based recursion.
+        */
+       KKASSERT(child->index >= 0);
+       if (action == HC_DELETE) {
+               if (base) {
+                       KKASSERT(child->index < count);
+                       bzero(&base[child->index], sizeof(child->bref));
+               }
+       } else {
+               if (base) {
+                       KKASSERT(child->index < count);
+                       base[child->index] = child->bref;
+               }
+       }
+
+       if (parent->bref.mirror_tid < child->bref.mirror_tid) {
+               parent->bref.mirror_tid = child->bref.mirror_tid;
+       }
+       if (parent->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
+           hmp->voldata.mirror_tid < child->bref.mirror_tid) {
+               hmp->voldata.mirror_tid = child->bref.mirror_tid;
+       }
+
+cleanup:
+       /*
+        * Deletions should also zero-out the child's bref for safety.
+        */
+       if (action == HC_DELETE)
+               bzero(&child->bref, sizeof(child->bref));
+
+       /*
+        * Cleanup the child's MOVED flag and unlock the child.
+        */
+       if (child->flags & HAMMER2_CHAIN_MOVED) {
+               atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
+               hammer2_chain_drop(child);      /* flag */
+       }
+
+       /*
+        * Unlock the child.  This can wind up dropping the child's
+        * last ref, removing it from the parent's RB tree, and deallocating
+        * the structure.  The RB_SCAN() our caller is doing handles the
+        * situation.
+        */
+       hammer2_chain_unlock(child);
+       spin_lock(&parent->core->cst.spin);
+
+       /*
+        * The parent cleared SUBMODIFIED prior to the scan.  If the child
+        * still requires a flush (e.g. it is outside the current sync
+        * zone) we must re-set SUBMODIFIED on the way back up.
+        * NOTE(review): the unlock above may free child; confirm this read.
+        */
+finalize:
+       if (child->flags & (HAMMER2_CHAIN_MOVED |
+                           HAMMER2_CHAIN_DELETED |
+                           HAMMER2_CHAIN_MODIFIED |
+                           HAMMER2_CHAIN_SUBMODIFIED)) {
+               atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
+       }
+
+       return (0);
 }
index 52fd402..1cb22df 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -128,8 +128,7 @@ hammer2_freemap_alloc(hammer2_mount_t *hmp, int type, size_t bytes)
                                        ~HAMMER2_SEGMASK64;
                        fc->bulk = data_next;
                }
-               atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
-               hammer2_voldata_unlock(hmp);
+               hammer2_voldata_unlock(hmp, 1);
        }
        lockmgr(&hmp->alloclk, LK_RELEASE);
 
index d682aad..feff9e6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -77,7 +77,7 @@ hammer2_inode_drop(hammer2_inode_t *ip)
                                chain = ip->chain;
                                ip->chain = NULL;
                                if (chain)
-                                       hammer2_chain_drop(hmp, chain);
+                                       hammer2_chain_drop(chain);
 
                                /*
                                 * We have to drop pip (if non-NULL) to
@@ -258,7 +258,7 @@ hammer2_inode_get(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
        nip = kmalloc(sizeof(*nip), hmp->minode, M_WAITOK | M_ZERO);
 
        nip->chain = chain;
-       hammer2_chain_ref(hmp, chain);          /* nip->chain */
+       hammer2_chain_ref(chain);               /* nip->chain */
        nip->pip = dip;                         /* can be NULL */
        if (dip)
                hammer2_inode_ref(dip); /* ref dip for nip->pip */
@@ -279,33 +279,32 @@ hammer2_inode_get(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
 }
 
 /*
- * Put away an inode, disconnecting it from its chain.  The inode must be
- * exclusively locked.
+ * Put away an inode, unlocking it and disconnecting it from its chain.
  *
- * The inode will be unlocked by this function.  Note however that any related
- * chain returned by the hammer2_inode_lock_*() call will NOT be unlocked
- * by this function.  The related chain is dropped to undo the ref that
- * hammer2_inode_get() put on it.
+ * The inode must be exclusively locked on call and non-recursed, with
+ * at least 2 refs (one belonging to the exclusive lock, and one additional
+ * ref belonging to the caller).
  *
- * passed_chain is unlocked normally and does not have to be directly
- * associated with (ip).  This is simply so the API works the same as
- * the hammer2_inode_unlock_ex() API.  NULL is ok.
+ * Upon return the inode typically has one ref remaining which the caller
+ * drops.
  */
 void
-hammer2_inode_put(hammer2_inode_t *ip, hammer2_chain_t *passed_chain)
+hammer2_inode_put(hammer2_inode_t *ip)
 {
-       hammer2_mount_t *hmp = ip->hmp;
        hammer2_inode_t *pip;
        hammer2_chain_t *chain;
 
        /*
-        * Disconnect chain
+        * Disconnect and unlock chain
         */
+       KKASSERT(ip->refs >= 2);
+       KKASSERT(ip->topo_cst.count == -1);     /* one excl lock allowed */
        if ((chain = ip->chain) != NULL) {
                ip->chain = NULL;
-               hammer2_chain_drop(hmp, chain);         /* from *_get() */
+               hammer2_inode_unlock_ex(ip);
+               hammer2_chain_unlock(chain);    /* because ip->chain now NULL */
+               hammer2_chain_drop(chain);      /* from *_get() */
        }
-       KKASSERT(ip->topo_cst.count == -1);     /* one excl lock allowed */
 
        /*
         * Disconnect pip
@@ -314,11 +313,6 @@ hammer2_inode_put(hammer2_inode_t *ip, hammer2_chain_t *passed_chain)
                ip->pip = NULL;
                hammer2_inode_drop(pip);
        }
-
-       /*
-        * clean up the ip, we use an inode_unlock_ex-compatible API.
-        */
-       hammer2_inode_unlock_ex(ip, passed_chain);
 }
 
 /*
@@ -334,12 +328,13 @@ hammer2_inode_put(hammer2_inode_t *ip, hammer2_chain_t *passed_chain)
  *
  * dip is not locked on entry.
  */
-int
-hammer2_inode_create(hammer2_inode_t *dip,
+hammer2_inode_t *
+hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
                     struct vattr *vap, struct ucred *cred,
                     const uint8_t *name, size_t name_len,
-                    hammer2_inode_t **nipp, hammer2_chain_t **nchainp)
+                    int *errorp)
 {
+       hammer2_inode_data_t *dipdata;
        hammer2_inode_data_t *nipdata;
        hammer2_mount_t *hmp;
        hammer2_chain_t *chain;
@@ -354,6 +349,7 @@ hammer2_inode_create(hammer2_inode_t *dip,
 
        hmp = dip->hmp;
        lhc = hammer2_dirhash(name, name_len);
+       *errorp = 0;
 
        /*
         * Locate the inode or indirect block to create the new
@@ -361,68 +357,74 @@ hammer2_inode_create(hammer2_inode_t *dip,
         * and iterate until we don't get one.
         */
 retry:
-       parent = hammer2_inode_lock_ex(dip);
-
-       dip_uid = parent->data->ipdata.uid;
-       dip_gid = parent->data->ipdata.gid;
-       dip_mode = parent->data->ipdata.mode;
+       hammer2_inode_lock_ex(dip);
+       dipdata = &dip->chain->data->ipdata;
+       dip_uid = dipdata->uid;
+       dip_gid = dipdata->gid;
+       dip_mode = dipdata->mode;
 
+       parent = hammer2_chain_lookup_init(dip->chain, 0);
        error = 0;
        while (error == 0) {
-               chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
+               chain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
                if (chain == NULL)
                        break;
                if ((lhc & HAMMER2_DIRHASH_VISIBLE) == 0)
                        error = ENOSPC;
                if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
                        error = ENOSPC;
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
                chain = NULL;
                ++lhc;
        }
        if (error == 0) {
-               chain = hammer2_chain_create(hmp, parent, NULL, lhc, 0,
+               error = hammer2_chain_create(trans, parent, &chain,
+                                            lhc, 0,
                                             HAMMER2_BREF_TYPE_INODE,
-                                            HAMMER2_INODE_BYTES,
-                                            &error);
+                                            HAMMER2_INODE_BYTES);
        }
 
-       hammer2_inode_unlock_ex(dip, parent);
-
        /*
-        * Handle the error case
+        * Cleanup and handle retries.
         */
+       if (error == EAGAIN) {
+               hammer2_chain_ref(parent);
+               hammer2_chain_lookup_done(parent);
+               hammer2_inode_unlock_ex(dip);
+               hammer2_chain_wait(parent);
+               hammer2_chain_drop(parent);
+               goto retry;
+       }
+       hammer2_chain_lookup_done(parent);
+       hammer2_inode_unlock_ex(dip);
+
        if (error) {
                KKASSERT(chain == NULL);
-               if (error == EAGAIN) {
-                       hammer2_chain_wait(hmp, parent);
-                       goto retry;
-               }
-               *nipp = NULL;
-               *nchainp = NULL;
-               return (error);
+               *errorp = error;
+               return (NULL);
        }
 
        /*
         * Set up the new inode.
         *
         * NOTE: *_get() integrates chain's lock into the inode lock.
+        *
+        * NOTE: Only one new inode can currently be created per
+        *       transaction.  If the need arises we can adjust
+        *       hammer2_trans_init() to allow more.
         */
        nip = hammer2_inode_get(dip->hmp, dip->pmp, dip, chain);
-       *nipp = nip;
-       *nchainp = chain;
        nipdata = &chain->data->ipdata;
 
-       hammer2_voldata_lock(hmp);
        if (vap) {
+               KKASSERT(trans->inodes_created == 0);
                nipdata->type = hammer2_get_obj_type(vap->va_type);
-               nipdata->inum = hmp->voldata.alloc_tid++;
-               /* XXX modify/lock */
+               nipdata->inum = trans->sync_tid;
+               ++trans->inodes_created;
        } else {
                nipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
                nipdata->inum = 1;
        }
-       hammer2_voldata_unlock(hmp);
        nipdata->version = HAMMER2_INODE_VERSION_ONE;
        hammer2_update_time(&nipdata->ctime);
        nipdata->mtime = nipdata->ctime;
@@ -470,32 +472,27 @@ retry:
        nipdata->name_key = lhc;
        nipdata->name_len = name_len;
 
-       return (0);
+       return (nip);
 }
 
 /*
- * Create a duplicate of the inode (chain) in the specified target directory
- * (dip), return the duplicated chain in *nchainp (locked).  chain is locked
- * on call and remains locked on return.
- *
- * If name is NULL the inode is duplicated as a hidden directory entry.
- *
- * XXX name needs to be NULL for now.
+ * Create a duplicate of (ochain) in the specified target directory (dip).
+ * ochain must represent an inode.  The new chain is returned locked and
+ * referenced.
  */
-int
-hammer2_inode_duplicate(hammer2_inode_t *dip,
-                       hammer2_chain_t *ochain, hammer2_chain_t **nchainp)
+hammer2_chain_t *
+hammer2_inode_duplicate(hammer2_trans_t *trans, hammer2_chain_t *ochain,
+                       hammer2_inode_t *dip, int *errorp)
 {
        hammer2_inode_data_t *nipdata;
        hammer2_mount_t *hmp;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_key_t lhc;
-       int error = 0;
 
+       *errorp = 0;
        hmp = dip->hmp;
        lhc = ochain->data->ipdata.inum;
-       *nchainp = NULL;
        KKASSERT((lhc & HAMMER2_DIRHASH_VISIBLE) == 0);
 
        /*
@@ -505,44 +502,43 @@ hammer2_inode_duplicate(hammer2_inode_t *dip,
         * There should be no key collisions with invisible inode keys.
         */
 retry:
-       parent = dip->chain;
-       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
-       chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
+       parent = hammer2_chain_lookup_init(dip->chain, 0);
+       chain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
        if (chain) {
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
                chain = NULL;
-               error = ENOSPC;
+               *errorp = ENOSPC;
        }
 
        /*
         * Create entry in common parent directory.
         */
-       if (error == 0) {
-               chain = hammer2_chain_create(hmp, parent, NULL, lhc, 0,
-                                            HAMMER2_BREF_TYPE_INODE, /* n/a */
-                                            HAMMER2_INODE_BYTES,     /* n/a */
-                                            &error);
+       if (*errorp == 0) {
+               *errorp = hammer2_chain_create(trans, parent, &chain,
+                                              lhc, 0,
+                                              HAMMER2_BREF_TYPE_INODE,/* n/a */
+                                              HAMMER2_INODE_BYTES);   /* n/a */
        }
 
        /*
-        * Clean up, but we need to retain a ref on parent so we can wait
-        * on it for certain errors.
+        * Cleanup and handle retries.
         */
-       if (error == EAGAIN)
-               hammer2_chain_ref(hmp, parent);
-       hammer2_chain_unlock(hmp, parent);
+       if (*errorp == EAGAIN) {
+               hammer2_chain_ref(parent);
+               hammer2_chain_lookup_done(parent);
+               hammer2_chain_wait(parent);
+               hammer2_chain_drop(parent);
+               goto retry;
+       }
+
+       hammer2_chain_lookup_done(parent);
 
        /*
         * Handle the error case
         */
-       if (error) {
+       if (*errorp) {
                KKASSERT(chain == NULL);
-               if (error == EAGAIN) {
-                       hammer2_chain_wait(hmp, parent);
-                       hammer2_chain_drop(hmp, parent);
-                       goto retry;
-               }
-               return (error);
+               return (NULL);
        }
 
        /*
@@ -564,10 +560,10 @@ retry:
         * pointer to the real hardlink's inum and can't have any sub-chains.
         * XXX might be 0-ref chains left.
         */
-       hammer2_chain_flush(hmp, ochain, 0);
+       hammer2_chain_flush(trans, ochain);
        /*KKASSERT(RB_EMPTY(&ochain.rbhead));*/
 
-       hammer2_chain_modify(hmp, chain, 0);
+       hammer2_chain_modify(trans, chain, 0);
        nipdata = &chain->data->ipdata;
        *nipdata = ochain->data->ipdata;
 
@@ -581,29 +577,39 @@ retry:
        nipdata->name_len = strlen(nipdata->filename);
        nipdata->name_key = lhc;
 
-       *nchainp = chain;
-
-       return (0);
+       return (chain);
 }
 
 /*
- * Connect *chainp to the media topology represented by (dip, name, len).
- * A directory entry is created which points to *chainp.  *chainp is then
- * unlocked and set to NULL.
+ * Connect the target inode to the media topology at (dip, name, len).
+ * This function creates a directory entry and replaces (*chainp).
  *
- * If *chainp is not currently connected we simply connect it up.
+ * If (*chainp) was marked DELETED then it represents a terminus inode
+ * with no other nlinks, we can simply duplicate the chain (in-memory
+ * chain structures cannot be moved within the in-memory topology, only
+ * duplicated, but the duplicate uses the same bref).
  *
- * If *chainp is already connected we create a OBJTYPE_HARDLINK entry which
- * points to chain's inode number.  *chainp is expected to be the terminus of
- * the hardlink sitting as a hidden file in a common parent directory
- * in this situation.
+ * If (*chainp) is not marked DELETED then it represents a hardlink
+ * terminus which still has a non-zero nlink count.  Instead of duplicating
+ * it (which would be like a snapshot), we need to create a
+ * OBJTYPE_HARDLINK directory entry which references (*chainp)'s inode
+ * number and bump (*chainp)'s nlinks.  In this situation we return
+ * the terminus as *chainp.
  *
- * The caller always wants to reference the hardlink terminus, not the
- * hardlink pointer that we might be creating, so we do NOT replace
- * *chainp here, we simply unlock and NULL it out.
+ * (*chainp) is adjusted if necessary and returned locked.  If different,
+ * the original (*chainp) is unlocked.  Note that the (*chainp) that is
+ * returned is always the hardlink terminus (the actual inode), which
+ * might reside in some parent directory.  It will not be the
+ * OBJTYPE_HARDLINK pointer.
+ *
+ * WARNING!  This function will also replace ip->chain.  The related inode
+ *          must be locked exclusively or would wind up racing other
+ *          modifying operations on the same inode which then wind up
+ *          modifying under the old chain instead of the new chain.
  */
 int
-hammer2_inode_connect(hammer2_inode_t *dip, hammer2_chain_t **chainp,
+hammer2_inode_connect(hammer2_trans_t *trans, hammer2_inode_t *dip,
+                     hammer2_inode_t *ip, hammer2_chain_t **chainp,
                      const uint8_t *name, size_t name_len)
 {
        hammer2_inode_data_t *ipdata;
@@ -618,25 +624,27 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_chain_t **chainp,
        hmp = dip->hmp;
 
        ochain = *chainp;
-       *chainp = NULL;
 
        /*
         * Since ochain is either disconnected from the topology or represents
         * a hardlink terminus which is always a parent of or equal to dip,
         * we should be able to safely lock dip->chain for our setup.
         */
-retry:
-       parent = dip->chain;
-       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
+       parent = hammer2_chain_lookup_init(dip->chain, 0);
 
        lhc = hammer2_dirhash(name, name_len);
-       hlink = (ochain->parent != NULL);
+       hlink = ((ochain->flags & HAMMER2_CHAIN_DELETED) != 0);
+       kprintf("reconnect hlink=%d name=%*.*s\n",
+               hlink, (int)name_len, (int)name_len, name);
 
        /*
         * In fake mode flush oip so we can just snapshot it downbelow.
+        * A flush is not otherwise needed as the new chain inherits
+        * all active children of the old chain (they will share the same
+        * chain_core).
         */
        if (hlink && hammer2_hardlink_enable < 0)
-               hammer2_chain_flush(hmp, ochain, 0);
+               hammer2_chain_flush(trans, ochain);
 
        /*
         * Locate the inode or indirect block to create the new
@@ -645,12 +653,12 @@ retry:
         */
        error = 0;
        while (error == 0) {
-               nchain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
+               nchain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
                if (nchain == NULL)
                        break;
                if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
                        error = ENOSPC;
-               hammer2_chain_unlock(hmp, nchain);
+               hammer2_chain_unlock(nchain);
                nchain = NULL;
                ++lhc;
        }
@@ -662,35 +670,37 @@ retry:
         */
        if (error == 0) {
                if (hlink) {
-                       nchain = hammer2_chain_create(hmp, parent,
-                                                    NULL, lhc, 0,
+                       /*
+                        * Hardlink pointer needed, create totally fresh
+                        * directory entry.
+                        */
+                       KKASSERT(nchain == NULL);
+                       error = hammer2_chain_create(trans, parent, &nchain,
+                                                    lhc, 0,
                                                     HAMMER2_BREF_TYPE_INODE,
-                                                    HAMMER2_INODE_BYTES,
-                                                    &error);
+                                                    HAMMER2_INODE_BYTES);
                } else {
                        /*
-                        * NOTE: reconnects oip->chain to the media
-                        *       topology and returns its argument
-                        *       (oip->chain).
-                        *
-                        * No additional locks or refs are obtained on
-                        * the returned chain so don't double-unlock!
+                        * Original inode reconnected, duplicate as a
+                        * new directory entry, leave unconnected and
+                        * then call chain_create() to connect it.
                         */
-                       nchain = hammer2_chain_create(hmp, parent,
-                                                    ochain, lhc, 0,
+                       nchain = ochain;
+                       ochain = NULL;
+                       hammer2_chain_duplicate(trans, NULL, -1, &nchain);
+                       error = hammer2_chain_create(trans, parent, &nchain,
+                                                    lhc, 0,
                                                     HAMMER2_BREF_TYPE_INODE,
-                                                    HAMMER2_INODE_BYTES,
-                                                    &error);
+                                                    HAMMER2_INODE_BYTES);
                }
        }
 
        /*
-        * Unlock stuff.  This is a bit messy, if we have an EAGAIN error
-        * we need to wait for operations on parent to finish.
+        * Unlock stuff.
         */
-       if (error == EAGAIN)
-               hammer2_chain_ref(hmp, parent);
-       hammer2_chain_unlock(hmp, parent);
+       KKASSERT(error != EAGAIN);
+       hammer2_chain_lookup_done(parent);
+       parent = NULL;
 
        /*
         * ochain still active.
@@ -699,12 +709,7 @@ retry:
         */
        if (error) {
                KKASSERT(nchain == NULL);
-               if (error == EAGAIN) {
-                       hammer2_chain_wait(hmp, parent);
-                       hammer2_chain_drop(hmp, parent);
-                       goto retry;
-               }
-               hammer2_chain_unlock(hmp, ochain);
+               hammer2_chain_unlock(ochain);
                return (error);
        }
 
@@ -714,16 +719,15 @@ retry:
         *
         * When creating an OBJTYPE_HARDLINK entry remember to unlock the
         * chain, the caller will access the hardlink via the actual hardlink
-        * target file and not the hardlink pointer entry.
+        * target file and not the hardlink pointer entry, so we must still
+        * return ochain.
         */
        if (hlink && hammer2_hardlink_enable >= 0) {
                /*
                 * Create the HARDLINK pointer.  oip represents the hardlink
                 * target in this situation.
-                *
-                * NOTE: *_get() integrates chain's lock into the inode lock.
                 */
-               hammer2_chain_modify(hmp, nchain, 0);
+               hammer2_chain_modify(trans, nchain, 0);
                KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
                ipdata = &nchain->data->ipdata;
                bcopy(name, ipdata->filename, name_len);
@@ -735,14 +739,19 @@ retry:
                ipdata->nlinks = 1;
                kprintf("created hardlink %*.*s\n",
                        (int)name_len, (int)name_len, name);
-               hammer2_chain_unlock(hmp, nchain);
+               hammer2_chain_unlock(nchain);
+               nchain = ochain;
+               ochain = NULL;
        } else if (hlink && hammer2_hardlink_enable < 0) {
                /*
                 * Create a snapshot (hardlink fake mode for debugging).
+                * (ochain already flushed above so we can just copy the
+                * bref XXX).
                 *
-                * NOTE: *_get() integrates nchain's lock into the inode lock.
+                * Since this is a snapshot we return nchain in the fake
+                * hardlink case.
                 */
-               hammer2_chain_modify(hmp, nchain, 0);
+               hammer2_chain_modify(trans, nchain, 0);
                KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
                ipdata = &nchain->data->ipdata;
                *ipdata = ochain->data->ipdata;
@@ -751,18 +760,15 @@ retry:
                ipdata->name_len = name_len;
                kprintf("created fake hardlink %*.*s\n",
                        (int)name_len, (int)name_len, name);
-               hammer2_chain_unlock(hmp, nchain);
        } else {
                /*
-                * Normally disconnected inode (e.g. during a rename) that
-                * was reconnected.  We must fixup the name stored in
-                * oip.
+                * We are reconnecting a previously DELETED node in a new
+                * location.  nchain is a duplication of the deleted node.
                 *
-                * We are using oip as chain, already locked by caller,
-                * do not unlock it.
+                * We must fixup the name stored in nchain.
                 */
-               hammer2_chain_modify(hmp, ochain, 0);
-               ipdata = &ochain->data->ipdata;
+               hammer2_chain_modify(trans, nchain, 0);
+               ipdata = &nchain->data->ipdata;
 
                if (ipdata->name_len != name_len ||
                    bcmp(ipdata->filename, name, name_len) != 0) {
@@ -773,7 +779,20 @@ retry:
                }
                ipdata->nlinks = 1;
        }
-       hammer2_chain_unlock(hmp, ochain);
+       if (ochain)
+               hammer2_chain_unlock(ochain);
+       *chainp = nchain;
+
+       /*
+        * Replace ip->chain if necessary.  XXX inode sub-topology replacement.
+        */
+       if (ip->chain != nchain) {
+               hammer2_chain_ref(nchain);                      /* ip->chain */
+               if (ip->chain)
+                       hammer2_chain_drop(ip->chain);          /* ip->chain */
+               ip->chain = nchain;
+       }
+
        return (0);
 }
 
@@ -788,13 +807,14 @@ retry:
  * isdir determines whether a directory/non-directory check should be made.
  * No check is made if isdir is set to -1.
  *
- * If retain_chain is non-NULL this function can fail with an EAGAIN if it
- * catches the object in the middle of a flush.
+ * NOTE!  This function does not prevent the underlying file from still
+ *       being used if it has other refs (such as from an inode, or if its
+ *       chain is manually held).  However, the caller is responsible for
+ *       fixing up ip->chain if e.g. a rename occurs (see chain_duplicate()).
  */
 int
-hammer2_unlink_file(hammer2_inode_t *dip,
-                   const uint8_t *name, size_t name_len,
-                   int isdir, hammer2_chain_t *retain_chain)
+hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
+                   const uint8_t *name, size_t name_len, int isdir)
 {
        hammer2_inode_data_t *ipdata;
        hammer2_mount_t *hmp;
@@ -817,8 +837,10 @@ hammer2_unlink_file(hammer2_inode_t *dip,
        /*
         * Search for the filename in the directory
         */
-       parent = hammer2_inode_lock_ex(dip);
-       chain = hammer2_chain_lookup(hmp, &parent,
+       hammer2_inode_lock_ex(dip);
+
+       parent = hammer2_chain_lookup_init(dip->chain, 0);
+       chain = hammer2_chain_lookup(&parent,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     0);
        while (chain) {
@@ -827,11 +849,11 @@ hammer2_unlink_file(hammer2_inode_t *dip,
                    bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
                        break;
                }
-               chain = hammer2_chain_next(hmp, &parent, chain,
+               chain = hammer2_chain_next(&parent, chain,
                                           lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                           0);
        }
-       hammer2_inode_unlock_ex(dip, NULL);     /* retain parent */
+       hammer2_inode_unlock_ex(dip);   /* retain parent */
 
        /*
         * Not found or wrong type (isdir < 0 disables the type check).
@@ -863,7 +885,7 @@ hammer2_unlink_file(hammer2_inode_t *dip,
         */
        if (chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
                KKASSERT(parent_ref == 0);
-               hammer2_chain_unlock(hmp, parent);
+               hammer2_chain_unlock(parent);
                parent = NULL;
                error = hammer2_hardlink_find(dip, &chain, &ochain);
        }
@@ -879,18 +901,17 @@ hammer2_unlink_file(hammer2_inode_t *dip,
         *       entries.
         */
        if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir >= 0) {
-               dparent = chain;
-               hammer2_chain_lock(hmp, dparent, HAMMER2_RESOLVE_ALWAYS);
-               dchain = hammer2_chain_lookup(hmp, &dparent,
+               dparent = hammer2_chain_lookup_init(chain, 0);
+               dchain = hammer2_chain_lookup(&dparent,
                                              0, (hammer2_key_t)-1,
                                              HAMMER2_LOOKUP_NODATA);
                if (dchain) {
-                       hammer2_chain_unlock(hmp, dchain);
-                       hammer2_chain_unlock(hmp, dparent);
+                       hammer2_chain_unlock(dchain);
+                       hammer2_chain_lookup_done(dparent);
                        error = ENOTEMPTY;
                        goto done;
                }
-               hammer2_chain_unlock(hmp, dparent);
+               hammer2_chain_lookup_done(dparent);
                dparent = NULL;
                /* dchain NULL */
        }
@@ -914,30 +935,24 @@ hammer2_unlink_file(hammer2_inode_t *dip,
                 * NOTE: parent from above is NULL when ochain != NULL
                 *       so we can reuse it.
                 */
-               hammer2_chain_lock(hmp, ochain, HAMMER2_RESOLVE_ALWAYS);
+               hammer2_chain_lock(ochain, HAMMER2_RESOLVE_ALWAYS);
                parent_ref = 1;
                for (;;) {
                        parent = ochain->parent;
-                       hammer2_chain_ref(hmp, parent);
-                       hammer2_chain_unlock(hmp, ochain);
-                       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
-                       hammer2_chain_lock(hmp, ochain, HAMMER2_RESOLVE_ALWAYS);
+                       hammer2_chain_ref(parent);
+                       hammer2_chain_unlock(ochain);
+                       hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
+                       hammer2_chain_lock(ochain, HAMMER2_RESOLVE_ALWAYS);
                        if (ochain->parent == parent)
                                break;
-                       hammer2_chain_unlock(hmp, parent);
-                       hammer2_chain_drop(hmp, parent);
+                       hammer2_chain_unlock(parent);
+                       hammer2_chain_drop(parent);
                }
 
-               if (ochain == retain_chain && ochain->flushing) {
-                       hammer2_chain_unlock(hmp, ochain);
-                       error = EAGAIN;
-                       goto done;
-               }
-               hammer2_chain_delete(hmp, parent, ochain,
-                                    (ochain == retain_chain));
-               hammer2_chain_unlock(hmp, ochain);
-               hammer2_chain_unlock(hmp, parent);
-               hammer2_chain_drop(hmp, parent);
+               hammer2_chain_delete(trans, parent, ochain);
+               hammer2_chain_unlock(ochain);
+               hammer2_chain_unlock(parent);
+               hammer2_chain_drop(parent);
                parent = NULL;
 
                /*
@@ -946,19 +961,17 @@ hammer2_unlink_file(hammer2_inode_t *dip,
                 */
                if (chain->data->ipdata.nlinks == 1) {
                        dparent = chain->parent;
-                       hammer2_chain_ref(hmp, chain);
-                       hammer2_chain_unlock(hmp, chain);
-                       hammer2_chain_lock(hmp, dparent,
-                                          HAMMER2_RESOLVE_ALWAYS);
-                       hammer2_chain_lock(hmp, chain,
-                                          HAMMER2_RESOLVE_ALWAYS);
-                       hammer2_chain_drop(hmp, chain);
-                       hammer2_chain_modify(hmp, chain, 0);
+                       hammer2_chain_ref(chain);
+                       hammer2_chain_unlock(chain);
+                       hammer2_chain_lock(dparent, HAMMER2_RESOLVE_ALWAYS);
+                       hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
+                       hammer2_chain_drop(chain);
+                       hammer2_chain_modify(trans, chain, 0);
                        --chain->data->ipdata.nlinks;
-                       hammer2_chain_delete(hmp, dparent, chain, 0);
-                       hammer2_chain_unlock(hmp, dparent);
+                       hammer2_chain_delete(trans, dparent, chain);
+                       hammer2_chain_unlock(dparent);
                } else {
-                       hammer2_chain_modify(hmp, chain, 0);
+                       hammer2_chain_modify(trans, chain, 0);
                        --chain->data->ipdata.nlinks;
                }
        } else {
@@ -969,27 +982,22 @@ hammer2_unlink_file(hammer2_inode_t *dip,
                 * NOTE: *_get() integrates chain's lock into the inode lock.
                 */
                ipdata = &chain->data->ipdata;
-               if (chain == retain_chain && chain->flushing) {
-                       error = EAGAIN;
-                       goto done;
-               }
-               hammer2_chain_modify(hmp, chain, 0);
+               hammer2_chain_modify(trans, chain, 0);
                --ipdata->nlinks;
-               hammer2_chain_delete(hmp, parent, chain,
-                                    (retain_chain == chain));
+               hammer2_chain_delete(trans, parent, chain);
        }
 
        error = 0;
 done:
        if (chain)
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
        if (parent) {
-               hammer2_chain_unlock(hmp, parent);
+               hammer2_chain_lookup_done(parent);
                if (parent_ref)
-                       hammer2_chain_drop(hmp, parent);
+                       hammer2_chain_drop(parent);
        }
        if (ochain)
-               hammer2_chain_drop(hmp, ochain);
+               hammer2_chain_drop(ochain);
 
        return error;
 }
@@ -1018,7 +1026,8 @@ hammer2_inode_calc_alloc(hammer2_key_t filesize)
  * If the file has to be relocated ip->chain will also be adjusted.
  */
 int
-hammer2_hardlink_consolidate(hammer2_inode_t *ip, hammer2_chain_t **chainp,
+hammer2_hardlink_consolidate(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                            hammer2_chain_t **chainp,
                             hammer2_inode_t *tdip, int nlinks)
 {
        hammer2_inode_data_t *ipdata;
@@ -1030,23 +1039,32 @@ hammer2_hardlink_consolidate(hammer2_inode_t *ip, hammer2_chain_t **chainp,
        hammer2_chain_t *parent;
        int error;
 
+       /*
+        * Extra lock on chain so it can be returned locked.
+        */
        hmp = tdip->hmp;
-       *chainp = NULL;
-       chain = hammer2_inode_lock_ex(ip);
+       hammer2_inode_lock_ex(ip);
+
+       chain = ip->chain;
+       error = hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
+       KKASSERT(error == 0);
 
        if (nlinks == 0 &&                      /* no hardlink needed */
            (chain->data->ipdata.name_key & HAMMER2_DIRHASH_VISIBLE)) {
-               hammer2_inode_unlock_ex(ip, NULL);
+               hammer2_inode_unlock_ex(ip);
                *chainp = chain;
                return (0);
        }
        if (hammer2_hardlink_enable < 0) {      /* fake hardlinks */
-               hammer2_inode_unlock_ex(ip, NULL);
+               hammer2_inode_unlock_ex(ip);
                *chainp = chain;
                return (0);
        }
+
        if (hammer2_hardlink_enable == 0) {     /* disallow hardlinks */
-               hammer2_inode_unlock_ex(ip, chain);
+               hammer2_inode_unlock_ex(ip);
+               hammer2_chain_unlock(chain);
+               *chainp = NULL;
                return (ENOTSUP);
        }
 
@@ -1054,7 +1072,7 @@ hammer2_hardlink_consolidate(hammer2_inode_t *ip, hammer2_chain_t **chainp,
         * cdip will be returned with a ref, but not locked.
         */
        fdip = ip->pip;
-       cdip = hammer2_inode_common_parent(hmp, fdip, tdip);
+       cdip = hammer2_inode_common_parent(fdip, tdip);
 
        /*
         * If no change in the hardlink's target directory is required and
@@ -1064,7 +1082,7 @@ hammer2_hardlink_consolidate(hammer2_inode_t *ip, hammer2_chain_t **chainp,
        if (cdip == fdip &&
            (chain->data->ipdata.name_key & HAMMER2_DIRHASH_VISIBLE) == 0) {
                if (nlinks) {
-                       hammer2_chain_modify(hmp, chain, 0);
+                       hammer2_chain_modify(trans, chain, 0);
                        chain->data->ipdata.nlinks += nlinks;
                }
                *chainp = chain;
@@ -1079,12 +1097,12 @@ hammer2_hardlink_consolidate(hammer2_inode_t *ip, hammer2_chain_t **chainp,
         * Hardlink targets are hidden inodes in a parent directory common
         * to all directory entries referencing the hardlink.
         */
-       error = hammer2_inode_duplicate(cdip, chain, &nchain);
+       nchain = hammer2_inode_duplicate(trans, chain, cdip, &error);
        if (error == 0) {
                /*
                 * Bump nlinks on duplicated hidden inode.
                 */
-               hammer2_chain_modify(hmp, nchain, 0);
+               hammer2_chain_modify(trans, nchain, 0);
                nchain->data->ipdata.nlinks += nlinks;
 
                /*
@@ -1094,7 +1112,7 @@ hammer2_hardlink_consolidate(hammer2_inode_t *ip, hammer2_chain_t **chainp,
                 * If the old chain IS a hardlink target then delete it.
                 */
                if (chain->data->ipdata.name_key & HAMMER2_DIRHASH_VISIBLE) {
-                       hammer2_chain_modify(hmp, chain, 0);
+                       hammer2_chain_modify(trans, chain, 0);
                        ipdata = &chain->data->ipdata;
                        ipdata->target_type = ipdata->type;
                        ipdata->type = HAMMER2_OBJTYPE_HARDLINK;
@@ -1129,43 +1147,44 @@ hammer2_hardlink_consolidate(hammer2_inode_t *ip, hammer2_chain_t **chainp,
                        kprintf("DELETE INVISIBLE\n");
                        for (;;) {
                                parent = chain->parent;
-                               hammer2_chain_ref(hmp, parent);
-                               hammer2_chain_ref(hmp, chain);
-                               hammer2_chain_unlock(hmp, chain);
-                               hammer2_chain_lock(hmp, parent,
+                               hammer2_chain_ref(parent);
+                               hammer2_chain_ref(chain);
+                               hammer2_chain_unlock(chain);
+                               hammer2_chain_lock(parent,
                                                   HAMMER2_RESOLVE_ALWAYS);
-                               hammer2_chain_lock(hmp, chain,
+                               hammer2_chain_lock(chain,
                                                   HAMMER2_RESOLVE_ALWAYS);
-                               hammer2_chain_drop(hmp, chain);
+                               hammer2_chain_drop(chain);
                                if (chain->parent == parent)
                                        break;
-                               hammer2_chain_unlock(hmp, parent);
-                               hammer2_chain_drop(hmp, parent);
+                               hammer2_chain_unlock(parent);
+                               hammer2_chain_drop(parent);
                        }
-                       hammer2_chain_delete(hmp, parent, chain, 0);
-                       hammer2_chain_unlock(hmp, parent);
-                       hammer2_chain_drop(hmp, parent);
+                       hammer2_chain_delete(trans, parent, chain);
+                       hammer2_chain_unlock(parent);
+                       hammer2_chain_drop(parent);
                }
 
                /*
                 * Replace ip->chain with nchain (ip is still locked).
                 */
-               hammer2_chain_ref(hmp, nchain);                 /* ip->chain */
+               hammer2_chain_ref(nchain);              /* ip->chain */
                if (ip->chain)
-                       hammer2_chain_drop(hmp, ip->chain);     /* ip->chain */
+                       hammer2_chain_drop(ip->chain);  /* ip->chain */
                ip->chain = nchain;
 
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
                *chainp = nchain;
        } else {
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
+               *chainp = NULL;
        }
 
        /*
         * Cleanup, chain/nchain already dealt with.
         */
 done:
-       hammer2_inode_unlock_ex(ip, NULL);
+       hammer2_inode_unlock_ex(ip);
        hammer2_inode_drop(cdip);
 
        return (error);
@@ -1179,7 +1198,8 @@ done:
  * represents the only remaining link.
  */
 int
-hammer2_hardlink_deconsolidate(hammer2_inode_t *dip,
+hammer2_hardlink_deconsolidate(hammer2_trans_t *trans,
+                              hammer2_inode_t *dip,
                               hammer2_chain_t **chainp,
                               hammer2_chain_t **ochainp)
 {
@@ -1206,7 +1226,6 @@ int
 hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
                      hammer2_chain_t **ochainp)
 {
-       hammer2_mount_t *hmp = dip->hmp;
        hammer2_chain_t *chain = *chainp;
        hammer2_chain_t *parent;
        hammer2_inode_t *ip;
@@ -1215,7 +1234,7 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
 
        pip = dip;
        hammer2_inode_ref(pip);         /* for loop */
-       hammer2_chain_ref(hmp, chain);  /* for (*ochainp) */
+       hammer2_chain_ref(chain);       /* for (*ochainp) */
 
        *ochainp = chain;
 
@@ -1226,21 +1245,22 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
         * chain is reused.
         */
        lhc = chain->data->ipdata.inum;
-       hammer2_chain_unlock(hmp, chain);
+       hammer2_chain_unlock(chain);
        chain = NULL;
 
        while ((ip = pip) != NULL) {
-               parent = hammer2_inode_lock_ex(ip);
+               hammer2_inode_lock_ex(ip);
+               parent = hammer2_chain_lookup_init(ip->chain, 0);
                hammer2_inode_drop(ip);                 /* loop */
                KKASSERT(parent->bref.type == HAMMER2_BREF_TYPE_INODE);
-               chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
-               hammer2_chain_unlock(hmp, parent);
+               chain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
+               hammer2_chain_lookup_done(parent);
                if (chain)
                        break;
                pip = ip->pip;          /* safe, ip held locked */
                if (pip)
                        hammer2_inode_ref(pip);         /* loop */
-               hammer2_inode_unlock_ex(ip, NULL);
+               hammer2_inode_unlock_ex(ip);
        }
 
        /*
@@ -1249,7 +1269,8 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
         *
         * (parent is already unlocked).
         */
-       hammer2_inode_unlock_ex(ip, NULL);
+       if (ip)
+               hammer2_inode_unlock_ex(ip);
        *chainp = chain;
        if (chain) {
                KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INODE);
@@ -1265,8 +1286,7 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
  * its inode.
  */
 hammer2_inode_t *
-hammer2_inode_common_parent(hammer2_mount_t *hmp,
-                           hammer2_inode_t *fdip, hammer2_inode_t *tdip)
+hammer2_inode_common_parent(hammer2_inode_t *fdip, hammer2_inode_t *tdip)
 {
        hammer2_inode_t *scan1;
        hammer2_inode_t *scan2;
index 61d8c44..ab4bb20 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -175,7 +175,7 @@ hammer2_ioctl_remote_scan(hammer2_inode_t *ip, void *data)
 
        hammer2_voldata_lock(hmp);
        remote->copy1 = hmp->voldata.copyinfo[copyid];
-       hammer2_voldata_unlock(hmp);
+       hammer2_voldata_unlock(hmp, 0);
 
        /*
         * Adjust nextid (GET only)
@@ -223,7 +223,7 @@ hammer2_ioctl_remote_add(hammer2_inode_t *ip, void *data)
        hmp->voldata.copyinfo[copyid] = remote->copy1;
        hammer2_volconf_update(pmp, copyid);
 failed:
-       hammer2_voldata_unlock(hmp);
+       hammer2_voldata_unlock(hmp, 1);
        return (error);
 }
 
@@ -261,7 +261,7 @@ hammer2_ioctl_remote_del(hammer2_inode_t *ip, void *data)
        hmp->voldata.copyinfo[copyid].copyid = 0;
        hammer2_volconf_update(pmp, copyid);
 failed:
-       hammer2_voldata_unlock(hmp);
+       hammer2_voldata_unlock(hmp, 1);
        return (error);
 }
 
@@ -280,7 +280,7 @@ hammer2_ioctl_remote_rep(hammer2_inode_t *ip, void *data)
 
        hammer2_voldata_lock(hmp);
        /*hammer2_volconf_update(pmp, copyid);*/
-       hammer2_voldata_unlock(hmp);
+       hammer2_voldata_unlock(hmp, 1);
 
        return(0);
 }
@@ -308,7 +308,7 @@ hammer2_ioctl_socket_set(hammer2_inode_t *ip, void *data)
                return (EINVAL);
 
        hammer2_voldata_lock(hmp);
-       hammer2_voldata_unlock(hmp);
+       hammer2_voldata_unlock(hmp, 0);
 
        return(0);
 }
@@ -329,25 +329,22 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
        error = 0;
        hmp = ip->hmp;
        pfs = data;
-       parent = hmp->schain;
-       error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
-       if (error)
-               goto done;
+       parent = hammer2_chain_lookup_init(hmp->schain, 0);
 
        /*
         * Search for the first key or specific key.  Remember that keys
         * can be returned in any order.
         */
        if (pfs->name_key == 0) {
-               chain = hammer2_chain_lookup(hmp, &parent,
+               chain = hammer2_chain_lookup(&parent,
                                             0, (hammer2_key_t)-1, 0);
        } else {
-               chain = hammer2_chain_lookup(hmp, &parent,
+               chain = hammer2_chain_lookup(&parent,
                                             pfs->name_key, pfs->name_key, 0);
        }
        while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
-               chain = hammer2_chain_next(hmp, &parent, chain,
-                                    0, (hammer2_key_t)-1, 0);
+               chain = hammer2_chain_next(&parent, chain,
+                                          0, (hammer2_key_t)-1, 0);
        }
        if (chain) {
                /*
@@ -367,12 +364,12 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
                 * Calculate the next field
                 */
                do {
-                       chain = hammer2_chain_next(hmp, &parent, chain,
-                                            0, (hammer2_key_t)-1, 0);
+                       chain = hammer2_chain_next(&parent, chain,
+                                                  0, (hammer2_key_t)-1, 0);
                } while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE);
                if (chain) {
                        pfs->name_next = chain->data->ipdata.name_key;
-                       hammer2_chain_unlock(hmp, chain);
+                       hammer2_chain_unlock(chain);
                } else {
                        pfs->name_next = (hammer2_key_t)-1;
                }
@@ -380,8 +377,8 @@ hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
                pfs->name_next = (hammer2_key_t)-1;
                error = ENOENT;
        }
-done:
-       hammer2_chain_unlock(hmp, parent);
+       hammer2_chain_lookup_done(parent);
+
        return (error);
 }
 
@@ -403,17 +400,13 @@ hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
        error = 0;
        hmp = ip->hmp;
        pfs = data;
-       parent = hmp->schain;
-       error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS |
-                                               HAMMER2_RESOLVE_SHARED);
-       if (error)
-               goto done;
+       parent = hammer2_chain_lookup_init(hmp->schain, HAMMER2_LOOKUP_SHARED);
 
        pfs->name[sizeof(pfs->name) - 1] = 0;
        len = strlen(pfs->name);
        lhc = hammer2_dirhash(pfs->name, len);
 
-       chain = hammer2_chain_lookup(hmp, &parent,
+       chain = hammer2_chain_lookup(&parent,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     HAMMER2_LOOKUP_SHARED);
        while (chain) {
@@ -422,7 +415,7 @@ hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
                    bcmp(pfs->name, chain->data->ipdata.filename, len) == 0) {
                        break;
                }
-               chain = hammer2_chain_next(hmp, &parent, chain,
+               chain = hammer2_chain_next(&parent, chain,
                                           lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                           HAMMER2_LOOKUP_SHARED);
        }
@@ -438,12 +431,11 @@ hammer2_ioctl_pfs_lookup(hammer2_inode_t *ip, void *data)
                pfs->pfs_fsid = ipdata->pfs_fsid;
                ipdata = NULL;
 
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
        } else {
                error = ENOENT;
        }
-done:
-       hammer2_chain_unlock(hmp, parent);
+       hammer2_chain_lookup_done(parent);
        return (error);
 }
 
@@ -454,10 +446,10 @@ static int
 hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
 {
        hammer2_inode_data_t *nipdata;
-       hammer2_chain_t *nchain;
        hammer2_mount_t *hmp;
        hammer2_ioc_pfs_t *pfs;
        hammer2_inode_t *nip;
+       hammer2_trans_t trans;
        int error;
 
        hmp = ip->hmp;
@@ -466,17 +458,19 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
 
        pfs->name[sizeof(pfs->name) - 1] = 0;   /* ensure 0-termination */
 
-       error = hammer2_inode_create(hmp->sroot, NULL, NULL,
+       hammer2_trans_init(&trans, hmp);
+       nip = hammer2_inode_create(&trans, hmp->sroot, NULL, NULL,
                                     pfs->name, strlen(pfs->name),
-                                    &nip, &nchain);
+                                    &error);
        if (error == 0) {
-               hammer2_chain_modify(hmp, nchain, 0);
-               nipdata = &nchain->data->ipdata;
+               hammer2_chain_modify(&trans, nip->chain, 0);
+               nipdata = &nip->chain->data->ipdata;
                nipdata->pfs_type = pfs->pfs_type;
                nipdata->pfs_clid = pfs->pfs_clid;
                nipdata->pfs_fsid = pfs->pfs_fsid;
-               hammer2_inode_unlock_ex(nip, nchain);
+               hammer2_inode_unlock_ex(nip);
        }
+       hammer2_trans_done(&trans);
        return (error);
 }
 
@@ -488,11 +482,14 @@ hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data)
 {
        hammer2_mount_t *hmp = ip->hmp;
        hammer2_ioc_pfs_t *pfs = data;
+       hammer2_trans_t trans;
        int error;
 
-       error = hammer2_unlink_file(hmp->sroot,
-                                   pfs->name, strlen(pfs->name),
-                                   0, NULL);
+       hammer2_trans_init(&trans, hmp);
+       error = hammer2_unlink_file(&trans, hmp->sroot,
+                                   pfs->name, strlen(pfs->name), 0);
+       hammer2_trans_done(&trans);
+
        return (error);
 }
 
@@ -503,12 +500,12 @@ static int
 hammer2_ioctl_inode_get(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_inode_t *ino = data;
-       hammer2_chain_t *chain;
 
-       chain = hammer2_inode_lock_sh(ip);
-       ino->ip_data = chain->data->ipdata;
+       hammer2_inode_lock_sh(ip);
+       ino->ip_data = ip->chain->data->ipdata;
        ino->kdata = ip;
-       hammer2_inode_unlock_sh(ip, chain);
+       hammer2_inode_unlock_sh(ip);
+
        return (0);
 }
 
@@ -516,16 +513,16 @@ static int
 hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_inode_t *ino = data;
-       hammer2_chain_t *chain;
        int error = EINVAL;
 
-       chain = hammer2_inode_lock_ex(ip);
+       hammer2_inode_lock_ex(ip);
        if (ino->flags & HAMMER2IOC_INODE_FLAG_IQUOTA) {
        }
        if (ino->flags & HAMMER2IOC_INODE_FLAG_DQUOTA) {
        }
        if (ino->flags & HAMMER2IOC_INODE_FLAG_COPIES) {
        }
-       hammer2_inode_unlock_ex(ip, chain);
+       hammer2_inode_unlock_ex(ip);
+
        return (error);
 }
index c92ba36..7d9cabd 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2012-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
index 0e3476c..c3c9cd3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -56,7 +56,7 @@
  * NOTE: We don't combine the inode/chain lock because putting away an
  *       inode would otherwise confuse multiple lock holders of the inode.
  */
-hammer2_chain_t *
+void
 hammer2_inode_lock_ex(hammer2_inode_t *ip)
 {
        hammer2_chain_t *chain;
@@ -66,24 +66,26 @@ hammer2_inode_lock_ex(hammer2_inode_t *ip)
 
        chain = ip->chain;
        KKASSERT(chain != NULL);        /* for now */
-       hammer2_chain_lock(ip->hmp, chain, HAMMER2_RESOLVE_ALWAYS);
-
-       return (chain);
+       hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
 }
 
 void
-hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_chain_t *chain)
+hammer2_inode_unlock_ex(hammer2_inode_t *ip)
 {
+       hammer2_chain_t *chain;
+
        /*
         * XXX this will catch parent directories too which we don't
         *     really want.
         */
-       if (ip->chain && (ip->chain->flags & (HAMMER2_CHAIN_MODIFIED |
-                                             HAMMER2_CHAIN_SUBMODIFIED))) {
-               atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
+       chain = ip->chain;
+       if (chain) {
+               if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
+                                   HAMMER2_CHAIN_SUBMODIFIED)) {
+                       atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
+               }
+               hammer2_chain_unlock(chain);
        }
-       if (chain)
-               hammer2_chain_unlock(ip->hmp, chain);
        ccms_thread_unlock(&ip->topo_cst);
        hammer2_inode_drop(ip);
 }
@@ -97,7 +99,7 @@ hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_chain_t *chain)
  *      need to upgrade them.  Only one count of a shared lock can be
  *      upgraded.
  */
-hammer2_chain_t *
+void
 hammer2_inode_lock_sh(hammer2_inode_t *ip)
 {
        hammer2_chain_t *chain;
@@ -107,16 +109,16 @@ hammer2_inode_lock_sh(hammer2_inode_t *ip)
 
        chain = ip->chain;
        KKASSERT(chain != NULL);        /* for now */
-       hammer2_chain_lock(ip->hmp, chain, HAMMER2_RESOLVE_ALWAYS |
-                                          HAMMER2_RESOLVE_SHARED);
-       return (chain);
+       hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS |
+                                 HAMMER2_RESOLVE_SHARED);
+
 }
 
 void
-hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_chain_t *chain)
+hammer2_inode_unlock_sh(hammer2_inode_t *ip)
 {
-       if (chain)
-               hammer2_chain_unlock(ip->hmp, chain);
+       if (ip->chain)
+               hammer2_chain_unlock(ip->chain);
        ccms_thread_unlock(&ip->topo_cst);
        hammer2_inode_drop(ip);
 }
@@ -146,19 +148,19 @@ hammer2_inode_lock_restore(hammer2_inode_t *ip, ccms_state_t ostate)
 void
 hammer2_mount_exlock(hammer2_mount_t *hmp)
 {
-       ccms_thread_lock(&hmp->vchain.cst, CCMS_STATE_EXCLUSIVE);
+       ccms_thread_lock(&hmp->vchain.core->cst, CCMS_STATE_EXCLUSIVE);
 }
 
 void
 hammer2_mount_shlock(hammer2_mount_t *hmp)
 {
-       ccms_thread_lock(&hmp->vchain.cst, CCMS_STATE_SHARED);
+       ccms_thread_lock(&hmp->vchain.core->cst, CCMS_STATE_SHARED);
 }
 
 void
 hammer2_mount_unlock(hammer2_mount_t *hmp)
 {
-       ccms_thread_unlock(&hmp->vchain.cst);
+       ccms_thread_unlock(&hmp->vchain.core->cst);
 }
 
 void
@@ -168,8 +170,13 @@ hammer2_voldata_lock(hammer2_mount_t *hmp)
 }
 
 void
-hammer2_voldata_unlock(hammer2_mount_t *hmp)
+hammer2_voldata_unlock(hammer2_mount_t *hmp, int modify)
 {
+       if (modify &&
+           (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
+               atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
+               hammer2_chain_ref(&hmp->vchain);
+       }
        lockmgr(&hmp->voldatalk, LK_RELEASE);
 }
 
index 54a4f7c..19086b5 100644 (file)
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2011, 2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
 #include "hammer2_disk.h"
 #include "hammer2_mount.h"
 
+#define REPORT_REFS_ERRORS 1   /* XXX remove me */
+
 struct hammer2_sync_info {
+       hammer2_trans_t trans;
        int error;
        int waitfor;
 };
@@ -378,18 +381,22 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 
        if (create_hmp) {
                /*
+                * Presetup
+                */
+               lockinit(&hmp->alloclk, "h2alloc", 0, 0);
+               lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
+
+               /*
                 * vchain setup. vchain.data is special cased to NULL.
                 * vchain.refs is initialized and will never drop to 0.
                 */
+               hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
-               hmp->vchain.bref_flush = hmp->vchain.bref;
-               ccms_cst_init(&hmp->vchain.cst, NULL);
+               hammer2_chain_core_alloc(&hmp->vchain, NULL);
                /* hmp->vchain.u.xxx is left NULL */
-               lockinit(&hmp->alloclk, "h2alloc", 0, 0);
-               lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
 
                /*
                 * Install the volume header
@@ -423,32 +430,31 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
         * represented by the label.
         */
        if (create_hmp) {
-               parent = &hmp->vchain;
-               hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
-               schain = hammer2_chain_lookup(hmp, &parent,
+               parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
+               schain = hammer2_chain_lookup(&parent,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY, 0);
-               hammer2_chain_unlock(hmp, parent);
+               hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }
-               hammer2_chain_ref(hmp, schain); /* for hmp->schain */
-               hmp->schain = schain;           /* left locked */
+               hammer2_chain_ref(schain);      /* for hmp->schain */
+               hmp->schain = schain;           /* left locked for inode_get */
                hmp->sroot = hammer2_inode_get(hmp, NULL, NULL, schain);
-               hammer2_inode_ref(hmp->sroot);  /* for hmp->sroot */
-               hammer2_inode_unlock_ex(hmp->sroot, NULL);
+               hammer2_inode_ref(hmp->sroot);       /* for hmp->sroot */
+               hammer2_inode_unlock_ex(hmp->sroot); /* eats schain lock */
        } else {
                schain = hmp->schain;
-               hammer2_chain_lock(hmp, schain, HAMMER2_RESOLVE_ALWAYS);
        }
 
        /*
-        * schain left locked at this point, use as basis for PFS search.
+        * schain only has 1 ref now for its hmp->schain assignment.
+        * Setup for lookup (which will lock it).
         */
-       parent = schain;
+       parent = hammer2_chain_lookup_init(schain, 0);
        lhc = hammer2_dirhash(label, strlen(label));
-       rchain = hammer2_chain_lookup(hmp, &parent,
+       rchain = hammer2_chain_lookup(&parent,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                      0);
        while (rchain) {
@@ -456,18 +462,18 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                    strcmp(label, rchain->data->ipdata.filename) == 0) {
                        break;
                }
-               rchain = hammer2_chain_next(hmp, &parent, rchain,
+               rchain = hammer2_chain_next(&parent, rchain,
                                            lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                            0);
        }
-       hammer2_chain_unlock(hmp, parent);
+       hammer2_chain_lookup_done(parent);
        if (rchain == NULL) {
                kprintf("hammer2_mount: PFS label not found\n");
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EINVAL;
        }
        if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
-               hammer2_chain_unlock(hmp, rchain);
+               hammer2_chain_unlock(rchain);
                kprintf("hammer2_mount: PFS label already mounted!\n");
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EBUSY;
@@ -477,11 +483,11 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
        /*
         * NOTE: *_get() integrates chain's lock into the inode lock.
         */
-       hammer2_chain_ref(hmp, rchain);         /* for pmp->rchain */
+       hammer2_chain_ref(rchain);              /* for pmp->rchain */
        pmp->rchain = rchain;                   /* left held & unlocked */
        pmp->iroot = hammer2_inode_get(hmp, pmp, NULL, rchain);
        hammer2_inode_ref(pmp->iroot);          /* ref for pmp->iroot */
-       hammer2_inode_unlock_ex(pmp->iroot, rchain); /* iroot & its chain */
+       hammer2_inode_unlock_ex(pmp->iroot);    /* iroot & its chain */
 
        kprintf("iroot %p\n", pmp->iroot);
 
@@ -534,7 +540,6 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
 {
        hammer2_pfsmount_t *pmp;
        hammer2_mount_t *hmp;
-       hammer2_chain_t *chain;
        int flags;
        int error = 0;
        int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
@@ -569,16 +574,14 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
         */
        hammer2_voldata_lock(hmp);
        if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
-                                HAMMER2_CHAIN_MODIFIED_AUX |
                                 HAMMER2_CHAIN_SUBMODIFIED)) {
-               hammer2_voldata_unlock(hmp);
+               hammer2_voldata_unlock(hmp, 0);
                hammer2_vfs_sync(mp, MNT_WAIT);
        } else {
-               hammer2_voldata_unlock(hmp);
+               hammer2_voldata_unlock(hmp, 0);
        }
        if (hmp->pmp_count == 0) {
                if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
-                                        HAMMER2_CHAIN_MODIFIED_AUX |
                                         HAMMER2_CHAIN_SUBMODIFIED)) {
                        kprintf("hammer2_unmount: chains left over after "
                                "final sync\n");
@@ -592,17 +595,29 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
         * clean).
         */
        if (pmp->iroot) {
-               chain = hammer2_inode_lock_ex(pmp->iroot);
-               hammer2_inode_put(pmp->iroot, chain);
+               hammer2_inode_lock_ex(pmp->iroot);
+               hammer2_inode_put(pmp->iroot);
                /* lock destroyed by the put */
+#if REPORT_REFS_ERRORS
+               if (pmp->iroot->refs != 1)
+                       kprintf("PMP->IROOT %p REFS WRONG %d\n",
+                               pmp->iroot, pmp->iroot->refs);
+#else
                KKASSERT(pmp->iroot->refs == 1);
+#endif
                hammer2_inode_drop(pmp->iroot);     /* ref for pmp->iroot */
                pmp->iroot = NULL;
        }
        if (pmp->rchain) {
                atomic_clear_int(&pmp->rchain->flags, HAMMER2_CHAIN_MOUNTED);
+#if REPORT_REFS_ERRORS
+               if (pmp->rchain->refs != 1)
+                       kprintf("PMP->RCHAIN %p REFS WRONG %d\n",
+                               pmp->rchain, pmp->rchain->refs);
+#else
                KKASSERT(pmp->rchain->refs == 1);
-               hammer2_chain_drop(hmp, pmp->rchain);
+#endif
+               hammer2_chain_drop(pmp->rchain);
                pmp->rchain = NULL;
        }
        ccms_domain_uninit(&pmp->ccms_dom);
@@ -621,8 +636,14 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
                        hmp->sroot = NULL;
                }
                if (hmp->schain) {
+#if REPORT_REFS_ERRORS
+                       if (hmp->schain->refs != 1)
+                               kprintf("HMP->SCHAIN %p REFS WRONG %d\n",
+                                       hmp->schain, hmp->schain->refs);
+#else
                        KKASSERT(hmp->schain->refs == 1);
-                       hammer2_chain_drop(hmp, hmp->schain);
+#endif
+                       hammer2_chain_drop(hmp->schain);
                        hmp->schain = NULL;
                }
 
@@ -639,6 +660,13 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
        }
        hammer2_mount_unlock(hmp);
 
+       /*
+        * Final drop of the embedded volume root chain to clean up
+        * vchain.core.  The vchain structure is not flagged ALLOCATED,
+        * so the drop cleans it out but leaves the structure in place.
+        */
+       hammer2_chain_drop(&hmp->vchain);
+
        pmp->mp = NULL;
        pmp->hmp = NULL;
        mp->mnt_data = NULL;
@@ -670,7 +698,6 @@ int
 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
 {
        hammer2_pfsmount_t *pmp;
-       hammer2_chain_t *ichain;
        hammer2_mount_t *hmp;
        int error;
        struct vnode *vp;
@@ -682,9 +709,9 @@ hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
                *vpp = NULL;
                error = EINVAL;
        } else {
-               ichain = hammer2_inode_lock_sh(pmp->iroot);
+               hammer2_inode_lock_sh(pmp->iroot);
                vp = hammer2_igetv(pmp->iroot, &error);
-               hammer2_inode_unlock_sh(pmp->iroot, ichain);
+               hammer2_inode_unlock_sh(pmp->iroot);
                *vpp = vp;
                if (vp == NULL)
                        kprintf("vnodefail\n");
@@ -766,7 +793,6 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        hammer2_mount_t *hmp;
        int flags;
        int error;
-       int haswork;
        int i;
 
        hmp = MPTOHMP(mp);
@@ -775,6 +801,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        if (waitfor & MNT_LAZY)
                flags |= VMSC_ONEPASS;
 
+       hammer2_trans_init(&info.trans, hmp);
        info.error = 0;
        info.waitfor = MNT_NOWAIT;
        vmntvnodescan(mp, flags | VMSC_NOWAIT,
@@ -794,16 +821,13 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                /* XXX */
        }
 #endif
-       hammer2_chain_lock(hmp, &hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
+       hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
        if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
-                                HAMMER2_CHAIN_MODIFIED_AUX |
                                 HAMMER2_CHAIN_SUBMODIFIED)) {
-               hammer2_chain_flush(hmp, &hmp->vchain, 0);
-               haswork = 1;
-       } else {
-               haswork = 0;
+               hammer2_chain_flush(&info.trans, &hmp->vchain);
        }
-       hammer2_chain_unlock(hmp, &hmp->vchain);
+       hammer2_chain_unlock(&hmp->vchain);
+       hammer2_trans_done(&info.trans);
 
        error = 0;
 
@@ -823,7 +847,13 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
        vn_unlock(hmp->devvp);
 
-       if (error == 0 && haswork) {
+       /*
+        * The flush code sets CHAIN_VOLUMESYNC to indicate that the
+        * volume header needs synchronization via hmp->volsync.
+        *
+        * XXX synchronize the flag & data with only this flush XXX
+        */
+       if (error == 0 && (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
                struct buf *bp;
 
                /*
@@ -856,6 +886,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                        i, (intmax_t)hmp->volsync.volu_size);
                bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
                            HAMMER2_PBUFSIZE, 0, 0);
+               atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_VOLUMESYNC);
                bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
                bawrite(bp);
                hmp->volhdrno = i;
@@ -1033,7 +1064,6 @@ hammer2_install_volume_header(hammer2_mount_t *hmp)
 void
 hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
 {
-       hammer2_chain_t *chain;
        hammer2_inode_data_t *ipdata;
        size_t name_len;
 
@@ -1047,14 +1077,14 @@ hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
        /*
         * Setup LNK_CONN fields for autoinitiated state machine
         */
-       chain = hammer2_inode_lock_ex(pmp->iroot);
-       ipdata = &chain->data->ipdata;
+       hammer2_inode_lock_ex(pmp->iroot);
+       ipdata = &pmp->iroot->chain->data->ipdata;
        pmp->iocom.auto_lnk_conn.pfs_clid = ipdata->pfs_clid;
        pmp->iocom.auto_lnk_conn.pfs_fsid = ipdata->pfs_fsid;
        pmp->iocom.auto_lnk_conn.pfs_type = ipdata->pfs_type;
        pmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
        pmp->iocom.auto_lnk_conn.peer_type = pmp->hmp->voldata.peer_type;
-       hammer2_inode_unlock_ex(pmp->iroot, chain);
+       hammer2_inode_unlock_ex(pmp->iroot);
 
        /*
         * Filter adjustment.  Clients do not need visibility into other
@@ -1176,7 +1206,7 @@ hammer2_autodmsg(kdmsg_msg_t *msg)
                                continue;
                        hammer2_volconf_update(pmp, copyid);
                }
-               hammer2_voldata_unlock(hmp);
+               hammer2_voldata_unlock(hmp, 0);
        }
        if ((msg->any.head.cmd & DMSGF_DELETE) &&
            msg->state && (msg->state->txcmd & DMSGF_DELETE) == 0) {
index 2153544..cc1d6e1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
 
 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
                                int seqcount);
-static int hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
-                               struct uio *uio, int ioflag, int seqcount);
-static hammer2_off_t hammer2_assign_physical(hammer2_inode_t *ip,
-                               hammer2_key_t lbase, int lblksize, int *errorp);
-static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize);
-static void hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize);
+static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
+                               int ioflag, int seqcount);
+static hammer2_off_t hammer2_assign_physical(hammer2_trans_t *trans,
+                               hammer2_inode_t *ip,
+                               hammer2_key_t lbase, int lblksize,
+                               int *errorp);
+static void hammer2_extend_file(hammer2_trans_t *trans,
+                               hammer2_inode_t *ip, hammer2_key_t nsize);
+static void hammer2_truncate_file(hammer2_trans_t *trans,
+                               hammer2_inode_t *ip, hammer2_key_t nsize);
 
 static __inline
 void
@@ -73,8 +77,8 @@ static
 int
 hammer2_vop_inactive(struct vop_inactive_args *ap)
 {
-       hammer2_chain_t *chain;
        hammer2_inode_t *ip;
+       hammer2_trans_t trans;
        struct vnode *vp;
 #if 0
        struct hammer2_mount *hmp;
@@ -96,20 +100,23 @@ hammer2_vop_inactive(struct vop_inactive_args *ap)
         * the strategy code.  Simply mark the inode modified so it gets
         * picked up by our normal flush.
         */
-       chain = hammer2_inode_lock_ex(ip);
+       hammer2_inode_lock_ex(ip);
+       KKASSERT(ip->chain);
        if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
                atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
-               hammer2_chain_modify(ip->hmp, chain, 0);
+               hammer2_trans_init(&trans, ip->hmp);
+               hammer2_chain_modify(&trans, ip->chain, 0);
+               hammer2_trans_done(&trans);
        }
 
        /*
         * Check for deleted inodes and recycle immediately.
         */
-       if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
-               hammer2_inode_unlock_ex(ip, chain);
+       if (ip->chain->flags & HAMMER2_CHAIN_DELETED) {
+               hammer2_inode_unlock_ex(ip);
                vrecycle(vp);
        } else {
-               hammer2_inode_unlock_ex(ip, chain);
+               hammer2_inode_unlock_ex(ip);
        }
        return (0);
 }
@@ -125,6 +132,7 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
        hammer2_chain_t *chain;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
+       hammer2_trans_t trans;
        struct vnode *vp;
 
        vp = ap->a_vp;
@@ -137,7 +145,8 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
         * Set SUBMODIFIED so we can detect and propagate the DESTROYED
         * bit in the flush code.
         */
-       chain = hammer2_inode_lock_ex(ip);
+       hammer2_inode_lock_ex(ip);
+       chain = ip->chain;
        vp->v_data = NULL;
        ip->vp = NULL;
        if (chain->flags & HAMMER2_CHAIN_DELETED) {
@@ -145,13 +154,19 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }
-       hammer2_chain_flush(hmp, chain, 0);
-       kprintf("vop_reclaim vp %p ip %p refs %d\n",
-               vp, ip, ip->refs);
-       if (ip->refs > 2)                       /* (our lock + vp ref) */
-               hammer2_inode_unlock_ex(ip, chain); /* unlock */
+       if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
+                           HAMMER2_CHAIN_DELETED |
+                           HAMMER2_CHAIN_SUBMODIFIED)) {
+               hammer2_trans_init(&trans, ip->hmp);
+               hammer2_chain_flush(&trans, chain);
+               hammer2_trans_done(&trans);
+       }
+       if (ip->refs > 2)                           /* (our lock + vp ref) */
+               hammer2_inode_unlock_ex(ip);        /* unlock */
        else
-               hammer2_inode_put(ip, chain);       /* unlock & disconnect */
+               hammer2_inode_put(ip);              /* unlock & disconnect */
+       /* chain no longer referenced */
+       /* chain = NULL; not needed */
        hammer2_inode_drop(ip);                     /* vp ref */
 
        /*
@@ -167,16 +182,16 @@ static
 int
 hammer2_vop_fsync(struct vop_fsync_args *ap)
 {
-       hammer2_chain_t *chain;
        hammer2_inode_t *ip;
-       hammer2_mount_t *hmp;
+       hammer2_trans_t trans;
        struct vnode *vp;
 
        vp = ap->a_vp;
        ip = VTOI(vp);
-       hmp = ip->hmp;
 
-       chain = hammer2_inode_lock_ex(ip);
+       hammer2_trans_init(&trans, ip->hmp);
+       hammer2_inode_lock_ex(ip);
+
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
 
        /*
@@ -186,7 +201,7 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
         */
        if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
                atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
-               hammer2_chain_modify(hmp, chain, 0);
+               hammer2_chain_modify(&trans, ip->chain, 0);
        }
 
        /*
@@ -199,8 +214,9 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
         */
        atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        if (ap->a_flags & VOP_FSYNC_SYSCALL)
-               hammer2_chain_flush(hmp, chain, 0);
-       hammer2_inode_unlock_ex(ip, chain);
+               hammer2_chain_flush(&trans, ip->chain);
+       hammer2_inode_unlock_ex(ip);
+       hammer2_trans_done(&trans);
        return (0);
 }
 
@@ -209,18 +225,17 @@ int
 hammer2_vop_access(struct vop_access_args *ap)
 {
        hammer2_inode_t *ip = VTOI(ap->a_vp);
-       hammer2_chain_t *chain;
        hammer2_inode_data_t *ipdata;
        uid_t uid;
        gid_t gid;
        int error;
 
-       chain = hammer2_inode_lock_sh(ip);
-       ipdata = &chain->data->ipdata;
+       hammer2_inode_lock_sh(ip);
+       ipdata = &ip->chain->data->ipdata;
        uid = hammer2_to_unix_xid(&ipdata->uid);
        gid = hammer2_to_unix_xid(&ipdata->gid);
        error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
-       hammer2_inode_unlock_sh(ip, chain);
+       hammer2_inode_unlock_sh(ip);
 
        return (error);
 }
@@ -232,7 +247,6 @@ hammer2_vop_getattr(struct vop_getattr_args *ap)
        hammer2_inode_data_t *ipdata;
        hammer2_pfsmount_t *pmp;
        hammer2_inode_t *ip;
-       hammer2_chain_t *chain;
        struct vnode *vp;
        struct vattr *vap;
 
@@ -242,8 +256,8 @@ hammer2_vop_getattr(struct vop_getattr_args *ap)
        ip = VTOI(vp);
        pmp = ip->pmp;
 
-       chain = hammer2_inode_lock_sh(ip);
-       ipdata = &chain->data->ipdata;
+       hammer2_inode_lock_sh(ip);
+       ipdata = &ip->chain->data->ipdata;
 
        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ipdata->inum;
@@ -261,14 +275,14 @@ hammer2_vop_getattr(struct vop_getattr_args *ap)
        hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = vap->va_size;   /* XXX */
-       vap->va_type = hammer2_get_vtype(chain);
+       vap->va_type = hammer2_get_vtype(ip->chain);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ipdata->uid;
        vap->va_gid_uuid = ipdata->gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;
 
-       hammer2_inode_unlock_sh(ip, chain);
+       hammer2_inode_unlock_sh(ip);
 
        return (0);
 }
@@ -278,9 +292,9 @@ int
 hammer2_vop_setattr(struct vop_setattr_args *ap)
 {
        hammer2_inode_data_t *ipdata;
-       hammer2_chain_t *chain;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
+       hammer2_trans_t trans;
        struct vnode *vp;
        struct vattr *vap;
        int error;
@@ -298,8 +312,9 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
        if (hmp->ronly)
                return(EROFS);
 
-       chain = hammer2_inode_lock_ex(ip);
-       ipdata = &chain->data->ipdata;
+       hammer2_trans_init(&trans, hmp);
+       hammer2_inode_lock_ex(ip);
+       ipdata = &ip->chain->data->ipdata;
        error = 0;
 
        if (vap->va_flags != VNOVAL) {
@@ -311,7 +326,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                                         ap->a_cred);
                if (error == 0) {
                        if (ipdata->uflags != flags) {
-                               hammer2_chain_modify(hmp, chain, 0);
+                               hammer2_chain_modify(&trans, ip->chain, 0);
                                ipdata->uflags = flags;
                                ipdata->ctime = ctime;
                                kflags |= NOTE_ATTRIB;
@@ -344,7 +359,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                            bcmp(&uuid_gid, &ipdata->gid, sizeof(uuid_gid)) ||
                            ipdata->mode != cur_mode
                        ) {
-                               hammer2_chain_modify(hmp, chain, 0);
+                               hammer2_chain_modify(&trans, ip->chain, 0);
                                ipdata->uid = uuid_uid;
                                ipdata->gid = uuid_gid;
                                ipdata->mode = cur_mode;
@@ -363,9 +378,9 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                        if (vap->va_size == ipdata->size)
                                break;
                        if (vap->va_size < ipdata->size) {
-                               hammer2_truncate_file(ip, vap->va_size);
+                               hammer2_truncate_file(&trans, ip, vap->va_size);
                        } else {
-                               hammer2_extend_file(ip, vap->va_size);
+                               hammer2_extend_file(&trans, ip, vap->va_size);
                        }
                        domtime = 1;
                        break;
@@ -377,13 +392,13 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
 #if 0
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
-               hammer2_chain_modify(hmp, chain, 0);
+               hammer2_chain_modify(&trans, ip->chain, 0);
                ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
        }
 #endif
        if (vap->va_mtime.tv_sec != VNOVAL) {
-               hammer2_chain_modify(hmp, chain, 0);
+               hammer2_chain_modify(&trans, ip->chain, 0);
                ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
        }
@@ -395,14 +410,15 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ipdata->mode != cur_mode) {
-                       hammer2_chain_modify(hmp, chain, 0);
+                       hammer2_chain_modify(&trans, ip->chain, 0);
                        ipdata->mode = cur_mode;
                        ipdata->ctime = ctime;
                        kflags |= NOTE_ATTRIB;
                }
        }
 done:
-       hammer2_inode_unlock_ex(ip, chain);
+       hammer2_inode_unlock_ex(ip);
+       hammer2_trans_done(&trans);
        return (error);
 }
 
@@ -415,7 +431,6 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        hammer2_inode_t *ip;
        hammer2_inode_t *xip;
        hammer2_chain_t *parent;
-       hammer2_chain_t *xparent;
        hammer2_chain_t *chain;
        hammer2_tid_t inum;
        hammer2_key_t lkey;
@@ -447,8 +462,8 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        }
        cookie_index = 0;
 
-       parent = hammer2_inode_lock_sh(ip);
-       ipdata = &parent->data->ipdata;
+       hammer2_inode_lock_sh(ip);
+       ipdata = &ip->chain->data->ipdata;
 
        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
@@ -479,23 +494,23 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                /*
                 * Be careful with lockorder when accessing ".."
                 *
-                * (parent is the current dir. xip is the parent dir).
+                * (ip is the current dir. xip is the parent dir).
                 */
-               inum = parent->data->ipdata.inum & HAMMER2_DIRHASH_USERMSK;
+               inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
                while (ip->pip != NULL && ip != ip->pmp->iroot) {
                        xip = ip->pip;
                        hammer2_inode_ref(xip);
-                       hammer2_inode_unlock_sh(ip, parent);
-                       xparent = hammer2_inode_lock_sh(xip);
-                       parent = hammer2_inode_lock_sh(ip);
+                       hammer2_inode_unlock_sh(ip);
+                       hammer2_inode_lock_sh(xip);
+                       hammer2_inode_lock_sh(ip);
                        hammer2_inode_drop(xip);
                        if (xip == ip->pip) {
-                               inum = xparent->data->ipdata.inum &
+                               inum = xip->chain->data->ipdata.inum &
                                       HAMMER2_DIRHASH_USERMSK;
-                               hammer2_inode_unlock_sh(xip, xparent);
+                               hammer2_inode_unlock_sh(xip);
                                break;
                        }
-                       hammer2_inode_unlock_sh(xip, xparent);
+                       hammer2_inode_unlock_sh(xip);
                }
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
@@ -517,10 +532,11 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        if (error) {
                goto done;
        }
-       chain = hammer2_chain_lookup(hmp, &parent, lkey, lkey,
+       parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
+       chain = hammer2_chain_lookup(&parent, lkey, lkey,
                                     HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
-               chain = hammer2_chain_lookup(hmp, &parent,
+               chain = hammer2_chain_lookup(&parent,
                                             lkey, (hammer2_key_t)-1,
                                             HAMMER2_LOOKUP_SHARED);
        }
@@ -550,7 +566,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                 * placemarker (chain) the scan must allow the full range
                 * or some entries will be missed.
                 */
-               chain = hammer2_chain_next(hmp, &parent, chain,
+               chain = hammer2_chain_next(&parent, chain,
                                           HAMMER2_DIRHASH_VISIBLE,
                                           (hammer2_key_t)-1,
                                           HAMMER2_LOOKUP_SHARED);
@@ -564,9 +580,10 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                        break;
        }
        if (chain)
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
+       hammer2_chain_lookup_done(parent);
 done:
-       hammer2_inode_unlock_sh(ip, parent);
+       hammer2_inode_unlock_sh(ip);
        if (ap->a_eofflag)
                *ap->a_eofflag = (chain == NULL);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
@@ -645,7 +662,6 @@ static
 int
 hammer2_vop_write(struct vop_write_args *ap)
 {
-       hammer2_chain_t *chain;
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        thread_t td;
@@ -694,9 +710,9 @@ hammer2_vop_write(struct vop_write_args *ap)
         * ip must be marked modified, particularly because the write
         * might wind up being copied into the embedded data area.
         */
-       chain = hammer2_inode_lock_ex(ip);
-       error = hammer2_write_file(ip, &chain, uio, ap->a_ioflag, seqcount);
-       hammer2_inode_unlock_ex(ip, chain);
+       hammer2_inode_lock_ex(ip);
+       error = hammer2_write_file(ip, uio, ap->a_ioflag, seqcount);
+       hammer2_inode_unlock_ex(ip);
        return (error);
 }
 
@@ -710,7 +726,6 @@ static
 int
 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
 {
-       hammer2_chain_t *chain;
        hammer2_off_t size;
        struct buf *bp;
        int error;
@@ -719,19 +734,9 @@ hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
 
        /*
         * UIO read loop.
-        *
-        * We can't hold a shared lock on ip's chain across file bread's
-        * because the bread operation will itself obtain a shared lock,
-        * resulting in one thread holding 2 shared refs.  This will deadlock
-        * against temporary lock upgrades.  Temporary lock upgrades are
-        * needed to insert new chain structures into a parent's RB tree.
-        *
-        * We should be able to safely retain the shared lock on ip itself.
         */
-       chain = hammer2_inode_lock_sh(ip);
-       size = chain->data->ipdata.size;
-       hammer2_chain_unlock(ip->hmp, chain);
-       chain = NULL;
+       hammer2_inode_lock_sh(ip);
+       size = ip->chain->data->ipdata.size;
 
        while (uio->uio_resid > 0 && uio->uio_offset < size) {
                hammer2_key_t lbase;
@@ -759,7 +764,7 @@ hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
                uiomove((char *)bp->b_data + loff, n, uio);
                bqrelse(bp);
        }
-       hammer2_inode_unlock_sh(ip, chain);
+       hammer2_inode_unlock_sh(ip);
        return (error);
 }
 
@@ -769,10 +774,10 @@ hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
  */
 static
 int
-hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
-                  struct uio *uio,
+hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                   int ioflag, int seqcount)
 {
+       hammer2_trans_t trans;
        hammer2_inode_data_t *ipdata;
        hammer2_key_t old_eof;
        struct buf *bp;
@@ -789,6 +794,8 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
        kflags = 0;
        error = 0;
 
+       hammer2_trans_init(&trans, ip->hmp);
+
        /*
         * Extend the file if necessary.  If the write fails at some point
         * we will truncate it back down to cover as much as we were able
@@ -800,7 +807,8 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
        old_eof = ipdata->size;
        if (uio->uio_offset + uio->uio_resid > ipdata->size) {
                modified = 1;
-               hammer2_extend_file(ip, uio->uio_offset + uio->uio_resid);
+               hammer2_extend_file(&trans, ip,
+                                   uio->uio_offset + uio->uio_resid);
                kflags |= NOTE_EXTEND;
        }
 
@@ -824,10 +832,10 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
                         * XXX should try to leave this unlocked through
                         *      the whole loop
                         */
-                       hammer2_inode_unlock_ex(ip, *chainp);
+                       hammer2_inode_unlock_ex(ip);
                        bwillwrite(HAMMER2_PBUFSIZE);
-                       *chainp = hammer2_inode_lock_ex(ip);
-                       ipdata = &(*chainp)->data->ipdata;      /* reload */
+                       hammer2_inode_lock_ex(ip);
+                       ipdata = &ip->chain->data->ipdata;      /* reload */
                }
 
                /* XXX bigwrite & signal check test */
@@ -908,7 +916,8 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
                 * strategy code will take care of it in that case.
                 */
                bp->b_bio2.bio_offset =
-                       hammer2_assign_physical(ip, lbase, lblksize, &error);
+                       hammer2_assign_physical(&trans, ip,
+                                               lbase, lblksize, &error);
                if (error) {
                        brelse(bp);
                        break;
@@ -917,10 +926,10 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
                /*
                 * Ok, copy the data in
                 */
-               hammer2_inode_unlock_ex(ip, *chainp);
+               hammer2_inode_unlock_ex(ip);
                error = uiomove(bp->b_data + loff, n, uio);
-               *chainp = hammer2_inode_lock_ex(ip);
-               ipdata = &(*chainp)->data->ipdata;      /* reload */
+               hammer2_inode_lock_ex(ip);
+               ipdata = &ip->chain->data->ipdata;      /* reload */
                kflags |= NOTE_WRITE;
                modified = 1;
 
@@ -963,13 +972,13 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
         * the entire write is a failure and we have to back-up.
         */
        if (error && ipdata->size != old_eof) {
-               hammer2_truncate_file(ip, old_eof);
+               hammer2_truncate_file(&trans, ip, old_eof);
        } else if (modified) {
-               KKASSERT(ip->chain == *chainp);
-               hammer2_chain_modify(ip->hmp, *chainp, 0);
+               hammer2_chain_modify(&trans, ip->chain, 0);
                hammer2_update_time(&ipdata->mtime);
        }
        hammer2_knote(ip->vp, kflags);
+       hammer2_trans_done(&trans);
        return error;
 }
 
@@ -983,8 +992,8 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
  */
 static
 hammer2_off_t
-hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
-                       int lblksize, int *errorp)
+hammer2_assign_physical(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                       hammer2_key_t lbase, int lblksize, int *errorp)
 {
        hammer2_mount_t *hmp;
        hammer2_chain_t *parent;
@@ -1000,8 +1009,9 @@ hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
        hmp = ip->hmp;
        *errorp = 0;
 retry:
-       parent = hammer2_inode_lock_ex(ip);
-       chain = hammer2_chain_lookup(hmp, &parent,
+       hammer2_inode_lock_ex(ip);
+       parent = hammer2_chain_lookup_init(ip->chain, 0);
+       chain = hammer2_chain_lookup(&parent,
                                     lbase, lbase,
                                     HAMMER2_LOOKUP_NODATA);
 
@@ -1012,13 +1022,15 @@ retry:
                 * NOTE: DATA chains are created without device backing
                 *       store (nor do we want any).
                 */
-               chain = hammer2_chain_create(hmp, parent, NULL,
-                                            lbase, HAMMER2_PBUFRADIX,
-                                            HAMMER2_BREF_TYPE_DATA,
-                                            lblksize, errorp);
+               *errorp = hammer2_chain_create(trans, parent, &chain,
+                                              lbase, HAMMER2_PBUFRADIX,
+                                              HAMMER2_BREF_TYPE_DATA,
+                                              lblksize);
                if (chain == NULL) {
-                       KKASSERT(*errorp == EAGAIN); /* XXX */
-                       hammer2_inode_unlock_ex(ip, parent);
+                       hammer2_inode_unlock_ex(ip);
+                       hammer2_chain_lookup_done(parent);
+                       panic("hammer2_chain_create: par=%p error=%d\n",
+                               parent, *errorp);
                        goto retry;
                }
 
@@ -1041,7 +1053,7 @@ retry:
                                      "size mismatch %d/%d\n",
                                      lblksize, chain->bytes);
                        }
-                       hammer2_chain_modify(hmp, chain,
+                       hammer2_chain_modify(trans, chain,
                                             HAMMER2_MODIFY_OPTDATA);
                        pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
                        break;
@@ -1052,10 +1064,11 @@ retry:
                        break;
                }
        }
-
        if (chain)
-               hammer2_chain_unlock(hmp, chain);
-       hammer2_inode_unlock_ex(ip, parent);
+               hammer2_chain_unlock(chain);
+       hammer2_chain_lookup_done(parent);
+
+       hammer2_inode_unlock_ex(ip);
 
        return (pbase);
 }
@@ -1071,12 +1084,12 @@ retry:
  */
 static
 void
-hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
+hammer2_truncate_file(hammer2_trans_t *trans,
+                     hammer2_inode_t *ip, hammer2_key_t nsize)
 {
        hammer2_inode_data_t *ipdata;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
-       hammer2_mount_t *hmp = ip->hmp;
        hammer2_key_t lbase;
        hammer2_key_t leof;
        struct buf *bp;
@@ -1085,9 +1098,10 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        int oblksize;
        int nblksize;
 
-       hammer2_chain_modify(hmp, ip->chain, 0);
+       hammer2_chain_modify(trans, ip->chain, 0);
        bp = NULL;
        ipdata = &ip->chain->data->ipdata;
+       error = 0;
 
        /*
         * Destroy any logical buffer cache buffers beyond the file EOF.
@@ -1105,13 +1119,7 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        /*
         * Setup for lookup/search
         */
-       parent = ip->chain;
-       error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
-       if (error) {
-               hammer2_chain_unlock(hmp, parent);
-               /* XXX error reporting */
-               return;
-       }
+       parent = hammer2_chain_lookup_init(ip->chain, 0);
 
        /*
         * Handle the case where a chain/logical-buffer straddles the new
@@ -1133,20 +1141,22 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
         * we don't want to create a conflicting device buffer.
         */
        if (loff && bp) {
-               chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
+               chain = hammer2_chain_lookup(&parent, lbase, lbase,
                                             HAMMER2_LOOKUP_NODATA);
                if (chain) {
-                       allocbuf(bp, nblksize);
                        switch(chain->bref.type) {
                        case HAMMER2_BREF_TYPE_DATA:
-                               hammer2_chain_resize(ip, chain,
+                               hammer2_chain_resize(trans, ip, bp,
+                                            parent, &chain,
                                             hammer2_allocsize(nblksize),
                                             HAMMER2_MODIFY_OPTDATA);
+                               allocbuf(bp, nblksize);
                                bzero(bp->b_data + loff, nblksize - loff);
                                bp->b_bio2.bio_offset = chain->bref.data_off &
                                                        HAMMER2_OFF_MASK;
                                break;
                        case HAMMER2_BREF_TYPE_INODE:
+                               allocbuf(bp, nblksize);
                                bzero(bp->b_data + loff, nblksize - loff);
                                bp->b_bio2.bio_offset = NOOFFSET;
                                break;
@@ -1154,7 +1164,7 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                                panic("hammer2_truncate_file: bad type");
                                break;
                        }
-                       hammer2_chain_unlock(hmp, chain);
+                       hammer2_chain_unlock(chain);
                        if (bp->b_bcount == HAMMER2_PBUFSIZE)
                                bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
@@ -1178,11 +1188,12 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                 */
                panic("hammer2_truncate_file: non-zero truncation, no-vnode");
 #if 0
-               chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase, 0);
+               chain = hammer2_chain_lookup(&parent, lbase, lbase, 0);
                if (chain) {
                        switch(chain->bref.type) {
                        case HAMMER2_BREF_TYPE_DATA:
-                               hammer2_chain_resize(ip, chain,
+                               chain = hammer2_chain_resize(trans, ip, bp,
+                                            parent, chain,
                                             hammer2_allocsize(nblksize),
                                             0);
                                hammer2_chain_modify(hmp, chain, 0);
@@ -1196,7 +1207,7 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                                }
                                break;
                        }
-                       hammer2_chain_unlock(hmp, chain);
+                       hammer2_chain_unlock(chain);
                }
 #endif
        }
@@ -1216,7 +1227,7 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
         * Destroy any physical blocks after the new EOF point.
         */
        lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
-       chain = hammer2_chain_lookup(hmp, &parent,
+       chain = hammer2_chain_lookup(&parent,
                                     lbase, (hammer2_key_t)-1,
                                     HAMMER2_LOOKUP_NODATA);
        while (chain) {
@@ -1224,7 +1235,7 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                 * Degenerate embedded data case, nothing to loop on.
                 */
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
-                       hammer2_chain_unlock(hmp, chain);
+                       hammer2_chain_unlock(chain);
                        break;
                }
 
@@ -1233,14 +1244,14 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                 */
                if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                        /*ip->delta_dcount -= chain->bytes;*/
-                       hammer2_chain_delete(hmp, parent, chain, 0);
+                       hammer2_chain_delete(trans, parent, chain);
                }
                /* XXX check parent if empty indirect block & delete */
-               chain = hammer2_chain_next(hmp, &parent, chain,
+               chain = hammer2_chain_next(&parent, chain,
                                           lbase, (hammer2_key_t)-1,
                                           HAMMER2_LOOKUP_NODATA);
        }
-       hammer2_chain_unlock(hmp, parent);
+       hammer2_chain_lookup_done(parent);
 }
 
 /*
@@ -1250,7 +1261,8 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
  */
 static
 void
-hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
+hammer2_extend_file(hammer2_trans_t *trans,
+                   hammer2_inode_t *ip, hammer2_key_t nsize)
 {
        hammer2_inode_data_t *ipdata;
        hammer2_mount_t *hmp;
@@ -1269,7 +1281,7 @@ hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        KKASSERT(ip->vp);
        hmp = ip->hmp;
 
-       hammer2_chain_modify(hmp, ip->chain, 0);
+       hammer2_chain_modify(trans, ip->chain, 0);
        ipdata = &ip->chain->data->ipdata;
 
        /*
@@ -1321,14 +1333,6 @@ hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        if (((int)osize & HAMMER2_PBUFMASK)) {
                error = bread(ip->vp, obase, oblksize, &bp);
                KKASSERT(error == 0);
-
-               if (obase != nbase) {
-                       if (oblksize != HAMMER2_PBUFSIZE)
-                               allocbuf(bp, HAMMER2_PBUFSIZE);
-               } else {
-                       if (oblksize != nblksize)
-                               allocbuf(bp, nblksize);
-               }
        }
 
        /*
@@ -1346,38 +1350,46 @@ hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
         */
        if (((int)osize & HAMMER2_PBUFMASK)) {
 retry:
-               parent = ip->chain;
-               error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
-               KKASSERT(error == 0);
-
+               error = 0;
+               parent = hammer2_chain_lookup_init(ip->chain, 0);
                nradix = hammer2_allocsize(nblksize);
 
-               chain = hammer2_chain_lookup(hmp, &parent,
+               chain = hammer2_chain_lookup(&parent,
                                             obase, obase,
                                             HAMMER2_LOOKUP_NODATA);
                if (chain == NULL) {
-                       chain = hammer2_chain_create(hmp, parent, NULL,
+                       error = hammer2_chain_create(trans, parent, &chain,
                                                     obase, nblksize,
                                                     HAMMER2_BREF_TYPE_DATA,
-                                                    nblksize, &error);
+                                                    nblksize);
                        if (chain == NULL) {
-                               KKASSERT(error == EAGAIN);
-                               hammer2_chain_unlock(hmp, parent);
+                               hammer2_chain_lookup_done(parent);
+                               panic("hammer2_chain_create: par=%p error=%d\n",
+                                       parent, error);
                                goto retry;
                        }
                        /*ip->delta_dcount += nblksize;*/
                } else {
                        KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
-                       hammer2_chain_resize(ip, chain, nradix,
+                       hammer2_chain_resize(trans, ip, bp,
+                                            parent, &chain,
+                                            nradix,
                                             HAMMER2_MODIFY_OPTDATA);
                }
+               if (obase != nbase) {
+                       if (oblksize != HAMMER2_PBUFSIZE)
+                               allocbuf(bp, HAMMER2_PBUFSIZE);
+               } else {
+                       if (oblksize != nblksize)
+                               allocbuf(bp, nblksize);
+               }
                bp->b_bio2.bio_offset = chain->bref.data_off &
                                        HAMMER2_OFF_MASK;
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_chain_unlock(chain);
                if (bp->b_bcount == HAMMER2_PBUFSIZE)
                        bp->b_flags |= B_CLUSTEROK;
                bdwrite(bp);
-               hammer2_chain_unlock(hmp, parent);
+               hammer2_chain_lookup_done(parent);  /* must be after bdwrite */
        }
 }
 
@@ -1391,6 +1403,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_chain_t *ochain;
+       hammer2_trans_t trans;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
@@ -1408,8 +1421,9 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
        /*
         * Note: In DragonFly the kernel handles '.' and '..'.
         */
-       parent = hammer2_inode_lock_sh(dip);
-       chain = hammer2_chain_lookup(hmp, &parent,
+       hammer2_inode_lock_sh(dip);
+       parent = hammer2_chain_lookup_init(dip->chain, HAMMER2_LOOKUP_SHARED);
+       chain = hammer2_chain_lookup(&parent,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     HAMMER2_LOOKUP_SHARED);
        while (chain) {
@@ -1418,11 +1432,12 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                    bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
                        break;
                }
-               chain = hammer2_chain_next(hmp, &parent, chain,
+               chain = hammer2_chain_next(&parent, chain,
                                           lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                           HAMMER2_LOOKUP_SHARED);
        }
-       hammer2_inode_unlock_sh(dip, parent);
+       hammer2_chain_lookup_done(parent);
+       hammer2_inode_unlock_sh(dip);
 
        /*
         * If the inode represents a forwarding entry for a hardlink we have
@@ -1440,7 +1455,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                if (error) {
                        kprintf("hammer2: unable to find hardlink\n");
                        if (chain) {
-                               hammer2_chain_unlock(hmp, chain);
+                               hammer2_chain_unlock(chain);
                                chain = NULL;
                        }
                        goto failed;
@@ -1457,7 +1472,9 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                kprintf("hammer2: need to unconsolidate hardlink for %s\n",
                        chain->data->ipdata.filename);
                /* XXX retain shared lock on dip? (currently not held) */
-               hammer2_hardlink_deconsolidate(dip, &chain, &ochain);
+               hammer2_trans_init(&trans, dip->hmp);
+               hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain);
+               hammer2_trans_done(&trans);
        }
 
        /*
@@ -1471,9 +1488,13 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
         *       same chain element, for example for hardlinks.  This
         *       use case does not 'reattach' inode associations that
         *       might already exist, but always allocates a new one.
+        *
+        * WARNING: inode structure is locked exclusively via inode_get
+        *          but chain was locked shared.  inode_unlock_ex()
+        *          will handle it properly.
         */
        if (chain) {
-               ip = hammer2_inode_get(dip->hmp, dip->pmp, dip, chain);
+               ip = hammer2_inode_get(hmp, dip->pmp, dip, chain);
                vp = hammer2_igetv(ip, &error);
                if (error == 0) {
                        vn_unlock(vp);
@@ -1481,13 +1502,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                } else if (error == ENOENT) {
                        cache_setvp(ap->a_nch, NULL);
                }
-               /*
-                * don't break the API, chain is locked shared so unlock
-                * it separately even though unlock_ex() currently doesn't
-                * care.
-                */
-               hammer2_inode_unlock_ex(ip, NULL);
-               hammer2_chain_unlock(hmp, chain);
+               hammer2_inode_unlock_ex(ip);
 
                /*
                 * The vp should not be released until after we've disposed
@@ -1505,7 +1520,7 @@ failed:
                ("resolve error %d/%p chain %p ap %p\n",
                 error, ap->a_nch->ncp->nc_vp, chain, ap));
        if (ochain)
-               hammer2_chain_drop(hmp, ochain);
+               hammer2_chain_drop(ochain);
        return error;
 }
 
@@ -1513,7 +1528,6 @@ static
 int
 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 {
-       hammer2_chain_t *chain;
        hammer2_inode_t *dip;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
@@ -1526,9 +1540,9 @@ hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
                *ap->a_vpp = NULL;
                return ENOENT;
        }
-       chain = hammer2_inode_lock_ex(ip);
+       hammer2_inode_lock_ex(ip);
        *ap->a_vpp = hammer2_igetv(ip, &error);
-       hammer2_inode_unlock_ex(ip, chain);
+       hammer2_inode_unlock_ex(ip);
 
        return error;
 }
@@ -1537,10 +1551,10 @@ static
 int
 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
 {
-       hammer2_chain_t *nchain;
        hammer2_mount_t *hmp;
        hammer2_inode_t *dip;
        hammer2_inode_t *nip;
+       hammer2_trans_t trans;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
@@ -1555,15 +1569,17 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
 
-       error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
-                                    name, name_len, &nip, &nchain);
+       hammer2_trans_init(&trans, hmp);
+       nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
+                                  name, name_len, &error);
        if (error) {
                KKASSERT(nip == NULL);
                *ap->a_vpp = NULL;
-               return error;
+       } else {
+               *ap->a_vpp = hammer2_igetv(nip, &error);
+               hammer2_inode_unlock_ex(nip);
        }
-       *ap->a_vpp = hammer2_igetv(nip, &error);
-       hammer2_inode_unlock_ex(nip, nchain);
+       hammer2_trans_done(&trans);
 
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
@@ -1621,14 +1637,16 @@ hammer2_vop_bmap(struct vop_bmap_args *ap)
                lend = lbeg;
        loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
 
-       parent = hammer2_inode_lock_sh(ip);
-       chain = hammer2_chain_lookup(hmp, &parent,
+       hammer2_inode_lock_sh(ip);
+       parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
+       chain = hammer2_chain_lookup(&parent,
                                     lbeg, lend,
                                     HAMMER2_LOOKUP_NODATA |
                                     HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
                *ap->a_doffsetp = ZFOFFSET;
-               hammer2_inode_unlock_sh(ip, parent);
+               hammer2_chain_lookup_done(parent);
+               hammer2_inode_unlock_sh(ip);
                return (0);
        }
 
@@ -1639,12 +1657,13 @@ hammer2_vop_bmap(struct vop_bmap_args *ap)
                        array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
                        array[ai][1] = chain->bytes;
                }
-               chain = hammer2_chain_next(hmp, &parent, chain,
+               chain = hammer2_chain_next(&parent, chain,
                                           lbeg, lend,
                                           HAMMER2_LOOKUP_NODATA |
                                           HAMMER2_LOOKUP_SHARED);
        }
-       hammer2_inode_unlock_sh(ip, parent);
+       hammer2_chain_lookup_done(parent);
+       hammer2_inode_unlock_sh(ip);
 
        /*
         * If the requested loffset is not mappable physically we can't
@@ -1689,12 +1708,11 @@ int
 hammer2_vop_advlock(struct vop_advlock_args *ap)
 {
        hammer2_inode_t *ip = VTOI(ap->a_vp);
-       hammer2_chain_t *chain;
        hammer2_off_t size;
 
-       chain = hammer2_inode_lock_sh(ip);
-       size = chain->data->ipdata.size;
-       hammer2_inode_unlock_sh(ip, chain);
+       hammer2_inode_lock_sh(ip);
+       size = ip->chain->data->ipdata.size;
+       hammer2_inode_unlock_sh(ip);
        return (lf_advlock(ap, &ip->advlock, size));
 }
 
@@ -1719,6 +1737,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
        hammer2_inode_t *ip;    /* inode we are hardlinking to */
        hammer2_mount_t *hmp;
        hammer2_chain_t *chain;
+       hammer2_trans_t trans;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
@@ -1732,6 +1751,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
+       hammer2_trans_init(&trans, hmp);
 
        /*
         * ip represents the file being hardlinked.  The file could be a
@@ -1746,7 +1766,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         */
        ip = VTOI(ap->a_vp);
        hammer2_inode_ref(ip);
-       error = hammer2_hardlink_consolidate(ip, &chain, dip, 1);
+       error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1);
        if (error)
                goto done;
 
@@ -1754,13 +1774,19 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         * Create a directory entry connected to the specified chain.
         * This function unlocks and NULL's chain on return.
         */
-       error = hammer2_inode_connect(dip, &chain, name, name_len);
+       error = hammer2_inode_connect(&trans, dip, ip, &chain, name, name_len);
+       if (chain) {
+               hammer2_chain_unlock(chain);
+               chain = NULL;
+       }
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
                cache_setvp(ap->a_nch, ap->a_vp);
        }
 done:
        hammer2_inode_drop(ip);
+       hammer2_trans_done(&trans);
+
        return error;
 }
 
@@ -1777,7 +1803,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
        hammer2_mount_t *hmp;
        hammer2_inode_t *dip;
        hammer2_inode_t *nip;
-       hammer2_chain_t *nchain;
+       hammer2_trans_t trans;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
@@ -1791,16 +1817,18 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
+       hammer2_trans_init(&trans, hmp);
 
-       error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
-                                    name, name_len, &nip, &nchain);
+       nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
+                                  name, name_len, &error);
        if (error) {
                KKASSERT(nip == NULL);
                *ap->a_vpp = NULL;
-               return error;
+       } else {
+               *ap->a_vpp = hammer2_igetv(nip, &error);
+               hammer2_inode_unlock_ex(nip);
        }
-       *ap->a_vpp = hammer2_igetv(nip, &error);
-       hammer2_inode_unlock_ex(nip, nchain);
+       hammer2_trans_done(&trans);
 
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
@@ -1819,7 +1847,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
        hammer2_mount_t *hmp;
        hammer2_inode_t *dip;