hammer2 - flush sequencing part 4 - stabilization and cleanup, flush sep
author Matthew Dillon <dillon@apollo.backplane.com>
Thu, 9 May 2013 00:24:23 +0000 (17:24 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Thu, 9 May 2013 00:24:23 +0000 (17:24 -0700)
* Fix bugs in the handling of DIRECTDATA.  The data for small
  files <= 512 bytes is stored in the inode itself using the
  area that would otherwise contain the indirect block table.

  Clean up the flag handling for this data, ensure synchronous bwrite()s
  for this data area (the strategy code for direct-data reads and writes
  doesn't actually do any I/O), and ensure proper read-before-write
  operation.

* Adjust hammer2_inode_create() and hammer2_chain_modify_ip() to take/return
  (*chainp).

* Change the inode locking APIs to return the locked chain (unlocking APIs
  take the locked chain as an argument).  This allows the chain to be
  manipulated by the code in between, including replacing it with other
  chains, instead of the mess we had before where ip->chain had an implied
  lock associated with it.

  With this change, replacing ip->chain is just a matter of adjusting
  ref counts and not also having to worry about locks.

* Modify the inode chain proactively instead of indirectly.

* Set HAMMER2_INODE_MODIFIED proactively instead of indirectly.  This flag
  is now only used to filter vfs_sync scans.

* Start working on flush transitions.  hammer2_chain_modify() and friends
  will now delete/duplicate an underlying chain that is already in the
  MODIFIED state (returning the new chain) for the case where the previous
  modifications are associated with a flush-in-progress and the current
  modifications are outside of that flush.

* Add hammer2_chain_delete_duplicate(), a function which combines the
  DELETE and duplication operation in one atomic op.  Otherwise a lookup
  or find can catch such operations in the middle and improperly believe
  that the element is fully deleted when it isn't.

* Fix chain refs in a few places.

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index 2bad2af..9f87def 100644 (file)
@@ -269,7 +269,7 @@ struct hammer2_inode {
 typedef struct hammer2_inode hammer2_inode_t;
 
 #define HAMMER2_INODE_MODIFIED         0x0001
-#define HAMMER2_INODE_DIRTYEMBED       0x0002
+#define HAMMER2_INODE_UNUSED0002       0x0002
 #define HAMMER2_INODE_RENAME_INPROG    0x0004
 #define HAMMER2_INODE_ONRBTREE         0x0008
 
@@ -453,10 +453,10 @@ extern long hammer2_ioa_volu_write;
 #define hammer2_icrc32(buf, size)      iscsi_crc32((buf), (size))
 #define hammer2_icrc32c(buf, size, crc)        iscsi_crc32_ext((buf), (size), (crc))
 
-void hammer2_inode_lock_ex(hammer2_inode_t *ip);
-void hammer2_inode_lock_sh(hammer2_inode_t *ip);
-void hammer2_inode_unlock_ex(hammer2_inode_t *ip);
-void hammer2_inode_unlock_sh(hammer2_inode_t *ip);
+hammer2_chain_t *hammer2_inode_lock_ex(hammer2_inode_t *ip);
+hammer2_chain_t *hammer2_inode_lock_sh(hammer2_inode_t *ip);
+void hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_chain_t *chain);
+void hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_chain_t *chain);
 void hammer2_voldata_lock(hammer2_mount_t *hmp);
 void hammer2_voldata_unlock(hammer2_mount_t *hmp, int modify);
 ccms_state_t hammer2_inode_lock_temp_release(hammer2_inode_t *ip);
@@ -495,7 +495,7 @@ hammer2_inode_t *hammer2_inode_lookup(hammer2_pfsmount_t *pmp,
 hammer2_inode_t *hammer2_inode_get(hammer2_mount_t *hmp,
                        hammer2_pfsmount_t *pmp, hammer2_inode_t *dip,
                        hammer2_chain_t *chain);
-void hammer2_inode_put(hammer2_inode_t *ip);
+void hammer2_inode_put(hammer2_inode_t *ip, hammer2_chain_t *chain);
 void hammer2_inode_free(hammer2_inode_t *ip);
 void hammer2_inode_ref(hammer2_inode_t *ip);
 void hammer2_inode_drop(hammer2_inode_t *ip);
@@ -507,7 +507,7 @@ hammer2_inode_t *hammer2_inode_create(hammer2_trans_t *trans,
                        hammer2_inode_t *dip,
                        struct vattr *vap, struct ucred *cred,
                        const uint8_t *name, size_t name_len,
-                       int *errorp);
+                       hammer2_chain_t **chainp, int *errorp);
 int hammer2_inode_connect(hammer2_trans_t *trans, int hlink,
                        hammer2_inode_t *dip, hammer2_chain_t **chainp,
                        const uint8_t *name, size_t name_len);
@@ -541,7 +541,8 @@ void hammer2_chain_moved(hammer2_chain_t *chain);
 void hammer2_chain_modify(hammer2_trans_t *trans,
                                hammer2_chain_t **chainp, int flags);
 hammer2_inode_data_t *hammer2_chain_modify_ip(hammer2_trans_t *trans,
-                               hammer2_inode_t *ip, int flags);
+                               hammer2_inode_t *ip, hammer2_chain_t **chainp,
+                               int flags);
 void hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                                struct buf *bp,
                                hammer2_chain_t *parent,
@@ -574,6 +575,8 @@ int hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_inode_t *ip,
                                hammer2_ioc_pfs_t *pfs);
 void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
                                hammer2_chain_t *chain);
+void hammer2_chain_delete_duplicate(hammer2_trans_t *trans,
+                               hammer2_chain_t **chainp);
 void hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain);
 void hammer2_chain_commit(hammer2_trans_t *trans, hammer2_chain_t *chain);
 void hammer2_chain_parent_setsubmod(hammer2_trans_t *trans,
index 3b9ee47..a07b8a7 100644 (file)
@@ -819,12 +819,14 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
         * returning a new chain.  This allows the old chain to still be
         * used by the flush code.  Duplication occurs in-place.
         *
+        * The parent does not have to be locked for the delete/duplicate call,
+        * but is in this particular code path.
+        *
         * NOTE: If we are not crossing a synchronization point the
         *       duplication code will simply reuse the existing chain
         *       structure.
         */
-       hammer2_chain_delete(trans, parent, chain);
-       hammer2_chain_duplicate(trans, parent, chain->index, &chain, NULL);
+       hammer2_chain_delete_duplicate(trans, &chain);
 
        /*
         * Set MODIFIED and add a chain ref to prevent destruction.  Both
@@ -837,7 +839,6 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
         * worry about snapshots.  XXX check flush synchronization.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
-               atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_chain_ref(chain);
        } else {
@@ -936,15 +937,11 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
 }
 
 /*
- * Convert a locked chain that was retrieved read-only to read-write,
- * duplicating it if necessary to satisfy active flush points.
- *
- * If not already marked modified a new physical block will be allocated
- * and assigned to the bref.
+ * Set a chain modified, making it read-write and duplicating it if necessary.
+ * This function will assign a new physical block to the chain if necessary
  *
- * If already modified and the new modification crosses a synchronization
- * point the chain is duplicated in order to allow the flush to synchronize
- * the old chain.  The new chain replaces the old.
+ * Duplication of already-modified chains is possible when the modification
+ * crosses a flush synchronization boundary.
  *
  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
  *                  level or the COW operation will not work.
@@ -954,17 +951,25 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
  *
  * This function may return a different chain than was passed, in which case
  * the old chain will be unlocked and the new chain will be locked.
+ *
+ * ip->chain may be adjusted by hammer2_chain_modify_ip().
  */
 hammer2_inode_data_t *
-hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip, int flags)
+hammer2_chain_modify_ip(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                       hammer2_chain_t **parentp, int flags)
 {
-       hammer2_chain_t *ochain;
+       hammer2_chain_t *chain;
 
-       ochain = ip->chain;
-       hammer2_chain_modify(trans, &ip->chain, flags);
-       if (ochain != ip->chain) {
-               hammer2_chain_ref(ip->chain);
-               hammer2_chain_drop(ochain);
+       atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
+       hammer2_chain_modify(trans, parentp, flags);
+       if ((chain = ip->chain) != NULL) {
+               while (chain->duplink && (chain->flags & HAMMER2_CHAIN_DELETED))
+                       chain = chain->duplink;
+               if (ip->chain != chain) {
+                       hammer2_chain_ref(chain);
+                       hammer2_chain_drop(ip->chain);
+                       ip->chain = chain;
+               }
        }
        return(&ip->chain->data->ipdata);
 }
@@ -974,7 +979,7 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
                     int flags)
 {
        hammer2_mount_t *hmp = trans->hmp;
-       hammer2_chain_t *chain = *chainp;
+       hammer2_chain_t *chain;
        hammer2_off_t pbase;
        struct buf *nbp;
        int error;
@@ -987,38 +992,57 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         * propagated brefs.  mirror_tid will be updated regardless during
         * the flush, no need to set it here.
         */
-       if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
-               chain->bref.modify_tid = trans->sync_tid;
+       chain = *chainp;
 
        /*
         * If the chain is already marked MODIFIED we can usually just
-        * return.
-        *
-        * WARNING!  It is possible that a prior lock/modify sequence
-        *           retired the buffer.  During this lock/modify sequence
-        *           MODIFIED may still be set but the buffer could wind up
-        *           clean.  Since the caller is going to modify the buffer
-        *           further we have to be sure that DIRTYBP is set again.
-        *
-        * WARNING!  Currently the caller is responsible for handling
-        *           any delete/duplication roll of the chain to account
-        *           for modifications crossing synchronization points.
+        * return.  However, if a modified chain is modified again in
+        * a synchronization-point-crossing manner we have to
+        * delete/duplicate the chain so as not to interfere with the
+        * atomicity of the flush.
         */
        if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
-               if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
-                   chain->bp == NULL) {
-                       goto skip1;
+               if (chain->modify_tid <= hmp->flush_tid &&
+                   trans->sync_tid > hmp->flush_tid) {
+                       /*
+                        * Modifications cross synchronization point,
+                        * requires delete-duplicate.
+                        */
+                       hammer2_chain_delete_duplicate(trans, chainp);
+                       chain = *chainp;
+                       /* fall through using duplicate */
+               } else {
+                       /*
+                        * It is possible that a prior lock/modify sequence
+                        * retired the buffer.  During this lock/modify
+                        * sequence MODIFIED may still be set but the buffer
+                        * could wind up clean.  Since the caller is going
+                        * to modify the buffer further we have to be sure
+                        * that DIRTYBP is set so our chain code knows to
+                        * bwrite/bdwrite the bp.
+                        */
+                       if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
+                           chain->bp == NULL) {
+                               goto skip1;
+                       }
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+                       if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
+                               chain->bref.modify_tid = trans->sync_tid;
+                       return;
                }
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
-               return;
        }
 
+       if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
+               chain->bref.modify_tid = trans->sync_tid;
+
        /*
         * Set MODIFIED and add a chain ref to prevent destruction.  Both
         * modified flags share the same ref.
         */
-       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
-       hammer2_chain_ref(chain);
+       if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+               hammer2_chain_ref(chain);
+       }
 
        /*
         * Adjust chain->modify_tid so the flusher knows when the
@@ -2144,6 +2168,8 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent, int i,
         */
        if (ochain->flags & HAMMER2_CHAIN_INITIAL)
                atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
+       if (ochain->flags & HAMMER2_CHAIN_DIRTYBP)
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DIRTYBP);
 
        /*
         * If the old chain is modified the new one must be too,
@@ -2317,6 +2343,184 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t *parent, int i,
        }
 }
 
+/*
+ * Special in-place delete-duplicate sequence which does not require a
+ * locked parent.  (*chainp) is marked DELETED and atomically replaced
+ * with a duplicate.  Atomicity is at the very-fine spin-lock level in
+ * order to ensure that lookups do not race us.
+ */
+void
+hammer2_chain_delete_duplicate(hammer2_trans_t *trans,
+                              hammer2_chain_t **chainp)
+{
+       hammer2_mount_t *hmp = trans->hmp;
+       hammer2_chain_t *ochain;
+       hammer2_chain_t *nchain;
+       hammer2_chain_t *parent;
+       size_t bytes;
+
+       /*
+        * First create a duplicate of the chain structure, associating
+        * it with the same core, making it the same size, pointing it
+        * to the same bref (the same media block), and copying any inline
+        * data.
+        */
+       ochain = *chainp;
+       nchain = hammer2_chain_alloc(hmp, &ochain->bref);    /* 1 ref */
+       hammer2_chain_core_alloc(nchain, ochain->core);
+
+       kprintf("delete_duplicate %p.%d(%d)\n", ochain, ochain->bref.type, ochain->refs);
+
+       bytes = (hammer2_off_t)1 <<
+               (int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
+       nchain->bytes = bytes;
+
+       /*
+        * Be sure to copy the INITIAL flag as well or we could end up
+        * loading garbage from the bref.
+        */
+       if (ochain->flags & HAMMER2_CHAIN_INITIAL)
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
+       if (ochain->flags & HAMMER2_CHAIN_DIRTYBP)
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DIRTYBP);
+
+       /*
+        * If the old chain is modified the new one must be too,
+        * but we only want to allocate a new bref.
+        */
+       if (ochain->flags & HAMMER2_CHAIN_MODIFIED) {
+               /*
+                * When duplicating chains the MODIFIED state is inherited.
+                * A new bref typically must be allocated.  However, file
+                * data chains may already have the data offset assigned
+                * to a logical buffer cache buffer so we absolutely cannot
+                * allocate a new bref here for TYPE_DATA.
+                *
+                * Basically the flusher core only dumps media topology
+                * and meta-data, not file data.  The VOP_FSYNC code deals
+                * with the file data.  XXX need back-pointer to inode.
+                */
+               if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
+                       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MODIFIED);
+                       hammer2_chain_ref(nchain);
+               } else {
+                       hammer2_chain_modify(trans, &nchain,
+                                            HAMMER2_MODIFY_OPTDATA |
+                                            HAMMER2_MODIFY_ASSERTNOCOPY);
+               }
+       } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
+               /*
+                * When duplicating chains in the INITIAL state we need
+                * to ensure that the chain is marked modified so a
+                * block is properly assigned to it, otherwise the MOVED
+                * bit won't do the right thing.
+                */
+               KKASSERT (nchain->bref.type != HAMMER2_BREF_TYPE_DATA);
+               hammer2_chain_modify(trans, &nchain,
+                                    HAMMER2_MODIFY_OPTDATA |
+                                    HAMMER2_MODIFY_ASSERTNOCOPY);
+       }
+
+       /*
+        * Unconditionally set the MOVED and SUBMODIFIED bit to force
+        * update of parent bref and indirect blockrefs during flush.
+        */
+       if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
+               hammer2_chain_ref(nchain);
+       }
+       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_SUBMODIFIED);
+
+       /*
+        * Copy media contents as needed.
+        */
+       switch(nchain->bref.type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+               panic("hammer2_chain_duplicate: cannot be called w/volhdr");
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               KKASSERT(bytes == HAMMER2_INODE_BYTES);
+               if (ochain->data) {
+                       nchain->data = kmalloc(sizeof(nchain->data->ipdata),
+                                             hmp->minode, M_WAITOK | M_ZERO);
+                       nchain->data->ipdata = ochain->data->ipdata;
+               }
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               if ((nchain->flags & HAMMER2_CHAIN_MODIFIED) &&
+                   nchain->data) {
+                       bcopy(ochain->data, nchain->data,
+                             nchain->bytes);
+               }
+               break;
+       case HAMMER2_BREF_TYPE_FREEMAP_ROOT:
+       case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+               panic("hammer2_chain_duplicate: cannot be used to"
+                     "create a freemap root or node");
+               break;
+       case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
+       case HAMMER2_BREF_TYPE_DATA:
+       default:
+               if ((nchain->flags & HAMMER2_CHAIN_MODIFIED) &&
+                   nchain->data) {
+                       bcopy(ochain->data, nchain->data,
+                             nchain->bytes);
+               }
+               /* leave chain->data NULL */
+               KKASSERT(nchain->data == NULL);
+               break;
+       }
+
+       /*
+        * Both chains must be locked for us to be able to set the
+        * duplink.  The caller may expect valid data.
+        *
+        * Unmodified duplicated blocks may have the same bref, we
+        * must be careful to avoid buffer cache deadlocks so we
+        * unlock the old chain before resolving the new one.
+        *
+        * Insert nchain at the end of the duplication list.
+        */
+       hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
+       /* extra ref still present from original allocation */
+
+       parent = ochain->parent;
+       nchain->parent = parent;
+       nchain->index = ochain->index;
+       hammer2_chain_ref(parent);      /* nchain->parent ref */
+
+       kprintf("duplicate ochain %p(%d) nchain %p(%d) %08x\n",
+               ochain, ochain->refs, nchain, nchain->refs, nchain->flags);
+
+       spin_lock(&ochain->core->cst.spin);
+       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
+       ochain->delete_tid = trans->sync_tid;
+       atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
+       if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+               hammer2_chain_ref(ochain);
+               atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED);
+       }
+       if (RB_INSERT(hammer2_chain_tree, &parent->core->rbtree, nchain)) {
+               panic("hammer2_chain_link: collision");
+       }
+       KKASSERT(nchain->duplink == NULL);
+       nchain->duplink = ochain->duplink;
+       ochain->duplink = nchain;       /* inherits excess ref from alloc */
+       spin_unlock(&ochain->core->cst.spin);
+
+       /*
+        * Cleanup.  Also note that nchain must be re-resolved to ensure
+        * that its data is resolved because we locked it RESOLVE_NEVER
+        * up above.
+        */
+       *chainp = nchain;               /* inherits locked */
+       hammer2_chain_unlock(ochain);   /* replacing ochain */
+       hammer2_chain_lock(nchain, HAMMER2_RESOLVE_MAYBE);
+       hammer2_chain_unlock(nchain);
+
+       hammer2_chain_parent_setsubmod(trans, nchain);
+}
+
 /*
  * Create a snapshot of the specified {parent, chain} with the specified
  * label.
@@ -2399,6 +2603,9 @@ hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_inode_t *ip,
         * operation.
         */
        trans->flags |= HAMMER2_TRANS_RESTRICTED;
+       kprintf("SNAPSHOTA\n");
+       tsleep(trans, 0, "snapslp", hz*4);
+       kprintf("SNAPSHOTB\n");
        hammer2_chain_flush(trans, nchain);
        trans->flags &= ~HAMMER2_TRANS_RESTRICTED;
 
index 29fced9..1e51efe 100644 (file)
@@ -281,8 +281,8 @@ hammer2_igetv(hammer2_inode_t *ip, int *errorp)
 
 /*
  * The passed-in chain must be locked and the returned inode will also be
- * locked.  A ref is added to both the chain and the inode.  The chain lock
- * is inherited by the inode structure and should not be separately released.
+ * locked.  This routine typically locates or allocates the inode, assigns
+ * ip->chain (adding a ref to chain if necessary), and returns the inode.
  *
  * The hammer2_inode structure regulates the interface between the high level
  * kernel VNOPS API and the filesystem backend (the chains).
@@ -325,6 +325,7 @@ again:
                        nip->chain = chain;             /* fully locked   */
                        hammer2_chain_drop(ochain);     /* old nip->chain */
                }
+
                /*
                 * Consolidated nip/nip->chain is locked (chain locked
                 * by caller).
@@ -385,10 +386,9 @@ again:
  * drops.
  */
 void
-hammer2_inode_put(hammer2_inode_t *ip)
+hammer2_inode_put(hammer2_inode_t *ip, hammer2_chain_t *chain)
 {
        hammer2_inode_t *pip;
-       hammer2_chain_t *chain;
 
        /*
         * Disconnect and unlock chain
@@ -397,18 +397,21 @@ hammer2_inode_put(hammer2_inode_t *ip)
        KKASSERT(ip->topo_cst.count == -1);     /* one excl lock allowed */
        if ((chain = ip->chain) != NULL) {
                ip->chain = NULL;
-               hammer2_inode_unlock_ex(ip);
-               hammer2_chain_unlock(chain);    /* because ip->chain now NULL */
                hammer2_chain_drop(chain);      /* from *_get() */
        }
 
        /*
         * Disconnect pip
         */
-       if ((pip = ip->pip) != NULL) {
-               ip->pip = NULL;
+       pip = ip->pip;
+       ip->pip = NULL;
+       hammer2_inode_unlock_ex(ip, chain);
+
+       /*
+        * Cleanup delayed actions
+        */
+       if (pip)
                hammer2_inode_drop(pip);
-       }
 }
 
 /*
@@ -428,7 +431,7 @@ hammer2_inode_t *
 hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
                     struct vattr *vap, struct ucred *cred,
                     const uint8_t *name, size_t name_len,
-                    int *errorp)
+                    hammer2_chain_t **chainp, int *errorp)
 {
        hammer2_inode_data_t *dipdata;
        hammer2_inode_data_t *nipdata;
@@ -455,13 +458,12 @@ hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
         * NOTE: hidden inodes do not have iterators.
         */
 retry:
-       hammer2_inode_lock_ex(dip);
+       parent = hammer2_inode_lock_ex(dip);
        dipdata = &dip->chain->data->ipdata;
        dip_uid = dipdata->uid;
        dip_gid = dipdata->gid;
        dip_mode = dipdata->mode;
 
-       parent = hammer2_chain_lookup_init(dip->chain, 0);
        error = 0;
        while (error == 0) {
                chain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
@@ -487,14 +489,12 @@ retry:
         */
        if (error == EAGAIN) {
                hammer2_chain_ref(parent);
-               hammer2_chain_lookup_done(parent);
-               hammer2_inode_unlock_ex(dip);
+               hammer2_inode_unlock_ex(dip, parent);
                hammer2_chain_wait(parent);
                hammer2_chain_drop(parent);
                goto retry;
        }
-       hammer2_chain_lookup_done(parent);
-       hammer2_inode_unlock_ex(dip);
+       hammer2_inode_unlock_ex(dip, parent);
 
        if (error) {
                KKASSERT(chain == NULL);
@@ -570,6 +570,7 @@ retry:
        bcopy(name, nipdata->filename, name_len);
        nipdata->name_key = lhc;
        nipdata->name_len = name_len;
+       *chainp = chain;
 
        return (nip);
 }
@@ -998,9 +999,7 @@ hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
         */
        if (hlinkp)
                *hlinkp = 0;
-       hammer2_inode_lock_ex(dip);
-
-       parent = hammer2_chain_lookup_init(dip->chain, 0);
+       parent = hammer2_inode_lock_ex(dip);
        chain = hammer2_chain_lookup(&parent,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     0);
@@ -1014,7 +1013,7 @@ hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
                                           lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                           0);
        }
-       hammer2_inode_unlock_ex(dip);   /* retain parent */
+       hammer2_inode_unlock_ex(dip, NULL);     /* retain parent */
 
        /*
         * Not found or wrong type (isdir < 0 disables the type check).
@@ -1188,13 +1187,10 @@ hammer2_inode_calc_alloc(hammer2_key_t filesize)
  * creation, adding (nlinks) to the file's link count and potentially
  * relocating the inode to a directory common to ip->pip and tdip.
  *
- * Returns a locked chain in (*chainp) (the chain's lock is in addition to
- * any lock it might already have due to the inode being locked).  *chainp
- * is set unconditionally and its previous contents can be garbage.
+ * Replaces (*chainp) if consolidation occurred, unlocking the old chain
+ * and returning a new locked chain.
  *
- * The caller is responsible for replacing ip->chain, not us.  For certain
- * operations such as renames the caller may do additional manipulation
- * of the chain before replacing ip->chain.
+ * NOTE!  This function will also replace ip->chain.
  */
 int
 hammer2_hardlink_consolidate(hammer2_trans_t *trans, hammer2_inode_t *ip,
@@ -1210,22 +1206,14 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans, hammer2_inode_t *ip,
        hammer2_chain_t *parent;
        int error;
 
-       /*
-        * Extra lock on chain so it can be returned locked.
-        */
        hmp = tdip->hmp;
 
-       chain = ip->chain;
-       error = hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
-       KKASSERT(error == 0);
-
+       chain = *chainp;
        if (nlinks == 0 &&                      /* no hardlink needed */
            (chain->data->ipdata.name_key & HAMMER2_DIRHASH_VISIBLE)) {
-               *chainp = chain;
                return (0);
        }
        if (hammer2_hardlink_enable < 0) {      /* fake hardlinks */
-               *chainp = chain;
                return (0);
        }
 
@@ -1256,7 +1244,6 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans, hammer2_inode_t *ip,
                        hammer2_chain_modify(trans, &chain, 0);
                        chain->data->ipdata.nlinks += nlinks;
                }
-               *chainp = chain;
                error = 0;
                goto done;
        }
@@ -1350,19 +1337,20 @@ hammer2_hardlink_consolidate(hammer2_trans_t *trans, hammer2_inode_t *ip,
                 * Return the new chain.
                 */
                hammer2_chain_unlock(chain);
-               *chainp = nchain;
+               chain = nchain;
        } else {
                /*
                 * Return an error
                 */
                hammer2_chain_unlock(chain);
-               *chainp = NULL;
+               chain = NULL;
        }
 
        /*
         * Cleanup, chain/nchain already dealt with.
         */
 done:
+       *chainp = chain;
        hammer2_inode_drop(cdip);
 
        return (error);
@@ -1426,18 +1414,17 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
        chain = NULL;
 
        while ((ip = pip) != NULL) {
-               hammer2_inode_lock_ex(ip);
-               parent = hammer2_chain_lookup_init(ip->chain, 0);
+               parent = hammer2_inode_lock_ex(ip);
                hammer2_inode_drop(ip);                 /* loop */
                KKASSERT(parent->bref.type == HAMMER2_BREF_TYPE_INODE);
                chain = hammer2_chain_lookup(&parent, lhc, lhc, 0);
-               hammer2_chain_lookup_done(parent);
+               hammer2_chain_lookup_done(parent);      /* discard parent */
                if (chain)
                        break;
                pip = ip->pip;          /* safe, ip held locked */
                if (pip)
                        hammer2_inode_ref(pip);         /* loop */
-               hammer2_inode_unlock_ex(ip);
+               hammer2_inode_unlock_ex(ip, NULL);
        }
 
        /*
@@ -1447,7 +1434,7 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
         * (parent is already unlocked).
         */
        if (ip)
-               hammer2_inode_unlock_ex(ip);
+               hammer2_inode_unlock_ex(ip, NULL);
        *chainp = chain;
        if (chain) {
                KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INODE);
index 14e06d9..7900214 100644 (file)
@@ -469,6 +469,7 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
        hammer2_mount_t *hmp;
        hammer2_ioc_pfs_t *pfs;
        hammer2_inode_t *nip;
+       hammer2_chain_t *nchain;
        hammer2_trans_t trans;
        int error;
 
@@ -483,14 +484,14 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
        hammer2_trans_init(hmp, &trans, 0);
        nip = hammer2_inode_create(&trans, hmp->sroot, NULL, NULL,
                                     pfs->name, strlen(pfs->name),
-                                    &error);
+                                    &nchain, &error);
        if (error == 0) {
-               nipdata = hammer2_chain_modify_ip(&trans, nip,
+               nipdata = hammer2_chain_modify_ip(&trans, nip, &nchain,
                                                  HAMMER2_MODIFY_ASSERTNOCOPY);
                nipdata->pfs_type = pfs->pfs_type;
                nipdata->pfs_clid = pfs->pfs_clid;
                nipdata->pfs_fsid = pfs->pfs_fsid;
-               hammer2_inode_unlock_ex(nip);
+               hammer2_inode_unlock_ex(nip, nchain);
        }
        hammer2_trans_done(&trans);
        return (error);
@@ -522,6 +523,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
        hammer2_mount_t *hmp = ip->hmp;
        hammer2_ioc_pfs_t *pfs = data;
        hammer2_trans_t trans;
+       hammer2_chain_t *parent;
        int error;
 
        if (pfs->name[0] == 0)
@@ -530,9 +532,9 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
                return(EINVAL);
 
        hammer2_trans_init(hmp, &trans, 0);
-       hammer2_inode_lock_ex(ip);
+       parent = hammer2_inode_lock_ex(ip);
        error = hammer2_chain_snapshot(&trans, ip, pfs);
-       hammer2_inode_unlock_ex(ip);
+       hammer2_inode_unlock_ex(ip, parent);
        hammer2_trans_done(&trans);
 
        return (error);
@@ -545,11 +547,12 @@ static int
 hammer2_ioctl_inode_get(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_inode_t *ino = data;
+       hammer2_chain_t *parent;
 
-       hammer2_inode_lock_sh(ip);
+       parent = hammer2_inode_lock_sh(ip);
        ino->ip_data = ip->chain->data->ipdata;
        ino->kdata = ip;
-       hammer2_inode_unlock_sh(ip);
+       hammer2_inode_unlock_sh(ip, parent);
 
        return (0);
 }
@@ -558,16 +561,17 @@ static int
 hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data)
 {
        hammer2_ioc_inode_t *ino = data;
+       hammer2_chain_t *parent;
        int error = EINVAL;
 
-       hammer2_inode_lock_ex(ip);
+       parent = hammer2_inode_lock_ex(ip);
        if (ino->flags & HAMMER2IOC_INODE_FLAG_IQUOTA) {
        }
        if (ino->flags & HAMMER2IOC_INODE_FLAG_DQUOTA) {
        }
        if (ino->flags & HAMMER2IOC_INODE_FLAG_COPIES) {
        }
-       hammer2_inode_unlock_ex(ip);
+       hammer2_inode_unlock_ex(ip, parent);
 
        return (error);
 }
index 1fc174a..cffee18 100644 (file)
@@ -56,7 +56,7 @@
  * NOTE: We don't combine the inode/chain lock because putting away an
  *       inode would otherwise confuse multiple lock holders of the inode.
  */
-void
+hammer2_chain_t *
 hammer2_inode_lock_ex(hammer2_inode_t *ip)
 {
        hammer2_chain_t *chain;
@@ -68,14 +68,12 @@ hammer2_inode_lock_ex(hammer2_inode_t *ip)
         * ip->chain fixup.  Certain duplications used to move inodes
         * into indirect blocks (for example) can cause ip->chain to
         * become stale.
-        *
-        *
         */
 again:
        chain = ip->chain;
        while (chain->duplink && (chain->flags & HAMMER2_CHAIN_DELETED))
                chain = chain->duplink;
-       if (chain != ip->chain) {
+       if (ip->chain != chain) {
                hammer2_chain_ref(chain);
                hammer2_chain_drop(ip->chain);
                ip->chain = chain;
@@ -91,25 +89,33 @@ again:
                hammer2_chain_unlock(chain);
                goto again;
        }
+       return (chain);
 }
 
 void
-hammer2_inode_unlock_ex(hammer2_inode_t *ip)
+hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_chain_t *chain)
 {
-       hammer2_chain_t *chain;
-
        /*
         * XXX this will catch parent directories too which we don't
         *     really want.
         */
-       chain = ip->chain;
-       if (chain) {
-               if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
-                                   HAMMER2_CHAIN_SUBMODIFIED)) {
-                       atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
-               }
+       if (chain)
                hammer2_chain_unlock(chain);
+
+       /*
+        * Recalculate ip->chain on exclusive unlock too; it may
+        * allow us to free stale chains more quickly.
+        */
+       if ((chain = ip->chain) != NULL) {
+               while (chain->duplink && (chain->flags & HAMMER2_CHAIN_DELETED))
+                       chain = chain->duplink;
+               if (ip->chain != chain) {
+                       hammer2_chain_ref(chain);
+                       hammer2_chain_drop(ip->chain);
+                       ip->chain = chain;
+               }
        }
+
        ccms_thread_unlock(&ip->topo_cst);
        hammer2_inode_drop(ip);
 }
@@ -123,7 +129,7 @@ hammer2_inode_unlock_ex(hammer2_inode_t *ip)
  *      need to upgrade them.  Only one count of a shared lock can be
  *      upgraded.
  */
-void
+hammer2_chain_t *
 hammer2_inode_lock_sh(hammer2_inode_t *ip)
 {
        hammer2_chain_t *chain;
@@ -143,18 +149,18 @@ again:
        if (chain->duplink && (chain->flags & HAMMER2_CHAIN_DELETED)) {
                hammer2_chain_unlock(chain);
                ccms_thread_unlock(&ip->topo_cst);
-               hammer2_inode_lock_ex(ip);
-               hammer2_inode_unlock_ex(ip);
+               chain = hammer2_inode_lock_ex(ip);
+               hammer2_inode_unlock_ex(ip, chain);
                goto again;
        }
-
+       return (chain);
 }
 
 void
-hammer2_inode_unlock_sh(hammer2_inode_t *ip)
+hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_chain_t *chain)
 {
-       if (ip->chain)
-               hammer2_chain_unlock(ip->chain);
+       if (chain)
+               hammer2_chain_unlock(chain);
        ccms_thread_unlock(&ip->topo_cst);
        hammer2_inode_drop(ip);
 }
index 9886927..95e9f15 100644 (file)
@@ -447,7 +447,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hmp->schain = schain;           /* left locked for inode_get */
                hmp->sroot = hammer2_inode_get(hmp, NULL, NULL, schain);
                hammer2_inode_ref(hmp->sroot);       /* for hmp->sroot */
-               hammer2_inode_unlock_ex(hmp->sroot); /* eats schain lock */
+               hammer2_inode_unlock_ex(hmp->sroot, schain);
        } else {
                schain = hmp->schain;
        }
@@ -482,6 +482,12 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EBUSY;
        }
+       if (rchain->flags & HAMMER2_CHAIN_RECYCLE) {
+               kprintf("hammer2_mount: PFS label currently recycling\n");
+               hammer2_vfs_unmount(mp, MNT_FORCE);
+               return EBUSY;
+       }
+
        atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
 
        /*
@@ -491,7 +497,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
        pmp->rchain = rchain;                   /* left held & unlocked */
        pmp->iroot = hammer2_inode_get(hmp, pmp, NULL, rchain);
        hammer2_inode_ref(pmp->iroot);          /* ref for pmp->iroot */
-       hammer2_inode_unlock_ex(pmp->iroot);    /* iroot & its chain */
+       hammer2_inode_unlock_ex(pmp->iroot, rchain);
 
        kprintf("iroot %p\n", pmp->iroot);
 
@@ -544,6 +550,7 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
 {
        hammer2_pfsmount_t *pmp;
        hammer2_mount_t *hmp;
+       hammer2_chain_t *parent;
        int flags;
        int error = 0;
        int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
@@ -602,8 +609,8 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
         * clean).
         */
        if (pmp->iroot) {
-               hammer2_inode_lock_ex(pmp->iroot);
-               hammer2_inode_put(pmp->iroot);
+               parent = hammer2_inode_lock_ex(pmp->iroot);
+               hammer2_inode_put(pmp->iroot, parent);
                /* lock destroyed by the put */
 #if REPORT_REFS_ERRORS
                if (pmp->iroot->refs != 1)
@@ -672,9 +679,11 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
                 */
                dumpcnt = 200;
                hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt);
+               hammer2_mount_unlock(hmp);
                hammer2_chain_drop(&hmp->vchain);
+       } else {
+               hammer2_mount_unlock(hmp);
        }
-       hammer2_mount_unlock(hmp);
 
        pmp->mp = NULL;
        pmp->hmp = NULL;
@@ -708,6 +717,7 @@ hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
 {
        hammer2_pfsmount_t *pmp;
        hammer2_mount_t *hmp;
+       hammer2_chain_t *parent;
        int error;
        struct vnode *vp;
 
@@ -718,9 +728,9 @@ hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
                *vpp = NULL;
                error = EINVAL;
        } else {
-               hammer2_inode_lock_sh(pmp->iroot);
+               parent = hammer2_inode_lock_sh(pmp->iroot);
                vp = hammer2_igetv(pmp->iroot, &error);
-               hammer2_inode_unlock_sh(pmp->iroot);
+               hammer2_inode_unlock_sh(pmp->iroot, parent);
                *vpp = vp;
                if (vp == NULL)
                        kprintf("vnodefail\n");
@@ -820,6 +830,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                flags |= VMSC_ONEPASS;
 
        hammer2_trans_init(hmp, &info.trans, HAMMER2_TRANS_ISFLUSH);
+
        info.error = 0;
        info.waitfor = MNT_NOWAIT;
        vmntvnodescan(mp, flags | VMSC_NOWAIT,
@@ -839,6 +850,10 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                /* XXX */
        }
 #endif
+       /*
+        * Rollup flush.  The fsyncs above basically just flushed
+        * data blocks.  The flush below gets all the meta-data.
+        */
        hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
        if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                 HAMMER2_CHAIN_SUBMODIFIED)) {
@@ -851,15 +866,9 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        /*
         * We can't safely flush the volume header until we have
         * flushed any device buffers which have built up.
+        *
+        * XXX this isn't being incremental
         */
-#if 0
-       if ((waitfor & MNT_LAZY) == 0) {
-               waitfor = MNT_NOWAIT;
-               vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
-               error = VOP_FSYNC(hmp->devvp, waitfor, 0);
-               vn_unlock(hmp->devvp);
-       }
-#endif
        vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
        error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
        vn_unlock(hmp->devvp);
@@ -926,8 +935,7 @@ hammer2_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
 
        ip = VTOI(vp);
        if (vp->v_type == VNON || ip == NULL ||
-           ((ip->flags & (HAMMER2_INODE_MODIFIED |
-                          HAMMER2_INODE_DIRTYEMBED)) == 0 &&
+           ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
             RB_EMPTY(&vp->v_rbdirty_tree))) {
                return(-1);
        }
@@ -939,12 +947,12 @@ hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
 {
        struct hammer2_sync_info *info = data;
        hammer2_inode_t *ip;
+       hammer2_chain_t *parent;
        int error;
 
        ip = VTOI(vp);
        if (vp->v_type == VNON || vp->v_type == VBAD ||
-           ((ip->flags & (HAMMER2_INODE_MODIFIED |
-                          HAMMER2_INODE_DIRTYEMBED)) == 0 &&
+           ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
             RB_EMPTY(&vp->v_rbdirty_tree))) {
                return(0);
        }
@@ -953,17 +961,12 @@ hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
         * VOP_FSYNC will start a new transaction so replicate some code
         * here to do it inline (see hammer2_vop_fsync()).
         */
-       hammer2_inode_lock_ex(ip);
+       parent = hammer2_inode_lock_ex(ip);
+       atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        if (ip->vp)
                vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
-       if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
-               atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
-               hammer2_chain_modify_ip(&info->trans, ip, 0);
-               /* ip->chain may have changed */
-       }
-       hammer2_chain_flush(&info->trans, ip->chain);
-       hammer2_inode_unlock_ex(ip);
+       hammer2_chain_flush(&info->trans, parent);
+       hammer2_inode_unlock_ex(ip, parent);
        error = 0;
 #if 0
        error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
@@ -1104,6 +1107,7 @@ void
 hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
 {
        hammer2_inode_data_t *ipdata;
+       hammer2_chain_t *parent;
        size_t name_len;
 
        /*
@@ -1116,14 +1120,13 @@ hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
        /*
         * Setup LNK_CONN fields for autoinitiated state machine
         */
-       hammer2_inode_lock_ex(pmp->iroot);
-       ipdata = &pmp->iroot->chain->data->ipdata;
+       parent = hammer2_inode_lock_ex(pmp->iroot);
+       ipdata = &parent->data->ipdata;
        pmp->iocom.auto_lnk_conn.pfs_clid = ipdata->pfs_clid;
        pmp->iocom.auto_lnk_conn.pfs_fsid = ipdata->pfs_fsid;
        pmp->iocom.auto_lnk_conn.pfs_type = ipdata->pfs_type;
        pmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
        pmp->iocom.auto_lnk_conn.peer_type = pmp->hmp->voldata.peer_type;
-       hammer2_inode_unlock_ex(pmp->iroot);
 
        /*
         * Filter adjustment.  Clients do not need visibility into other
@@ -1164,6 +1167,7 @@ hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
              pmp->iocom.auto_lnk_span.fs_label,
              name_len);
        pmp->iocom.auto_lnk_span.fs_label[name_len] = 0;
+       hammer2_inode_unlock_ex(pmp->iroot, parent);
 
        kdmsg_iocom_autoinitiate(&pmp->iocom, hammer2_autodmsg);
 }
index 3539809..17b0ff0 100644 (file)
 
 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
                                int seqcount);
-static int hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
+static int hammer2_write_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                               hammer2_chain_t **parentp,
                                struct uio *uio, int ioflag, int seqcount);
 static hammer2_off_t hammer2_assign_physical(hammer2_trans_t *trans,
-                               hammer2_inode_t *ip,
+                               hammer2_inode_t *ip, hammer2_chain_t **parentp,
                                hammer2_key_t lbase, int lblksize,
                                int *errorp);
-static void hammer2_extend_file(hammer2_trans_t *trans,
-                               hammer2_inode_t *ip, hammer2_key_t nsize);
-static void hammer2_truncate_file(hammer2_trans_t *trans,
-                               hammer2_inode_t *ip, hammer2_key_t nsize);
+static void hammer2_extend_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                               hammer2_chain_t **parentp, hammer2_key_t nsize);
+static void hammer2_truncate_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                               hammer2_chain_t **parentp, hammer2_key_t nsize);
 
 static __inline
 void
@@ -85,11 +86,8 @@ int
 hammer2_vop_inactive(struct vop_inactive_args *ap)
 {
        hammer2_inode_t *ip;
+       hammer2_chain_t *parent;
        struct vnode *vp;
-#if 0
-       hammer2_trans_t trans;
-       struct hammer2_mount *hmp;
-#endif
 
        vp = ap->a_vp;
        ip = VTOI(vp);
@@ -107,27 +105,17 @@ hammer2_vop_inactive(struct vop_inactive_args *ap)
         * the strategy code.  Simply mark the inode modified so it gets
         * picked up by our normal flush.
         */
-       hammer2_inode_lock_ex(ip);
-       KKASSERT(ip->chain);
-#if 0
-       /* XXX lock order reversal on inode/trans */
-       if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
-               atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
-               hammer2_trans_init(ip->hmp, &trans, 0);
-               hammer2_chain_modify_ip(&trans, ip, 0);
-               hammer2_trans_done(&trans);
-       }
-#endif
+       parent = hammer2_inode_lock_ex(ip);
+       KKASSERT(parent);
 
        /*
         * Check for deleted inodes and recycle immediately.
         */
-       if (ip->chain->flags & HAMMER2_CHAIN_DELETED) {
-               hammer2_inode_unlock_ex(ip);
+       if (parent->flags & HAMMER2_CHAIN_DELETED) {
+               hammer2_inode_unlock_ex(ip, parent);
                vrecycle(vp);
        } else {
-               hammer2_inode_unlock_ex(ip);
+               hammer2_inode_unlock_ex(ip, parent);
        }
        return (0);
 }
@@ -163,18 +151,9 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
         * file hasn't been.  XXX ip->chain should never be stale on
         * reclaim.
         */
-       hammer2_inode_lock_ex(ip);
-       chain = ip->chain;
+       chain = hammer2_inode_lock_ex(ip);
        if (chain->duplink)
                kprintf("RECLAIM DUPLINKED IP: %p %p\n", ip, ip->chain);
-#if 0
-       while (chain->duplink)
-               chain = chain->duplink;
-       if (ip->chain != chain) {
-               hammer2_inode_repoint(ip, ip->pip, chain);
-               chain = ip->chain;
-       }
-#endif
 
        /*
         * The final close of a deleted file or directory marks it for
@@ -209,9 +188,9 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
        }
 #endif
        if (ip->refs > 2)                           /* (our lock + vp ref) */
-               hammer2_inode_unlock_ex(ip);        /* unlock */
+               hammer2_inode_unlock_ex(ip, chain); /* unlock */
        else
-               hammer2_inode_put(ip);              /* unlock & disconnect */
+               hammer2_inode_put(ip, chain);       /* unlock & disconnect */
        /* chain no longer referenced */
        /* chain = NULL; not needed */
        hammer2_inode_drop(ip);                     /* vp ref */
@@ -232,6 +211,7 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_trans_t trans;
+       hammer2_chain_t *chain;
        struct vnode *vp;
 
        vp = ap->a_vp;
@@ -239,21 +219,10 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
        hmp = ip->hmp;
 
        hammer2_trans_init(hmp, &trans, HAMMER2_TRANS_ISFLUSH);
-       hammer2_inode_lock_ex(ip);
+       chain = hammer2_inode_lock_ex(ip);
 
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
 
-       /*
-        * Detect updates to the embedded data which may be synchronized by
-        * the strategy code.  Simply mark the inode modified so it gets
-        * picked up by our normal flush.
-        */
-       if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
-               atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
-               hammer2_chain_modify_ip(&trans, ip, 0);
-       }
-
        /*
         * Calling chain_flush here creates a lot of duplicative
         * COW operations due to non-optimal vnode ordering.
@@ -262,11 +231,11 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
         * which call this function will eventually call chain_flush
         * on the volume root as a catch-all, which is far more optimal.
         */
+       atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
-               atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
-               hammer2_chain_flush(&trans, ip->chain);
+               hammer2_chain_flush(&trans, chain);
        }
-       hammer2_inode_unlock_ex(ip);
+       hammer2_inode_unlock_ex(ip, chain);
        hammer2_trans_done(&trans);
 
        return (0);
@@ -278,16 +247,17 @@ hammer2_vop_access(struct vop_access_args *ap)
 {
        hammer2_inode_t *ip = VTOI(ap->a_vp);
        hammer2_inode_data_t *ipdata;
+       hammer2_chain_t *chain;
        uid_t uid;
        gid_t gid;
        int error;
 
-       hammer2_inode_lock_sh(ip);
-       ipdata = &ip->chain->data->ipdata;
+       chain = hammer2_inode_lock_sh(ip);
+       ipdata = &chain->data->ipdata;
        uid = hammer2_to_unix_xid(&ipdata->uid);
        gid = hammer2_to_unix_xid(&ipdata->gid);
        error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
-       hammer2_inode_unlock_sh(ip);
+       hammer2_inode_unlock_sh(ip, chain);
 
        return (error);
 }
@@ -297,6 +267,7 @@ int
 hammer2_vop_getattr(struct vop_getattr_args *ap)
 {
        hammer2_inode_data_t *ipdata;
+       hammer2_chain_t *chain;
        hammer2_pfsmount_t *pmp;
        hammer2_inode_t *ip;
        struct vnode *vp;
@@ -308,8 +279,8 @@ hammer2_vop_getattr(struct vop_getattr_args *ap)
        ip = VTOI(vp);
        pmp = ip->pmp;
 
-       hammer2_inode_lock_sh(ip);
-       ipdata = &ip->chain->data->ipdata;
+       chain = hammer2_inode_lock_sh(ip);
+       ipdata = &chain->data->ipdata;
 
        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ipdata->inum;
@@ -327,14 +298,14 @@ hammer2_vop_getattr(struct vop_getattr_args *ap)
        hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = vap->va_size;   /* XXX */
-       vap->va_type = hammer2_get_vtype(ip->chain);
+       vap->va_type = hammer2_get_vtype(chain);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ipdata->uid;
        vap->va_gid_uuid = ipdata->gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;
 
-       hammer2_inode_unlock_sh(ip);
+       hammer2_inode_unlock_sh(ip, chain);
 
        return (0);
 }
@@ -346,6 +317,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
        hammer2_inode_data_t *ipdata;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
+       hammer2_chain_t *chain;
        hammer2_trans_t trans;
        struct vnode *vp;
        struct vattr *vap;
@@ -365,8 +337,8 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                return(EROFS);
 
        hammer2_trans_init(hmp, &trans, 0);
-       hammer2_inode_lock_ex(ip);
-       ipdata = &ip->chain->data->ipdata;
+       chain = hammer2_inode_lock_ex(ip);
+       ipdata = &chain->data->ipdata;
        error = 0;
 
        if (vap->va_flags != VNOVAL) {
@@ -378,7 +350,8 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                                         ap->a_cred);
                if (error == 0) {
                        if (ipdata->uflags != flags) {
-                               ipdata = hammer2_chain_modify_ip(&trans, ip, 0);
+                               ipdata = hammer2_chain_modify_ip(&trans, ip,
+                                                                &chain, 0);
                                ipdata->uflags = flags;
                                ipdata->ctime = ctime;
                                kflags |= NOTE_ATTRIB;
@@ -411,7 +384,8 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                            bcmp(&uuid_gid, &ipdata->gid, sizeof(uuid_gid)) ||
                            ipdata->mode != cur_mode
                        ) {
-                               ipdata = hammer2_chain_modify_ip(&trans, ip, 0);
+                               ipdata = hammer2_chain_modify_ip(&trans, ip,
+                                                                &chain, 0);
                                ipdata->uid = uuid_uid;
                                ipdata->gid = uuid_gid;
                                ipdata->mode = cur_mode;
@@ -430,11 +404,13 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                        if (vap->va_size == ipdata->size)
                                break;
                        if (vap->va_size < ipdata->size) {
-                               hammer2_truncate_file(&trans, ip, vap->va_size);
+                               hammer2_truncate_file(&trans, ip,
+                                                     &chain, vap->va_size);
                        } else {
-                               hammer2_extend_file(&trans, ip, vap->va_size);
+                               hammer2_extend_file(&trans, ip,
+                                                   &chain, vap->va_size);
                        }
-                       ipdata = &ip->chain->data->ipdata; /* RELOAD */
+                       ipdata = &chain->data->ipdata; /* RELOAD */
                        domtime = 1;
                        break;
                default:
@@ -445,13 +421,13 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
 #if 0
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
-               ipdata = hammer2_chain_modify_ip(&trans, ip, 0);
+               ipdata = hammer2_chain_modify_ip(&trans, ip, &chain, 0);
                ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
        }
 #endif
        if (vap->va_mtime.tv_sec != VNOVAL) {
-               ipdata = hammer2_chain_modify_ip(&trans, ip, 0);
+               ipdata = hammer2_chain_modify_ip(&trans, ip, &chain, 0);
                ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
        }
@@ -463,14 +439,14 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ipdata->mode != cur_mode) {
-                       ipdata = hammer2_chain_modify_ip(&trans, ip, 0);
+                       ipdata = hammer2_chain_modify_ip(&trans, ip, &chain, 0);
                        ipdata->mode = cur_mode;
                        ipdata->ctime = ctime;
                        kflags |= NOTE_ATTRIB;
                }
        }
 done:
-       hammer2_inode_unlock_ex(ip);
+       hammer2_inode_unlock_ex(ip, chain);
        hammer2_trans_done(&trans);
        return (error);
 }
@@ -485,6 +461,7 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        hammer2_inode_t *xip;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
+       hammer2_chain_t *xchain;
        hammer2_tid_t inum;
        hammer2_key_t lkey;
        struct uio *uio;
@@ -515,8 +492,8 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        }
        cookie_index = 0;
 
-       hammer2_inode_lock_sh(ip);
-       ipdata = &ip->chain->data->ipdata;
+       parent = hammer2_inode_lock_sh(ip);
+       ipdata = &parent->data->ipdata;
 
        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
@@ -553,17 +530,17 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                while (ip->pip != NULL && ip != ip->pmp->iroot) {
                        xip = ip->pip;
                        hammer2_inode_ref(xip);
-                       hammer2_inode_unlock_sh(ip);
-                       hammer2_inode_lock_sh(xip);
-                       hammer2_inode_lock_sh(ip);
+                       hammer2_inode_unlock_sh(ip, parent);
+                       xchain = hammer2_inode_lock_sh(xip);
+                       parent = hammer2_inode_lock_sh(ip);
                        hammer2_inode_drop(xip);
                        if (xip == ip->pip) {
                                inum = xip->chain->data->ipdata.inum &
                                       HAMMER2_DIRHASH_USERMSK;
-                               hammer2_inode_unlock_sh(xip);
+                               hammer2_inode_unlock_sh(xip, xchain);
                                break;
                        }
-                       hammer2_inode_unlock_sh(xip);
+                       hammer2_inode_unlock_sh(xip, xchain);
                }
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
@@ -585,7 +562,6 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        if (error) {
                goto done;
        }
-       parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
        chain = hammer2_chain_lookup(&parent, lkey, lkey,
                                     HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
@@ -634,9 +610,8 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        }
        if (chain)
                hammer2_chain_unlock(chain);
-       hammer2_chain_lookup_done(parent);
 done:
-       hammer2_inode_unlock_sh(ip);
+       hammer2_inode_unlock_sh(ip, parent);
        if (ap->a_eofflag)
                *ap->a_eofflag = (chain == NULL);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
@@ -718,6 +693,7 @@ hammer2_vop_write(struct vop_write_args *ap)
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_trans_t trans;
+       hammer2_chain_t *parent;
        thread_t td;
        struct vnode *vp;
        struct uio *uio;
@@ -765,9 +741,10 @@ hammer2_vop_write(struct vop_write_args *ap)
         * might wind up being copied into the embedded data area.
         */
        hammer2_trans_init(ip->hmp, &trans, 0);
-       hammer2_inode_lock_ex(ip);
-       error = hammer2_write_file(ip, &trans, uio, ap->a_ioflag, seqcount);
-       hammer2_inode_unlock_ex(ip);
+       parent = hammer2_inode_lock_ex(ip);
+       error = hammer2_write_file(&trans, ip, &parent,
+                                  uio, ap->a_ioflag, seqcount);
+       hammer2_inode_unlock_ex(ip, parent);
        hammer2_trans_done(&trans);
 
        return (error);
@@ -784,6 +761,7 @@ int
 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
 {
        hammer2_off_t size;
+       hammer2_chain_t *parent;
        struct buf *bp;
        int error;
 
@@ -792,7 +770,7 @@ hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
        /*
         * UIO read loop.
         */
-       hammer2_inode_lock_sh(ip);
+       parent = hammer2_inode_lock_sh(ip);
        size = ip->chain->data->ipdata.size;
 
        while (uio->uio_resid > 0 && uio->uio_offset < size) {
@@ -821,7 +799,7 @@ hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
                uiomove((char *)bp->b_data + loff, n, uio);
                bqrelse(bp);
        }
-       hammer2_inode_unlock_sh(ip);
+       hammer2_inode_unlock_sh(ip, parent);
        return (error);
 }
 
@@ -831,7 +809,8 @@ hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
  */
 static
 int
-hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
+hammer2_write_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                  hammer2_chain_t **parentp,
                   struct uio *uio, int ioflag, int seqcount)
 {
        hammer2_inode_data_t *ipdata;
@@ -850,6 +829,14 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
        kflags = 0;
        error = 0;
 
+       /*
+        * vfs_sync visibility.  Interlocked by the inode ex lock so we
+        * shouldn't have to reassert it multiple times if the ip->chain
+        * is modified/flushed multiple times during the write, except
+        * when we release/reacquire the inode ex lock.
+        */
+       atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
+
        /*
         * Extend the file if necessary.  If the write fails at some point
         * we will truncate it back down to cover as much as we were able
@@ -862,7 +849,7 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
        old_eof = ipdata->size;
        if (uio->uio_offset + uio->uio_resid > ipdata->size) {
                modified = 1;
-               hammer2_extend_file(trans, ip,
+               hammer2_extend_file(trans, ip, parentp,
                                    uio->uio_offset + uio->uio_resid);
                ipdata = &ip->chain->data->ipdata;      /* RELOAD */
                kflags |= NOTE_EXTEND;
@@ -889,9 +876,10 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
                         * XXX should try to leave this unlocked through
                         *      the whole loop
                         */
-                       hammer2_inode_unlock_ex(ip);
+                       hammer2_inode_unlock_ex(ip, *parentp);
                        bwillwrite(HAMMER2_PBUFSIZE);
-                       hammer2_inode_lock_ex(ip);
+                       *parentp = hammer2_inode_lock_ex(ip);
+                       atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
                        ipdata = &ip->chain->data->ipdata;      /* reload */
                }
 
@@ -915,7 +903,8 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
                n = lblksize - loff;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
-                       if (uio->uio_offset + n == ipdata->size)
+                       if (loff == lbase &&
+                           uio->uio_offset + n == ipdata->size)
                                trivial = 1;
                } else if (loff == 0) {
                        trivial = 1;
@@ -973,7 +962,7 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
                 * strategy code will take care of it in that case.
                 */
                bp->b_bio2.bio_offset =
-                       hammer2_assign_physical(trans, ip,
+                       hammer2_assign_physical(trans, ip, parentp,
                                                lbase, lblksize, &error);
                ipdata = &ip->chain->data->ipdata;      /* RELOAD */
                if (error) {
@@ -984,9 +973,10 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
                /*
                 * Ok, copy the data in
                 */
-               hammer2_inode_unlock_ex(ip);
+               hammer2_inode_unlock_ex(ip, *parentp);
                error = uiomove(bp->b_data + loff, n, uio);
-               hammer2_inode_lock_ex(ip);
+               *parentp = hammer2_inode_lock_ex(ip);
+               atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
                ipdata = &ip->chain->data->ipdata;      /* reload */
                kflags |= NOTE_WRITE;
                modified = 1;
@@ -1006,12 +996,20 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
                 *       eof-straddling blocksize and is incorrect.
                 */
                bp->b_flags |= B_AGE;
-               if ((ioflag & IO_SYNC) ||
-                   (lbase == 0 && (ipdata->op_flags &
-                                   HAMMER2_OPFLAG_DIRECTDATA))) {
+               if (lbase == 0 && (ipdata->op_flags &
+                                  HAMMER2_OPFLAG_DIRECTDATA)) {
+                       /*
+                        * Writing to the inode's embedded data must be
+                        * synchronous because the strategy code isn't
+                        * allowed to acquire chain locks.
+                        *
+                        * Deal with chain interactions here.
+                        */
+                       ipdata = hammer2_chain_modify_ip(trans, ip, parentp, 0);
+                       bwrite(bp);
+               } else if (ioflag & IO_SYNC) {
                        /*
-                        * Synchronous I/O requested or writing to the
-                        * inode's embedded data (which must be synchronous).
+                        * Synchronous I/O requested.
                         */
                        bwrite(bp);
                } else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
@@ -1036,10 +1034,10 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
         * the entire write is a failure and we have to back-up.
         */
        if (error && ipdata->size != old_eof) {
-               hammer2_truncate_file(trans, ip, old_eof);
+               hammer2_truncate_file(trans, ip, parentp, old_eof);
                ipdata = &ip->chain->data->ipdata;      /* RELOAD */
        } else if (modified) {
-               ipdata = hammer2_chain_modify_ip(trans, ip, 0);
+               ipdata = hammer2_chain_modify_ip(trans, ip, parentp, 0);
                hammer2_update_time(&ipdata->mtime);
        }
        hammer2_knote(ip->vp, kflags);
@@ -1062,6 +1060,7 @@ hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
 static
 hammer2_off_t
 hammer2_assign_physical(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                       hammer2_chain_t **parentp,
                        hammer2_key_t lbase, int lblksize, int *errorp)
 {
        hammer2_mount_t *hmp;
@@ -1078,8 +1077,8 @@ hammer2_assign_physical(hammer2_trans_t *trans, hammer2_inode_t *ip,
        hmp = ip->hmp;
        *errorp = 0;
 retry:
-       hammer2_inode_lock_ex(ip);
-       parent = hammer2_chain_lookup_init(ip->chain, 0);
+       parent = *parentp;
+       hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); /* extra lock */
        chain = hammer2_chain_lookup(&parent,
                                     lbase, lbase,
                                     HAMMER2_LOOKUP_NODATA);
@@ -1096,7 +1095,6 @@ retry:
                                               HAMMER2_BREF_TYPE_DATA,
                                               lblksize);
                if (chain == NULL) {
-                       hammer2_inode_unlock_ex(ip);
                        hammer2_chain_lookup_done(parent);
                        panic("hammer2_chain_create: par=%p error=%d\n",
                                parent, *errorp);
@@ -1133,11 +1131,22 @@ retry:
                        break;
                }
        }
-       if (chain)
-               hammer2_chain_unlock(chain);
-       hammer2_chain_lookup_done(parent);
 
-       hammer2_inode_unlock_ex(ip);
+       /*
+        * Cleanup.  If chain wound up being the inode (i.e. DIRECTDATA),
+        * we might have to replace *parentp.
+        */
+       hammer2_chain_lookup_done(parent);
+       if (chain) {
+               if (*parentp != chain &&
+                   (*parentp)->core == chain->core) {
+                       parent = *parentp;
+                       *parentp = chain;
+                       hammer2_chain_unlock(parent);
+               } else {
+                       hammer2_chain_unlock(chain);
+               }
+       }
 
        return (pbase);
 }
@@ -1153,8 +1162,8 @@ retry:
  */
 static
 void
-hammer2_truncate_file(hammer2_trans_t *trans,
-                     hammer2_inode_t *ip, hammer2_key_t nsize)
+hammer2_truncate_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                     hammer2_chain_t **parentp, hammer2_key_t nsize)
 {
        hammer2_inode_data_t *ipdata;
        hammer2_chain_t *parent;
@@ -1169,7 +1178,7 @@ hammer2_truncate_file(hammer2_trans_t *trans,
 
        bp = NULL;
        error = 0;
-       ipdata = hammer2_chain_modify_ip(trans, ip, 0);
+       ipdata = hammer2_chain_modify_ip(trans, ip, parentp, 0);
 
        /*
         * Destroy any logical buffer cache buffers beyond the file EOF.
@@ -1233,8 +1242,6 @@ hammer2_truncate_file(hammer2_trans_t *trans,
                                break;
                        }
                        hammer2_chain_unlock(chain);
-                       if (bp->b_bcount == HAMMER2_PBUFSIZE)
-                               bp->b_flags |= B_CLUSTEROK;
                        if (lbase == 0 &&
                            (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
                                /*
@@ -1246,6 +1253,8 @@ hammer2_truncate_file(hammer2_trans_t *trans,
                                /*
                                 * Else a delayed-write is fine.
                                 */
+                               if (bp->b_bcount == HAMMER2_PBUFSIZE)
+                                       bp->b_flags |= B_CLUSTEROK;
                                bdwrite(bp);
                        }
                } else {
@@ -1341,8 +1350,8 @@ hammer2_truncate_file(hammer2_trans_t *trans,
  */
 static
 void
-hammer2_extend_file(hammer2_trans_t *trans,
-                   hammer2_inode_t *ip, hammer2_key_t nsize)
+hammer2_extend_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
+                   hammer2_chain_t **parentp, hammer2_key_t nsize)
 {
        hammer2_inode_data_t *ipdata;
        hammer2_mount_t *hmp;
@@ -1361,7 +1370,7 @@ hammer2_extend_file(hammer2_trans_t *trans,
        KKASSERT(ip->vp);
        hmp = ip->hmp;
 
-       ipdata = hammer2_chain_modify_ip(trans, ip, 0);
+       ipdata = hammer2_chain_modify_ip(trans, ip, parentp, 0);
 
        /*
         * Nothing to do if the direct-data case is still intact
@@ -1502,8 +1511,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
        /*
         * Note: In DragonFly the kernel handles '.' and '..'.
         */
-       hammer2_inode_lock_sh(dip);
-       parent = hammer2_chain_lookup_init(dip->chain, HAMMER2_LOOKUP_SHARED);
+       parent = hammer2_inode_lock_sh(dip);
        chain = hammer2_chain_lookup(&parent,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     HAMMER2_LOOKUP_SHARED);
@@ -1517,8 +1525,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                                           lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                           HAMMER2_LOOKUP_SHARED);
        }
-       hammer2_chain_lookup_done(parent);
-       hammer2_inode_unlock_sh(dip);
+       hammer2_inode_unlock_sh(dip, parent);
 
        /*
         * If the inode represents a forwarding entry for a hardlink we have
@@ -1583,7 +1590,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                } else if (error == ENOENT) {
                        cache_setvp(ap->a_nch, NULL);
                }
-               hammer2_inode_unlock_ex(ip);
+               hammer2_inode_unlock_ex(ip, chain);
 
                /*
                 * The vp should not be released until after we've disposed
@@ -1612,6 +1619,7 @@ hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
        hammer2_inode_t *dip;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
+       hammer2_chain_t *parent;
        int error;
 
        dip = VTOI(ap->a_dvp);
@@ -1621,9 +1629,9 @@ hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
                *ap->a_vpp = NULL;
                return ENOENT;
        }
-       hammer2_inode_lock_ex(ip);
+       parent = hammer2_inode_lock_ex(ip);
        *ap->a_vpp = hammer2_igetv(ip, &error);
-       hammer2_inode_unlock_ex(ip);
+       hammer2_inode_unlock_ex(ip, parent);
 
        return error;
 }
@@ -1636,6 +1644,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
        hammer2_inode_t *dip;
        hammer2_inode_t *nip;
        hammer2_trans_t trans;
+       hammer2_chain_t *chain;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
@@ -1652,13 +1661,13 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
 
        hammer2_trans_init(hmp, &trans, 0);
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
-                                  name, name_len, &error);
+                                  name, name_len, &chain, &error);
        if (error) {
                KKASSERT(nip == NULL);
                *ap->a_vpp = NULL;
        } else {
                *ap->a_vpp = hammer2_igetv(nip, &error);
-               hammer2_inode_unlock_ex(nip);
+               hammer2_inode_unlock_ex(nip, chain);
        }
        hammer2_trans_done(&trans);
 
@@ -1718,16 +1727,14 @@ hammer2_vop_bmap(struct vop_bmap_args *ap)
                lend = lbeg;
        loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
 
-       hammer2_inode_lock_sh(ip);
-       parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
+       parent = hammer2_inode_lock_sh(ip);
        chain = hammer2_chain_lookup(&parent,
                                     lbeg, lend,
                                     HAMMER2_LOOKUP_NODATA |
                                     HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
                *ap->a_doffsetp = ZFOFFSET;
-               hammer2_chain_lookup_done(parent);
-               hammer2_inode_unlock_sh(ip);
+               hammer2_inode_unlock_sh(ip, parent);
                return (0);
        }
 
@@ -1743,8 +1750,7 @@ hammer2_vop_bmap(struct vop_bmap_args *ap)
                                           HAMMER2_LOOKUP_NODATA |
                                           HAMMER2_LOOKUP_SHARED);
        }
-       hammer2_chain_lookup_done(parent);
-       hammer2_inode_unlock_sh(ip);
+       hammer2_inode_unlock_sh(ip, parent);
 
        /*
         * If the requested loffset is not mappable physically we can't
@@ -1789,11 +1795,12 @@ int
 hammer2_vop_advlock(struct vop_advlock_args *ap)
 {
        hammer2_inode_t *ip = VTOI(ap->a_vp);
+       hammer2_chain_t *parent;
        hammer2_off_t size;
 
-       hammer2_inode_lock_sh(ip);
-       size = ip->chain->data->ipdata.size;
-       hammer2_inode_unlock_sh(ip);
+       parent = hammer2_inode_lock_sh(ip);
+       size = parent->data->ipdata.size;
+       hammer2_inode_unlock_sh(ip, parent);
        return (lf_advlock(ap, &ip->advlock, size));
 }
 
@@ -1846,7 +1853,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         * returned chain is locked.
         */
        ip = VTOI(ap->a_vp);
-       hammer2_inode_lock_ex(ip);
+       chain = hammer2_inode_lock_ex(ip);
        error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1);
        if (error)
                goto done;
@@ -1868,9 +1875,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
                cache_setvp(ap->a_nch, ap->a_vp);
        }
 done:
-       if (chain)
-               hammer2_chain_unlock(chain);
-       hammer2_inode_unlock_ex(ip);
+       hammer2_inode_unlock_ex(ip, chain);
        hammer2_trans_done(&trans);
 
        return error;
@@ -1890,6 +1895,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
        hammer2_inode_t *dip;
        hammer2_inode_t *nip;
        hammer2_trans_t trans;
+       hammer2_chain_t *nchain;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
@@ -1906,13 +1912,13 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
        hammer2_trans_init(hmp, &trans, 0);
 
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
-                                  name, name_len, &error);
+                                  name, name_len, &nchain, &error);
        if (error) {
                KKASSERT(nip == NULL);
                *ap->a_vpp = NULL;
        } else {
                *ap->a_vpp = hammer2_igetv(nip, &error);
-               hammer2_inode_unlock_ex(nip);
+               hammer2_inode_unlock_ex(nip, nchain);
        }
        hammer2_trans_done(&trans);
 
@@ -1933,6 +1939,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
        hammer2_mount_t *hmp;
        hammer2_inode_t *dip;
        hammer2_inode_t *nip;
+       hammer2_chain_t *nparent;
        hammer2_trans_t trans;
        struct namecache *ncp;
        const uint8_t *name;
@@ -1952,7 +1959,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
        ap->a_vap->va_type = VLNK;      /* enforce type */
 
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
-                                  name, name_len, &error);
+                                  name, name_len, &nparent, &error);
        if (error) {
                KKASSERT(nip == NULL);
                *ap->a_vpp = NULL;
@@ -1989,14 +1996,14 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
                        auio.uio_td = curthread;
                        aiov.iov_base = ap->a_target;
                        aiov.iov_len = bytes;
-                       error = hammer2_write_file(nip, &trans,
+                       error = hammer2_write_file(&trans, nip, &nparent,
                                                   &auio, IO_APPEND, 0);
                        nipdata = &nip->chain->data->ipdata; /* RELOAD */
                        /* XXX handle error */
                        error = 0;
                }
        }
-       hammer2_inode_unlock_ex(nip);
+       hammer2_inode_unlock_ex(nip, nparent);
        hammer2_trans_done(&trans);
 
        /*
@@ -2159,7 +2166,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
         *           we do use one later remember that it must be reloaded
         *           on any modification to the inode, including connects.
         */
-       hammer2_inode_lock_ex(ip);
+       chain = hammer2_inode_lock_ex(ip);
        error = hammer2_hardlink_consolidate(&trans, ip, &chain, tdip, 0);
        if (error)
                goto done;
@@ -2196,9 +2203,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
                cache_rename(ap->a_fnch, ap->a_tnch);
        }
 done:
-       if (chain)
-               hammer2_chain_unlock(chain);
-       hammer2_inode_unlock_ex(ip);
+       hammer2_inode_unlock_ex(ip, chain);
        hammer2_inode_drop(ip);
        hammer2_trans_done(&trans);
 
@@ -2268,6 +2273,11 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
        chain = NULL;
        KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
 
+#if 0
+       kprintf("read lbase %jd cached %016jx\n",
+               lbase, nbio->bio_offset);
+#endif
+
        /*
         * We must characterize the logical->physical translation if it
         * has not already been cached.
@@ -2276,10 +2286,7 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
         * includes both small-block allocations and inode-embedded data.
         */
        if (nbio->bio_offset == NOOFFSET) {
-               hammer2_inode_lock_sh(ip);
-
-               parent = hammer2_chain_lookup_init(ip->chain,
-                                                  HAMMER2_LOOKUP_SHARED);
+               parent = hammer2_inode_lock_sh(ip);
 
                chain = hammer2_chain_lookup(&parent, lbase, lbase,
                                             HAMMER2_LOOKUP_NODATA |
@@ -2295,6 +2302,7 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
                         */
                        KKASSERT(chain == parent);
                        hammer2_chain_unlock(chain);
+                       nbio->bio_offset = NOOFFSET;
                } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                        /*
                         * Data is on-media
@@ -2307,8 +2315,7 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
                } else {
                        panic("hammer2_strategy_read: unknown bref type");
                }
-               hammer2_chain_lookup_done(parent);
-               hammer2_inode_unlock_sh(ip);
+               hammer2_inode_unlock_sh(ip, parent);
        }
 
        if (hammer2_debug & 0x0020) {
@@ -2368,8 +2375,10 @@ hammer2_strategy_write(struct vop_strategy_args *ap)
                /*
                 * The data is embedded in the inode.  Note that strategy
                 * calls for embedded data are synchronous in order to
-                * ensure that ip->chain is stable.
+                * ensure that ip->chain is stable.  Chain modification
+                * status is handled by the caller.
                 */
                KKASSERT(bio->bio_offset == 0);
                KKASSERT(ip->chain && ip->chain->data);
+               KKASSERT(ip->chain->flags & HAMMER2_CHAIN_MODIFIED);
                chain = ip->chain;
@@ -2378,13 +2387,6 @@ hammer2_strategy_write(struct vop_strategy_args *ap)
                bp->b_resid = 0;
                bp->b_error = 0;
                biodone(nbio);
-
-               /*
-                * This special flag does not follow the normal MODIFY rules
-                * because we might deadlock on ip.  Instead we depend on
-                * VOP_FSYNC() to detect the case.
-                */
-               atomic_set_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
        } else {
                /*
                 * Forward direct IO to the device