hammer2 - Major hammer2_chain_*() API cleanup
author Matthew Dillon <dillon@apollo.backplane.com>
Thu, 22 Mar 2012 04:10:40 +0000 (21:10 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Thu, 22 Mar 2012 05:34:50 +0000 (22:34 -0700)
* Do a major rewrite of the hammer2_chain_*() API functions, primarily
  dealing with whether the caller wants data instantiated for a chain
  element or not.

  Also revamp the locking calls.  Integrate the ref count into
  hammer2_chain_lock() to simplify the operation.

* Add infrastructure to allow a minimum physical I/O size to be specified
  independently from the allocation size.  Not finished yet and currently
  disabled.

* Optimize I/O a little, but this still needs a ton of work.
  Set B_CLUSTEROK for logical buffers.

* Add some statistics gathering.

* Remove several duplicative functions.

sys/vfs/hammer2/TODO
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index ad85d1f..df6ae9d 100644 (file)
@@ -1,5 +1,8 @@
 * Nesting problems in the flusher.
 
+* Inefficient vfsync due to thousands of file buffers, one per-vnode.
+  (need to aggregate using a device buffer?)
+
 * Adjust the flusher to unlock the parent after the child is locked,
   then restart if the parent changed out from under us.  This will
   greatly reduce namecache contention.
index eee41d1..b19565a 100644 (file)
@@ -127,7 +127,7 @@ SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_DIRTYBP          0x00000004      /* dirty on unlock */
 #define HAMMER2_CHAIN_SUBMODIFIED      0x00000008      /* 1+ subs modified */
 #define HAMMER2_CHAIN_DELETED          0x00000010
-#define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial write */
+#define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial create */
 #define HAMMER2_CHAIN_FLUSHED          0x00000040      /* flush on unlock */
 #define HAMMER2_CHAIN_MOVED            0x00000080      /* moved */
 #define HAMMER2_CHAIN_IOFLUSH          0x00000100      /* bawrite on put */
@@ -141,6 +141,22 @@ SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
 #define HAMMER2_LOOKUP_NODATA          0x00000002      /* data left NULL */
 
 /*
+ * Flags passed to hammer2_chain_modify() and hammer2_chain_resize()
+ *
+ * NOTE: OPTDATA allows us to avoid instantiating buffers for INDIRECT
+ *      blocks in the INITIAL-create state.
+ */
+#define HAMMER2_MODIFY_NOSUB           0x00000001      /* do not set SUBMOD */
+#define HAMMER2_MODIFY_OPTDATA         0x00000002      /* data can be NULL */
+
+/*
+ * Flags passed to hammer2_chain_lock()
+ */
+#define HAMMER2_RESOLVE_NEVER          1
+#define HAMMER2_RESOLVE_MAYBE          2
+#define HAMMER2_RESOLVE_ALWAYS         3
+
+/*
  * Cluster different types of storage together for allocations
  */
 #define HAMMER2_FREECACHE_INODE                0
@@ -228,6 +244,7 @@ struct hammer2_mount {
        hammer2_chain_t *schain;        /* super-root */
        hammer2_chain_t *rchain;        /* label-root */
        struct hammer2_inode *iroot;
+       struct lock     alloclk;        /* lockmgr lock */
 
        hammer2_volume_data_t voldata;
        hammer2_off_t   freecache[HAMMER2_FREECACHE_TYPES][HAMMER2_MAX_RADIX];
@@ -261,6 +278,21 @@ extern struct vop_ops hammer2_spec_vops;
 extern struct vop_ops hammer2_fifo_vops;
 
 extern int hammer2_debug;
+extern int hammer2_cluster_enable;
+extern long hammer2_iod_file_read;
+extern long hammer2_iod_meta_read;
+extern long hammer2_iod_indr_read;
+extern long hammer2_iod_file_write;
+extern long hammer2_iod_meta_write;
+extern long hammer2_iod_indr_write;
+extern long hammer2_iod_volu_write;
+extern long hammer2_ioa_file_read;
+extern long hammer2_ioa_meta_read;
+extern long hammer2_ioa_indr_read;
+extern long hammer2_ioa_file_write;
+extern long hammer2_ioa_meta_write;
+extern long hammer2_ioa_indr_write;
+extern long hammer2_ioa_volu_write;
 
 /*
  * hammer2_subr.c
@@ -318,21 +350,18 @@ hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
 void hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 void hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 void hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-int hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+int hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how);
+void hammer2_chain_moved(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 void hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                               int setsubmod);
+                               int flags);
 void hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                               int nradix);
-void hammer2_chain_modify_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain);
-void hammer2_chain_resize_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                               int nradix);
+                               int nradix, int flags);
 void hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 hammer2_chain_t *hammer2_chain_find(hammer2_mount_t *hmp,
                                hammer2_chain_t *parent, int index);
 hammer2_chain_t *hammer2_chain_get(hammer2_mount_t *hmp,
                                hammer2_chain_t *parent,
                                int index, int flags);
-void hammer2_chain_put(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 hammer2_chain_t *hammer2_chain_lookup(hammer2_mount_t *hmp,
                                hammer2_chain_t **parentp,
                                hammer2_key_t key_beg, hammer2_key_t key_end,
index 9d5bfed..c916115 100644 (file)
@@ -49,8 +49,6 @@
 
 #include "hammer2.h"
 
-SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
-
 static int hammer2_indirect_optimize;  /* XXX SYSCTL */
 
 static hammer2_chain_t *hammer2_chain_create_indirect(
@@ -58,8 +56,10 @@ static hammer2_chain_t *hammer2_chain_create_indirect(
                        hammer2_key_t key, int keybits);
 
 /*
- * Compare function for chain splay tree
+ * Splay tree
  */
+SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
+
 int
 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
 {
@@ -67,6 +67,33 @@ hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
 }
 
 /*
+ * Recursively mark the parent chain elements so flushes can find
+ * modified elements.
+ *
+ * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
+ *      during the flush recursion after clearing the parent's
+ *      SUBMODIFIED bit.  We don't want to re-set the parent's
+ *      SUBMODIFIED bit in this case!
+ *
+ * XXX rename of parent can create a SMP race
+ */
+static void
+hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       hammer2_chain_t *parent;
+
+       if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
+               parent = chain->parent;
+               while (parent &&
+                      (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
+                       atomic_set_int(&parent->flags,
+                                      HAMMER2_CHAIN_SUBMODIFIED);
+                       parent = parent->parent;
+               }
+       }
+}
+
+/*
  * Allocate a new disconnected chain element representing the specified
  * bref.  The chain element is locked exclusively and refs is set to 1.
  *
@@ -225,69 +252,121 @@ hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 }
 
 /*
- * Lock a chain element, acquiring its data with I/O if necessary.
+ * Ref and lock a chain element, acquiring its data with I/O if necessary,
+ * and specify how you would like the data to be resolved.
  *
  * Returns 0 on success or an error code if the data could not be acquired.
  * The chain element is locked either way.
  *
- * chain->data will be pointed either at the embedded data (e.g. for
- * inodes), in which case the buffer cache buffer is released, or will
- * point into the bp->b_data buffer with the bp left intact while locked.
+ * The lock is allowed to recurse, multiple locking ops will aggregate
+ * the requested resolve types.  Once data is assigned it will not be
+ * removed until the last unlock.
+ *
+ * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
+ *                        (typically used to avoid device/logical buffer
+ *                         aliasing for data)
+ *
+ * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
+ *                        the INITIAL-create state (indirect blocks only).
+ *
+ *                        Do not resolve data elements for DATA chains.
+ *                        (typically used to avoid device/logical buffer
+ *                         aliasing for data)
+ *
+ * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
+ *
  *
- * NOTE: Chain elements of type DATA do not instantiate a buffer or set
- *      the data pointer.
+ * NOTE: Embedded elements (volume header, inodes) are always resolved
+ *      regardless.
+ *
+ * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
+ *      element will instantiate and zero its buffer, and flush it on
+ *      release.
+ *
+ * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
+ *      so as not to instantiate a device buffer, which could alias against
+ *      a logical file buffer.  However, if ALWAYS is specified the
+ *      device buffer will be instantiated anyway.
  */
 int
-hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
 {
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
+       hammer2_off_t peof;
+       size_t boff;
+       size_t bbytes;
        int error;
+       char *bdata;
 
        /*
         * Lock the element.  Under certain conditions this might end up
         * being a recursive lock.
         */
        KKASSERT(chain->refs > 0);
+       atomic_add_int(&chain->refs, 1);
        lockmgr(&chain->lk, LK_EXCLUSIVE);
 
        /*
-        * The volume header is a special case
-        */
-       if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME)
-               return(0);
-
-       /*
-        * bp must be NULL, so if the data pointer is valid here it points
-        * to embedded data and no I/O is necessary (whether modified or not).
+        * If we already have a valid data pointer no further action is
+        * necessary.
         */
-       KKASSERT(chain->bp == NULL);
        if (chain->data)
                return (0);
 
        /*
-        * We do not instantiate a device buffer for DATA chain elements,
-        * as this would cause unnecessary double-buffering.
+        * Do we have to resolve the data?
         */
-       if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+       switch(how) {
+       case HAMMER2_RESOLVE_NEVER:
                return(0);
+       case HAMMER2_RESOLVE_MAYBE:
+               if (chain->flags & HAMMER2_CHAIN_INITIAL)
+                       return(0);
+               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+                       return(0);
+               /* fall through */
+       case HAMMER2_RESOLVE_ALWAYS:
+               break;
+       }
 
        /*
-        * If data is NULL we must issue I/O.  Any error returns the error
-        * code but leaves the chain locked.
+        * We must resolve to a device buffer, either by issuing I/O or
+        * by creating a zero-fill element.  We do not mark the buffer
+        * dirty when creating a zero-fill element (the hammer2_chain_modify()
+        * API must still be used to do that).
         *
-        * If the chain was modified a new bref will have already been
-        * allocated and its related bp is probably still sitting in the
-        * buffer cache.
+        * The device buffer is variable-sized in powers of 2 down
+        * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
+        * chunk always contains buffers of the same size. (XXX)
         *
-        * The buffer cache buffer is variable-sized in powers of 2 down
-        * to HAMMER2_MINIOSIZE (typically 1K).
+        * The minimum physical IO size may be larger than the variable
+        * block size.
         */
        bref = &chain->bref;
 
-       pbase = bref->data_off & ~(hammer2_off_t)(chain->bytes - 1);
+       if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+               bbytes = HAMMER2_MINIOSIZE;
+       pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
+       peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
+       boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
        KKASSERT(pbase != 0);
-       error = bread(hmp->devvp, pbase, chain->bytes, &chain->bp);
+
+       /*
+        * The getblk() optimization can only be used on newly created
+        * elements if the physical block size matches the request.
+        */
+       if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
+           chain->bytes == bbytes) {
+               chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+               error = 0;
+       } else if (hammer2_cluster_enable) {
+               error = cluster_read(hmp->devvp, peof, pbase, bbytes,
+                                    HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
+                                    &chain->bp);
+       } else {
+               error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
+       }
 
        if (error) {
                kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
@@ -298,6 +377,13 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        }
 
        /*
+        * Zero the data area if the chain is in the INITIAL-create state
+        */
+       bdata = (char *)chain->bp->b_data + boff;
+       if (chain->flags & HAMMER2_CHAIN_INITIAL)
+               bzero(bdata, chain->bytes);
+
+       /*
         * Setup the data pointer, either pointing it to an embedded data
         * structure and copying the data from the buffer, or pointing it
         * into the buffer.
@@ -311,19 +397,23 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                /*
                 * Copy data from bp to embedded buffer
                 */
-               KKASSERT(0);    /* not yet - have mount use this soon */
+               panic("hammer2_chain_lock: called on unresolved volume header");
+#if 0
+               /* NOT YET */
                KKASSERT(pbase == 0);
                KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
-               bcopy(chain->bp->b_data, &hmp->voldata, chain->bytes);
+               bcopy(bdata, &hmp->voldata, chain->bytes);
                chain->data = (void *)&hmp->voldata;
                bqrelse(chain->bp);
                chain->bp = NULL;
+#endif
                break;
        case HAMMER2_BREF_TYPE_INODE:
                /*
-                * Copy data from bp to embedded buffer.
+                * Copy data from bp to embedded buffer, do not retain the
+                * device buffer.
                 */
-               bcopy(chain->bp->b_data, &chain->u.ip->ip_data, chain->bytes);
+               bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
                chain->data = (void *)&chain->u.ip->ip_data;
                bqrelse(chain->bp);
                chain->bp = NULL;
@@ -332,163 +422,149 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        case HAMMER2_BREF_TYPE_DATA:
        default:
                /*
-                * Leave bp intact
+                * Point data at the device buffer and leave bp intact.
                 */
-               chain->data = (void *)chain->bp->b_data;
+               chain->data = (void *)bdata;
                break;
        }
        return (0);
 }
 
 /*
- * Recursively mark the parent chain elements so flushes can find
- * modified elements.
- *
- * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
- *      during the flush recursion after clearing the parent's
- *      SUBMODIFIED bit.  We don't want to re-set the parent's
- *      SUBMODIFIED bit in this case!
- *
- * XXX rename of parent can create a SMP race
- */
-static void
-hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
-{
-       hammer2_chain_t *parent;
-
-       if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
-               parent = chain->parent;
-               while (parent &&
-                      (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
-                       atomic_set_int(&parent->flags,
-                                      HAMMER2_CHAIN_SUBMODIFIED);
-                       parent = parent->parent;
-               }
-       }
-}
-
-/*
- * Resize the chain's physical storage allocation.  Chains can be resized
- * smaller without reallocating the storage.  Resizing larger will reallocate
- * the storage.
+ * Unlock and deref a chain element.
  *
- * Must be passed a locked chain
+ * On the last lock release any non-embedded data (chain->bp) will be
+ * retired.
  */
 void
-hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain, int nradix)
+hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 {
-       struct buf *nbp;
-       size_t obytes;
-       size_t nbytes;
-       void *ndata;
-       int error;
+       long *counterp;
 
        /*
-        * Only data and indirect blocks can be resized for now
+        * Undo a recursive lock
         */
-       KKASSERT(chain != &hmp->vchain);
-       KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
-                chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
+       if (lockcountnb(&chain->lk) > 1) {
+               KKASSERT(chain->refs > 1);
+               atomic_add_int(&chain->refs, -1);
+               lockmgr(&chain->lk, LK_RELEASE);
+               return;
+       }
 
        /*
-        * Nothing to do if the element is already the proper size
+        * Shortcut the case if the data is embedded or not resolved.
+        * Do NOT null-out pointers to embedded data (e.g. inode).
         */
-       obytes = chain->bytes;
-       nbytes = 1 << nradix;
-       if (obytes == nbytes)
+       if (chain->bp == NULL) {
+               lockmgr(&chain->lk, LK_RELEASE);
+               hammer2_chain_drop(hmp, chain);
                return;
+       }
 
        /*
-        * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
-        * modified flags share the same ref.
+        * Statistics
         */
-       if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0) {
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
-               hammer2_chain_ref(hmp, chain);
-       }
-
-       if (nbytes < obytes) {
-               /*
-                * If we are making it smaller we don't have to reallocate
-                * the block but we still need to resize it.
-                */
-               chain->bref.data_off &= ~HAMMER2_OFF_MASK_RADIX;
-               chain->bref.data_off |= (nradix & HAMMER2_OFF_MASK_RADIX);
-               chain->bytes = nbytes;
-               allocbuf(chain->bp, nbytes);
-       } else {
-               /*
-                * Otherwise we do
-                */
-               chain->bref.data_off =
-                       hammer2_freemap_alloc(hmp, chain->bref.type, nbytes);
-               chain->bytes = nbytes;
-
+       if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
+               ;
+       } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
                switch(chain->bref.type) {
-               case HAMMER2_BREF_TYPE_VOLUME:          /* embedded */
-               case HAMMER2_BREF_TYPE_INODE:           /* embedded */
-                       /*
-                        * data points to embedded structure, no copy needed
-                        */
-                       error = 0;
+               case HAMMER2_BREF_TYPE_DATA:
+                       counterp = &hammer2_ioa_file_write;
+                       break;
+               case HAMMER2_BREF_TYPE_INODE:
+                       counterp = &hammer2_ioa_meta_write;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
-                       panic("hammer2_chain_resize: "
-                             "cannot resize indirect block");
-                       /* NOT REACHED */
+                       counterp = &hammer2_ioa_indr_write;
+                       break;
+               default:
+                       counterp = &hammer2_ioa_volu_write;
                        break;
+               }
+               ++*counterp;
+       } else {
+               switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_DATA:
-                       /*
-                        * data (if not NULL) points into original bp or
-                        * to embedded data.  Copy-on-write to new block.
-                        */
-                       KKASSERT(chain != &hmp->vchain);        /* safety */
-                       nbp = getblk(hmp->devvp,
-                                    chain->bref.data_off &
-                                     ~(hammer2_off_t)(nbytes - 1),
-                                    nbytes, 0, 0);
-                       vfs_bio_clrbuf(nbp);
-                       error = 0;
-
-                       /*
-                        * The new block is larger than the old one, only
-                        * copy what fits.
-                        */
-                       ndata = nbp->b_data;
-                       if (chain->data) {
-                               if (nbytes < obytes)
-                                       bcopy(chain->data, ndata, nbytes);
-                               else
-                                       bcopy(chain->data, ndata, obytes);
-                               KKASSERT(chain->bp != NULL);
-                       }
-                       if (chain->bp) {
-                               chain->bp->b_flags |= B_RELBUF;
-                               brelse(chain->bp);
-                       }
-                       chain->bp = nbp;
-                       chain->data = ndata;
+                       counterp = &hammer2_iod_file_write;
+                       break;
+               case HAMMER2_BREF_TYPE_INODE:
+                       counterp = &hammer2_iod_meta_write;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+                       counterp = &hammer2_iod_indr_write;
                        break;
                default:
-                       panic("hammer2_chain_modify: unknown bref type");
+                       counterp = &hammer2_iod_volu_write;
                        break;
+               }
+               ++*counterp;
+       }
 
+       /*
+        * Clean out the bp.
+        *
+        * If a device buffer was used for data be sure to destroy the
+        * buffer when we are done to avoid aliases (XXX what about the
+        * underlying VM pages?).
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+               chain->bp->b_flags |= B_RELBUF;
+
+       chain->data = NULL;
+       if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+               if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
+                       atomic_clear_int(&chain->flags,
+                                        HAMMER2_CHAIN_IOFLUSH);
+                       chain->bp->b_flags |= B_RELBUF;
+                       bawrite(chain->bp);
+               } else {
+                       chain->bp->b_flags |= B_CLUSTEROK;
+                       bdwrite(chain->bp);
+               }
+       } else {
+               if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
+                       atomic_clear_int(&chain->flags,
+                                        HAMMER2_CHAIN_IOFLUSH);
+                       chain->bp->b_flags |= B_RELBUF;
+                       brelse(chain->bp);
+               } else {
+                       /* bp might still be dirty */
+                       bqrelse(chain->bp);
                }
        }
-       hammer2_chain_parent_setsubmod(hmp, chain);
+       chain->bp = NULL;
+       lockmgr(&chain->lk, LK_RELEASE);
+       hammer2_chain_drop(hmp, chain);
 }
 
 /*
- * This is the same as hammer2_chain_resize() except the chain does NOT
- * have to be locked and any underlying data is NOT copied to the new
- * location.
+ * Resize the chain's physical storage allocation.  Chains can be resized
+ * smaller without reallocating the storage.  Resizing larger will reallocate
+ * the storage.
+ *
+ * Must be passed a locked chain.  If you want the resize to copy the data
+ * you should lock the chain with RESOLVE_MAYBE or RESOLVE_ALWAYS, otherwise
+ * the resize operation will not copy the data.
+ *
+ * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
+ * to avoid instantiating a device buffer that conflicts with the vnode
+ * data buffer.
+ *
+ * XXX flags currently ignored, uses chain->bp to detect data/no-data.
  */
 void
-hammer2_chain_resize_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                          int nradix)
+hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain,
+                    int nradix, int flags)
 {
+       struct buf *nbp;
+       hammer2_off_t pbase;
        size_t obytes;
        size_t nbytes;
+       size_t bbytes;
+       int boff;
+       char *bdata;
+       int error;
 
        /*
         * Only data and indirect blocks can be resized for now
@@ -505,9 +581,6 @@ hammer2_chain_resize_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain,
        if (obytes == nbytes)
                return;
 
-       lockmgr(&chain->lk, LK_EXCLUSIVE);
-       KKASSERT(chain->bp == NULL);
-
        /*
         * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
         * modified flags share the same ref.
@@ -517,24 +590,65 @@ hammer2_chain_resize_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                hammer2_chain_ref(hmp, chain);
        }
 
-       if (nbytes < obytes) {
+       /*
+        * Relocate the block, even if making it smaller (because different
+        * block sizes may be in different regions).
+        */
+       chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
+                                                    nbytes);
+       chain->bytes = nbytes;
+
+       /*
+        * The device buffer may be larger than the allocation size.
+        */
+       if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+               bbytes = HAMMER2_MINIOSIZE;
+       pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
+       boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+       /*
+        * Only copy the data if resolved, otherwise the caller is
+        * responsible.
+        */
+       if (chain->bp) {
+               KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
+                        chain->bref.type == HAMMER2_BREF_TYPE_DATA);
+               KKASSERT(chain != &hmp->vchain);        /* safety */
+
                /*
-                * If we are making it smaller we don't have to reallocate
-                * the block but we still need to resize it.
+                * The getblk() optimization can only be used if the
+                * physical block size matches the request.
                 */
-               chain->bref.data_off &= ~HAMMER2_OFF_MASK_RADIX;
-               chain->bref.data_off |= (nradix & HAMMER2_OFF_MASK_RADIX);
-               chain->bytes = nbytes;
-       } else {
+               if (nbytes == bbytes) {
+                       nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+                       error = 0;
+               } else {
+                       error = bread(hmp->devvp, pbase, bbytes, &nbp);
+                       KKASSERT(error == 0);
+               }
+               bdata = (char *)nbp->b_data + boff;
+
+               if (nbytes < obytes) {
+                       bcopy(chain->data, bdata, nbytes);
+               } else {
+                       bcopy(chain->data, bdata, obytes);
+                       bzero(bdata + obytes, nbytes - obytes);
+               }
+
                /*
-                * Otherwise we do
+                * NOTE: The INITIAL state of the chain is left intact.
+                *
+                * NOTE: Because of the reallocation we have to set DIRTYBP
+                *       if INITIAL is not set.
                 */
-               chain->bref.data_off =
-                       hammer2_freemap_alloc(hmp, chain->bref.type, nbytes);
-               chain->bytes = nbytes;
+               chain->bp->b_flags |= B_RELBUF;
+               brelse(chain->bp);
+               chain->bp = nbp;
+               chain->data = (void *)bdata;
+               if ((chain->flags & HAMMER2_CHAIN_INITIAL) == 0)
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
        }
        hammer2_chain_parent_setsubmod(hmp, chain);
-       lockmgr(&chain->lk, LK_RELEASE);
 }
 
 /*
@@ -543,42 +657,30 @@ hammer2_chain_resize_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain,
  * If not already marked modified a new physical block will be allocated
  * and assigned to the bref.
  *
- * allocated->modified (without calling hammer2_chain_lock()) results
- * in chain->data typically being NULL.  In this situation chain->data
- * is assigned and the target area is zero'd out.
- *
- * If the data is pointing into a bp it will be relocated to a new bp.
- * If the data is embedded we leave it alone for now.
+ * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
+ *                  level or the COW operation will not work.
  *
- * NOTE: Not used for DATA chain types, hammer2_chain_modify_quick() is
- *      used instead.  We don't want to allocate a device buffer for
- *      data that would interfere with the file's logical buffers.
+ * Data blocks    - The chain is usually locked RESOLVE_NEVER so as not to
+ *                  run the data through the device buffers.
  */
 void
-hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
-                    int setsubmod)
+hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
 {
-       hammer2_off_t pbase;
        struct buf *nbp;
-       void *ndata;
        int error;
-
-       KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
-
-       /*
-        * Setting the DIRTYBP flag will cause the buffer to be dirtied or
-        * written-out on unlock.  This bit is independent of the MODIFIED1
-        * bit because the chain may still need meta-data adjustments done
-        * by virtue of MODIFIED1 for its parent, and the buffer can be
-        * flushed out (possibly multiple times) by the OS before that.
-        */
-       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+       hammer2_off_t pbase;
+       size_t bbytes;
+       size_t boff;
+       void *bdata;
 
        /*
         * If the chain is already marked MODIFIED1 we can just return.
         */
        if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
-               KKASSERT(chain->data != NULL);
+               if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
+                   chain->bp == NULL) {
+                       goto skip1;
+               }
                return;
        }
 
@@ -604,7 +706,7 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
         */
        if (chain != &hmp->vchain) {
                if ((hammer2_debug & 0x0001) &&
-                   (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
+                   (chain->bref.data_off & HAMMER2_OFF_MASK)) {
                        kprintf("Replace %d\n", chain->bytes);
                }
                chain->bref.data_off =
@@ -613,156 +715,102 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                /* XXX failed allocation */
        }
 
+       /*
+        * If data instantiation is optional and the chain has no current
+        * data association (typical for DATA and newly-created INDIRECT
+        * elements), don't instantiate the buffer now.
+        */
+       if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
+               goto skip2;
+
+skip1:
+       /*
+        * Setting the DIRTYBP flag will cause the buffer to be dirtied or
+        * written-out on unlock.  This bit is independent of the MODIFIED1
+        * bit because the chain may still need meta-data adjustments done
+        * by virtue of MODIFIED1 for its parent, and the buffer can be
+        * flushed out (possibly multiple times) by the OS before that.
+        *
+        * Clearing the INITIAL flag (for indirect blocks) indicates that
+        * a zero-fill buffer has been instantiated.
+        */
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
+
+       /*
+        * We currently should never instantiate a device buffer for a
+        * data chain.
+        */
+       KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
+
+       /*
+        * Execute COW operation
+        */
        switch(chain->bref.type) {
-       case HAMMER2_BREF_TYPE_VOLUME:          /* embedded */
-       case HAMMER2_BREF_TYPE_INODE:           /* embedded */
+       case HAMMER2_BREF_TYPE_VOLUME:
+       case HAMMER2_BREF_TYPE_INODE:
                /*
-                * Inode and Volume data already points to the embedded
-                * structure, no copy is needed
+                * The data is embedded, no copy-on-write operation is
+                * needed.
                 */
-               error = 0;
+               KKASSERT(chain->bp == NULL);
                break;
-       case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_DATA:
+       case HAMMER2_BREF_TYPE_INDIRECT:
                /*
-                * data (if not NULL) points into original bp or to embedded
-                * data, copy-on-write to the new block.
-                *
-                * data (if NULL) indicates that no prior copy exists, the
-                * storage must be zero'd.
+                * Perform the copy-on-write operation
                 */
                KKASSERT(chain != &hmp->vchain);        /* safety */
-               pbase = chain->bref.data_off &
-                        ~(hammer2_off_t)(chain->bytes - 1);
-               nbp = getblk(hmp->devvp, pbase, chain->bytes, 0, 0);
-               vfs_bio_clrbuf(nbp);    /* XXX */
-               error = 0;
+               /*
+                * The device buffer may be larger than the allocation size.
+                */
+               if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+                       bbytes = HAMMER2_MINIOSIZE;
+               pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
+               boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+               /*
+                * The getblk() optimization can only be used if the
+                * physical block size matches the request.
+                */
+               if (chain->bytes == bbytes) {
+                       nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+                       error = 0;
+               } else {
+                       error = bread(hmp->devvp, pbase, bbytes, &nbp);
+                       KKASSERT(error == 0);
+               }
+               bdata = (char *)nbp->b_data + boff;
 
                /*
                 * Copy or zero-fill on write depending on whether
                 * chain->data exists or not.
                 */
-               ndata = nbp->b_data;
                if (chain->data) {
-                       bcopy(chain->data, ndata, chain->bytes);
+                       bcopy(chain->data, bdata, chain->bytes);
                        KKASSERT(chain->bp != NULL);
                } else {
-                       bzero(ndata, chain->bytes);
+                       bzero(bdata, chain->bytes);
                }
                if (chain->bp) {
                        chain->bp->b_flags |= B_RELBUF;
                        brelse(chain->bp);
                }
                chain->bp = nbp;
-               chain->data = ndata;
+               chain->data = bdata;
                break;
        default:
-               panic("hammer2_chain_modify: unknown bref type");
+               panic("hammer2_chain_modify: illegal non-embedded type %d",
+                     chain->bref.type);
                break;
 
        }
-       if (setsubmod)
+skip2:
+       if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
                hammer2_chain_parent_setsubmod(hmp, chain);
 }
 
 /*
- * Same as hammer2_chain_modify() except the chain does not have to be
- * locked and the underlying data will NOT be copied to the new location.
- */
-void
-hammer2_chain_modify_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain)
-{
-       /*
-        * Set the MODIFIED1 bit and handle degenerate cases.
-        *
-        * We do not set the DIRTYBP flag, we don't want the flush code to
-        * read-modify-write the underlying physical buffer because it
-        * is probably aliased against a logical buffer.
-        *
-        * We must lock the chain but not instantiate its data.
-        *
-        * If the chain is already marked MODIFIED1 we can just return,
-        * but must interlock a failed test to avoid races.
-        */
-       if (chain->flags & HAMMER2_CHAIN_MODIFIED1)
-               return;
-       lockmgr(&chain->lk, LK_EXCLUSIVE);
-       if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
-               lockmgr(&chain->lk, LK_RELEASE);
-               return;
-       }
-       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
-       hammer2_chain_ref(hmp, chain);  /* ref for MODIFIED1 bit */
-
-       /*
-        * We must allocate the copy-on-write block.
-        *
-        * If the data is embedded no other action is required.
-        *
-        * If the data is not embedded we acquire and clear the
-        * new block.  If chain->data is not NULL we then do the
-        * copy-on-write.  chain->data will then be repointed to the new
-        * buffer and the old buffer will be released.
-        *
-        * For newly created elements with no prior allocation we go
-        * through the copy-on-write steps except without the copying part.
-        */
-       if (chain != &hmp->vchain) {
-               if ((hammer2_debug & 0x0001) &&
-                   (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
-                       kprintf("Replace %d\n", chain->bytes);
-               }
-               chain->bref.data_off =
-                       hammer2_freemap_alloc(hmp, chain->bref.type,
-                                             chain->bytes);
-               /* XXX failed allocation */
-       }
-       hammer2_chain_parent_setsubmod(hmp, chain);
-       lockmgr(&chain->lk, LK_RELEASE);
-}
-
-/*
- * Unlock a chain element without dropping its reference count.
- * (see hammer2_chain_put() to do both).
- *
- * Non-embedded data references (chain->bp != NULL) are returned to the
- * system and the data field is cleared in that case.  If modified the
- * dirty buffer is still returned to the system, can be flushed to disk by
- * the system at any time, and will be reconstituted/re-read as needed.
- */
-void
-hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
-{
-       if (chain->bp) {
-               chain->data = NULL;
-               if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
-                       if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
-                               atomic_clear_int(&chain->flags,
-                                                HAMMER2_CHAIN_IOFLUSH);
-                               chain->bp->b_flags |= B_RELBUF;
-                               bawrite(chain->bp);
-                       } else {
-                               chain->bp->b_flags |= B_CLUSTEROK;
-                               bdwrite(chain->bp);
-                       }
-               } else {
-                       if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
-                               atomic_clear_int(&chain->flags,
-                                                HAMMER2_CHAIN_IOFLUSH);
-                               chain->bp->b_flags |= B_RELBUF;
-                               brelse(chain->bp);
-                       } else {
-                               /* bp might still be dirty */
-                               bqrelse(chain->bp);
-                       }
-               }
-               chain->bp = NULL;
-       }
-       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
-       lockmgr(&chain->lk, LK_RELEASE);
-}
-
-/*
  * Locate an in-memory chain.  The parent must be locked.  The in-memory
  * chain is returned or NULL if no in-memory chain is present.
  *
@@ -791,6 +839,16 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        hammer2_blockref_t *bref;
        hammer2_chain_t *chain;
        hammer2_chain_t dummy;
+       int how;
+
+       /*
+        * Figure out how to lock.  MAYBE can be used to optimize
+        * the initial-create state for indirect blocks.
+        */
+       if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
+               how = HAMMER2_RESOLVE_NEVER;
+       else
+               how = HAMMER2_RESOLVE_MAYBE;
 
        /*
         * First see if we have a (possibly modified) chain element cached
@@ -802,13 +860,23 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        dummy.index = index;
        chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
        if (chain) {
-               hammer2_chain_ref(hmp, chain);
-               if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
-                       hammer2_chain_lock(hmp, chain);
+               if (flags & HAMMER2_LOOKUP_NOLOCK)
+                       hammer2_chain_ref(hmp, chain);
+               else
+                       hammer2_chain_lock(hmp, chain, how);
                return(chain);
        }
 
        /*
+        * the get function must always succeed, panic if there's no
+        * data to index.
+        */
+       if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+               panic("hammer2_chain_get: Missing bref(1)");
+               /* NOT REACHED */
+       }
+
+       /*
         * Otherwise lookup the bref and issue I/O (switch on the parent)
         */
        switch(parent->bref.type) {
@@ -817,6 +885,7 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                bref = &parent->data->ipdata.u.blockset.blockref[index];
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
+               KKASSERT(parent->data != NULL);
                KKASSERT(index >= 0 &&
                         index < parent->bytes / sizeof(hammer2_blockref_t));
                bref = &parent->data->npdata.blockref[index];
@@ -830,11 +899,16 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                panic("hammer2_chain_get: unrecognized blockref type: %d",
                      parent->bref.type);
        }
+       if (bref->type == 0) {
+               panic("hammer2_chain_get: Missing bref(2)");
+               /* NOT REACHED */
+       }
 
        /*
         * Allocate a chain structure representing the existing media
-        * entry.  Thus the chain is *not* INITIAL and certainly not
-        * MODIFIED (yet).
+        * entry.
+        *
+        * The locking operation we do later will issue I/O to read it.
         */
        chain = hammer2_chain_alloc(hmp, bref);
 
@@ -867,25 +941,16 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         *
         * If NOLOCK is set the release will release the one-and-only lock.
         */
-       if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
-               hammer2_chain_lock(hmp, chain);
-       lockmgr(&chain->lk, LK_RELEASE);
+       if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
+               hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
+               hammer2_chain_drop(hmp, chain);         /* excess ref */
+       }
+       lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
 
        return (chain);
 }
 
 /*
- * Unlock and dereference a chain after use.  It is possible for this to
- * recurse up the chain.
- */
-void
-hammer2_chain_put(hammer2_mount_t *hmp, hammer2_chain_t *chain)
-{
-       hammer2_chain_unlock(hmp, chain);
-       hammer2_chain_drop(hmp, chain);
-}
-
-/*
  * Locate any key between key_beg and key_end inclusive.  (*parentp)
  * typically points to an inode but can also point to a related indirect
  * block and this function will recurse upwards and find the inode again.
@@ -937,10 +1002,11 @@ hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
                           ((hammer2_key_t)1 << parent->bref.keybits) - 1;
                if (key_beg >= scan_beg && key_end <= scan_end)
                        break;
-               hammer2_chain_unlock(hmp, parent);
+               hammer2_chain_ref(hmp, parent);         /* ref old parent */
+               hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
                parent = parent->parent;
-               hammer2_chain_ref(hmp, parent);         /* ref new parent */
-               hammer2_chain_lock(hmp, parent);        /* lock new parent */
+                                                       /* lock new parent */
+               hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
                hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
                *parentp = parent;                      /* new parent */
        }
@@ -960,18 +1026,28 @@ again:
                 * This is only applicable to regular files and softlinks.
                 */
                if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
-                       hammer2_chain_ref(hmp, parent);
-                       if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
-                               hammer2_chain_lock(hmp, parent);
+                       if (flags & HAMMER2_LOOKUP_NOLOCK)
+                               hammer2_chain_ref(hmp, parent);
+                       else
+                               hammer2_chain_lock(hmp, parent,
+                                                  HAMMER2_RESOLVE_ALWAYS);
                        return (parent);
                }
                base = &parent->data->ipdata.u.blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               if (parent->data == NULL)
-                       panic("parent->data is NULL");
-               base = &parent->data->npdata.blockref[0];
+               /*
+                * Optimize indirect blocks in the INITIAL state to avoid
+                * I/O.
+                */
+               if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                       base = NULL;
+               } else {
+                       if (parent->data == NULL)
+                               panic("parent->data is NULL");
+                       base = &parent->data->npdata.blockref[0];
+               }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
@@ -991,9 +1067,14 @@ again:
        bref = NULL;
        for (i = 0; i < count; ++i) {
                tmp = hammer2_chain_find(hmp, parent, i);
-               bref = (tmp) ? &tmp->bref : &base[i];
-               if (bref->type == 0)
+               if (tmp) {
+                       bref = &tmp->bref;
+                       KKASSERT(bref->type != 0);
+               } else if (base == NULL || base[i].type == 0) {
                        continue;
+               } else {
+                       bref = &base[i];
+               }
                scan_beg = bref->key;
                scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
                if (key_beg <= scan_end && key_end >= scan_beg)
@@ -1016,15 +1097,21 @@ again:
 
        /*
         * If the chain element is an indirect block it becomes the new
-        * parent and we loop on it.  We must fixup the chain we loop on
-        * if the caller passed flags to us that aren't sufficient for our
-        * needs.
+        * parent and we loop on it.
+        *
+        * The parent always has to be locked with at least RESOLVE_MAYBE,
+        * so it might need a fixup if the caller passed incompatible flags.
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
                *parentp = parent = chain;
-               if (flags & HAMMER2_LOOKUP_NOLOCK)
-                       hammer2_chain_lock(hmp, chain);
+               if (flags & HAMMER2_LOOKUP_NOLOCK) {
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_drop(hmp, chain); /* excess ref */
+               } else if (flags & HAMMER2_LOOKUP_NODATA) {
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_unlock(hmp, chain);
+               }
                goto again;
        }
 
@@ -1074,7 +1161,7 @@ again:
                if (flags & HAMMER2_LOOKUP_NOLOCK)
                        hammer2_chain_drop(hmp, chain);
                else
-                       hammer2_chain_put(hmp, chain);
+                       hammer2_chain_unlock(hmp, chain);
 
                /*
                 * Any scan where the lookup returned degenerate data embedded
@@ -1095,9 +1182,6 @@ again:
                 */
                hammer2_chain_t *nparent;
 
-               if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT)
-                       return (NULL);
-
                scan_beg = parent->bref.key;
                scan_end = scan_beg +
                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
@@ -1107,9 +1191,10 @@ again:
                i = parent->index + 1;
                nparent = parent->parent;
                hammer2_chain_ref(hmp, nparent);        /* ref new parent */
-               hammer2_chain_unlock(hmp, parent);
-               hammer2_chain_lock(hmp, nparent);       /* lock new parent */
-               hammer2_chain_drop(hmp, parent);        /* drop old parent */
+               hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
+                                                       /* lock new parent */
+               hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
+               hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
                *parentp = parent = nparent;
        }
 
@@ -1124,7 +1209,12 @@ again2:
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               base = &parent->data->npdata.blockref[0];
+               if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                       base = NULL;
+               } else {
+                       KKASSERT(parent->data != NULL);
+                       base = &parent->data->npdata.blockref[0];
+               }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
@@ -1148,10 +1238,13 @@ again2:
        bref = NULL;
        while (i < count) {
                tmp = hammer2_chain_find(hmp, parent, i);
-               bref = (tmp) ? &tmp->bref : &base[i];
-               if (bref->type == 0) {
+               if (tmp) {
+                       bref = &tmp->bref;
+               } else if (base == NULL || base[i].type == 0) {
                        ++i;
                        continue;
+               } else {
+                       bref = &base[i];
                }
                scan_beg = bref->key;
                scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
@@ -1177,15 +1270,21 @@ again2:
 
        /*
         * If the chain element is an indirect block it becomes the new
-        * parent and we loop on it.  We may have to lock the chain when
-        * cycling it in as the new parent as it will not be locked if the
-        * caller passed NOLOCK.
+        * parent and we loop on it.
+        *
+        * The parent always has to be locked with at least RESOLVE_MAYBE,
+        * so it might need a fixup if the caller passed incompatible flags.
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
                *parentp = parent = chain;
-               if (flags & HAMMER2_LOOKUP_NOLOCK)
-                       hammer2_chain_lock(hmp, chain);
+               if (flags & HAMMER2_LOOKUP_NOLOCK) {
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_drop(hmp, chain); /* excess ref */
+               } else if (flags & HAMMER2_LOOKUP_NODATA) {
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_unlock(hmp, chain);
+               }
                i = 0;
                goto again2;
        }
@@ -1200,7 +1299,7 @@ again2:
  * Create and return a new hammer2 system memory structure of the specified
  * key, type and size and insert it RELATIVE TO (PARENT).
  *
- * (parent) is typically either an inode or an indirect  block, acquired
+ * (parent) is typically either an inode or an indirect block,
  * acquired as a side effect of issuing a prior failed lookup.  parent
  * must be locked and held.  Do not pass the inode chain to this function
  * unless that is the chain returned by the failed lookup.
@@ -1213,8 +1312,14 @@ again2:
  * to emplace the new indirect type.
  *
  * A new locked, referenced chain element is returned of the specified type.
- * This element will also be marked as modified and contain a data area
- * ready for initialization.
+ * The element may or may not have a data area associated with it:
+ *
+ *     VOLUME          not allowed here
+ *     INODE           embedded data area will be set-up
+ *     INDIRECT        not allowed here
+ *     DATA            no data area will be set-up (caller is expected
+ *                     to have logical buffers, we don't want to alias
+ *                     the data onto device buffers!).
  */
 hammer2_chain_t *
 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
@@ -1223,7 +1328,6 @@ hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 {
        hammer2_blockref_t dummy;
        hammer2_blockref_t *base;
-       hammer2_blockref_t *bref;
        hammer2_chain_t dummy_chain;
        int unlock_parent = 0;
        int allocated = 0;
@@ -1246,9 +1350,11 @@ hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                /*
                 * We set the WAS_MODIFIED flag here so the chain gets
                 * marked as modified below.
+                *
+                * We do NOT set INITIAL here (yet).  INITIAL is only
+                * used for indirect blocks.
                 */
-               chain->flags |= HAMMER2_CHAIN_INITIAL |
-                               HAMMER2_CHAIN_WAS_MODIFIED;
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
 
                /*
                 * Recalculate bytes to reflect the actual media block
@@ -1267,6 +1373,9 @@ hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                        chain->data = (void *)&chain->u.ip->ip_data;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
+                       panic("hammer2_chain_create: cannot be used to"
+                             "create indirect block");
+                       break;
                case HAMMER2_BREF_TYPE_DATA:
                default:
                        /* leave chain->data NULL */
@@ -1294,8 +1403,12 @@ again:
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               KKASSERT(parent->data != NULL);
-               base = &parent->data->npdata.blockref[0];
+               if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                       base = NULL;
+               } else {
+                       KKASSERT(parent->data != NULL);
+                       base = &parent->data->npdata.blockref[0];
+               }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
@@ -1316,14 +1429,19 @@ again:
         * in the parent's bref array.
         */
        bzero(&dummy_chain, sizeof(dummy_chain));
-       bref = NULL;
        for (i = 0; i < count; ++i) {
-               bref = &base[i];
-               dummy_chain.index = i;
-               if (bref->type == 0 &&
-                   SPLAY_FIND(hammer2_chain_splay,
-                              &parent->shead, &dummy_chain) == NULL) {
-                       break;
+               if (base == NULL) {
+                       dummy_chain.index = i;
+                       if (SPLAY_FIND(hammer2_chain_splay,
+                                      &parent->shead, &dummy_chain) == NULL) {
+                               break;
+                       }
+               } else if (base[i].type == 0) {
+                       dummy_chain.index = i;
+                       if (SPLAY_FIND(hammer2_chain_splay,
+                                      &parent->shead, &dummy_chain) == NULL) {
+                               break;
+                       }
                }
        }
 
@@ -1349,7 +1467,7 @@ again:
                }
                if (parent != nparent) {
                        if (unlock_parent)
-                               hammer2_chain_put(hmp, parent);
+                               hammer2_chain_unlock(hmp, parent);
                        parent = nparent;
                        unlock_parent = 1;
                }
@@ -1385,30 +1503,39 @@ again:
        /*
         * WAS_MODIFIED indicates that this is a newly-created chain element
         * rather than a renamed chain element.  In this situation we want
-        * to mark non-data chain elements as modified in order to resolve
-        * the data pointer.
+        * to place the chain element in the MODIFIED1 state.
         *
-        * data chain elements are marked modified but WITHOUT resolving the
-        * data pointer, as a device buffer would interfere otherwise.
+        * The data area will be set up as follows:
         *
-        * Chain elements with embedded data will not issue I/O at this time.
-        * A new block will be allocated for the buffer but not instantiated.
+        *      VOLUME          not allowed here.
         *
-        * NON-DATA chain elements which do not use embedded data will
-        * allocate the new block AND instantiate its buffer cache buffer,
-        * pointing the data at the bp.
+        *      INODE           embedded data area will be set-up.
+        *
+        *      INDIRECT        not allowed here.
+        *
+        *      DATA            no data area will be set-up (caller is expected
+        *                      to have logical buffers, we don't want to alias
+        *                      the data onto device buffers!).
         */
        if (chain->flags & HAMMER2_CHAIN_WAS_MODIFIED) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
-               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
-                       hammer2_chain_modify_quick(hmp, chain);
-               else
-                       hammer2_chain_modify(hmp, chain, 1);
+               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
+                       hammer2_chain_modify(hmp, chain,
+                                            HAMMER2_MODIFY_OPTDATA);
+               } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
+                       /* not supported in this function */
+                       panic("hammer2_chain_create: bad type");
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
+                       hammer2_chain_modify(hmp, chain,
+                                            HAMMER2_MODIFY_OPTDATA);
+               } else {
+                       hammer2_chain_modify(hmp, chain, 0);
+               }
        }
 
 done:
        if (unlock_parent)
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
        return (chain);
 }
 
@@ -1466,33 +1593,57 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        int i;
 
        /*
-        * Mark the parent modified so our base[] pointer remains valid
-        * while we move entries.
+        * Calculate the base blockref pointer or NULL if the chain
+        * is known to be empty.
         */
-       hammer2_chain_modify(hmp, parent, 1);
+       hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
+       if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+               base = NULL;
 
-       /*
-        * Locate a free blockref in the parent's array
-        */
-       switch(parent->bref.type) {
-       case HAMMER2_BREF_TYPE_INODE:
-               base = &parent->data->ipdata.u.blockset.blockref[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       case HAMMER2_BREF_TYPE_INDIRECT:
-               base = &parent->data->npdata.blockref[0];
-               count = parent->bytes / sizeof(hammer2_blockref_t);
-               break;
-       case HAMMER2_BREF_TYPE_VOLUME:
-               base = &hmp->voldata.sroot_blockset.blockref[0];
-               count = HAMMER2_SET_COUNT;
-               break;
-       default:
-               panic("hammer2_chain_create_indirect: "
-                     "unrecognized blockref type: %d",
-                     parent->bref.type);
-               count = 0;
-               break;
+               /*
+                * We still need to calculate the count for SPLAY lookups
+                */
+               switch(parent->bref.type) {
+               case HAMMER2_BREF_TYPE_INODE:
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+                       count = parent->bytes / sizeof(hammer2_blockref_t);
+                       break;
+               case HAMMER2_BREF_TYPE_VOLUME:
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               default:
+                       panic("hammer2_chain_create_indirect: "
+                             "unrecognized blockref type: %d",
+                             parent->bref.type);
+                       count = 0;
+                       break;
+               }
+       } else {
+               /*
+                * Locate a free blockref in the parent's array
+                */
+               switch(parent->bref.type) {
+               case HAMMER2_BREF_TYPE_INODE:
+                       base = &parent->data->ipdata.u.blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+                       base = &parent->data->npdata.blockref[0];
+                       count = parent->bytes / sizeof(hammer2_blockref_t);
+                       break;
+               case HAMMER2_BREF_TYPE_VOLUME:
+                       base = &hmp->voldata.sroot_blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               default:
+                       panic("hammer2_chain_create_indirect: "
+                             "unrecognized blockref type: %d",
+                             parent->bref.type);
+                       count = 0;
+                       break;
+               }
        }
 
        /*
@@ -1504,14 +1655,19 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        for (i = 0; i < count; ++i) {
                int nkeybits;
 
-               bref = &base[i];
-               if (bref->type == 0) {
+               /*
+                * Optimize the case where the parent is still in its
+                * initially created state.
+                */
+               if (base == NULL || base[i].type == 0) {
                        dummy.index = i;
-                       chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
-                                          &dummy);
+                       chain = SPLAY_FIND(hammer2_chain_splay,
+                                          &parent->shead, &dummy);
                        if (chain == NULL)
                                continue;
                        bref = &chain->bref;
+               } else {
+                       bref = &base[i];
                }
 
                /*
@@ -1608,7 +1764,7 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        dummy.bref.keybits = keybits;
        dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
        ichain = hammer2_chain_alloc(hmp, &dummy.bref);
-       ichain->flags |= HAMMER2_CHAIN_INITIAL;
+       atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
 
        /*
         * Iterate the original parent and move the matching brefs into
@@ -1622,11 +1778,10 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 * anyway so we can avoid checking the cache when the media
                 * has a key.
                 */
-               bref = &base[i];
-               if (bref->type == 0) {
+               if (base == NULL || base[i].type == 0) {
                        dummy.index = i;
-                       chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
-                                          &dummy);
+                       chain = SPLAY_FIND(hammer2_chain_splay,
+                                          &parent->shead, &dummy);
                        if (chain == NULL) {
                                /*
                                 * Select index indirect block is placed in
@@ -1636,6 +1791,8 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                                continue;
                        }
                        bref = &chain->bref;
+               } else {
+                       bref = &base[i];
                }
 
                /*
@@ -1672,24 +1829,20 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 *     data.  NODATA feature needed.
                 */
                chain = hammer2_chain_get(hmp, parent, i,
-                                         HAMMER2_LOOKUP_NOLOCK);
-               lockmgr(&chain->lk, LK_EXCLUSIVE);
+                                         HAMMER2_LOOKUP_NODATA);
                SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
                if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
                        panic("hammer2_chain_create_indirect: collision");
                chain->parent = ichain;
-               bzero(&base[i], sizeof(base[i]));
+               if (base)
+                       bzero(&base[i], sizeof(base[i]));
                atomic_add_int(&parent->refs, -1);
                atomic_add_int(&ichain->refs, 1);
-               if (chain->flags & HAMMER2_CHAIN_MOVED) {
-                       /* We don't need the ref from the chain_get */
-                       lockmgr(&chain->lk, LK_RELEASE);
-                       hammer2_chain_drop(hmp, chain);
-               } else {
-                       /* MOVED bit inherits the ref from the chain_get */
+               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+                       hammer2_chain_ref(hmp, chain);
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-                       lockmgr(&chain->lk, LK_RELEASE);
                }
+               hammer2_chain_unlock(hmp, chain);
                KKASSERT(parent->refs > 0);
                chain = NULL;
        }
@@ -1717,7 +1870,7 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * our moved blocks, then call setsubmod() to set the bit
         * recursively.
         */
-       hammer2_chain_modify(hmp, ichain, 1);
+       hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
        atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
        hammer2_chain_parent_setsubmod(hmp, ichain);
 
@@ -1729,14 +1882,14 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 * Key being created is way outside the key range,
                 * return the original parent.
                 */
-               hammer2_chain_put(hmp, ichain);
+               hammer2_chain_unlock(hmp, ichain);
        } else if (~(((hammer2_key_t)1 << keybits) - 1) &
                   (create_key ^ key)) {
                /*
                 * Key being created is outside the key range,
                 * return the original parent.
                 */
-               hammer2_chain_put(hmp, ichain);
+               hammer2_chain_unlock(hmp, ichain);
        } else {
                /*
                 * Otherwise its in the range, return the new parent.
@@ -1772,22 +1925,27 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 
        /*
         * Mark the parent modified so our base[] pointer remains valid
-        * while we move entries.
+        * while we move entries.  For the optimized indirect block
+        * case mark the parent moved instead.
         *
         * Calculate the blockref reference in the parent
         */
-       hammer2_chain_modify(hmp, parent, 1);
-
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
+               hammer2_chain_modify(hmp, parent, 0);
                base = &parent->data->ipdata.u.blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               base = &parent->data->npdata.blockref[0];
+               hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
+               if (parent->flags & HAMMER2_CHAIN_INITIAL)
+                       base = NULL;
+               else
+                       base = &parent->data->npdata.blockref[0];
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
+               hammer2_chain_modify(hmp, parent, 0);
                base = &hmp->voldata.sroot_blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
@@ -1803,8 +1961,8 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * disconnect in-memory fields from the parent.
         */
        KKASSERT(chain->index >= 0 && chain->index < count);
-       base += chain->index;
-       bzero(base, sizeof(*base));
+       if (base)
+               bzero(&base[chain->index], sizeof(*base));
 
        SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
@@ -1838,7 +1996,11 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain, int tab)
 {
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
+       size_t bbytes;
+       size_t boff;
+       char *bdata;
        struct buf *bp;
+       int error;
 
        if (hammer2_debug & 0x0008)
                kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
@@ -1894,8 +2056,7 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain, int tab)
                        /*
                         * Recurse the flush
                         */
-                       hammer2_chain_ref(hmp, scan);
-                       hammer2_chain_lock(hmp, scan);
+                       hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
                        if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
                                atomic_set_int(&scan->flags,
                                               HAMMER2_CHAIN_DESTROYED);
@@ -1919,7 +2080,7 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain, int tab)
                        }
                        if (hammer2_debug & 0x0008)
                                kprintf("\n");
-                       hammer2_chain_put(hmp, scan);
+                       hammer2_chain_unlock(hmp, scan);
                }
 
                if (submodified || (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
@@ -1950,7 +2111,7 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain, int tab)
                         *       recursively set the SUBMODIFIED flag
                         *       upward in this case!
                         */
-                       hammer2_chain_modify(hmp, chain, 0);
+                       hammer2_chain_modify(hmp, chain, HAMMER2_MODIFY_NOSUB);
 
                        switch(chain->bref.type) {
                        case HAMMER2_BREF_TYPE_INODE:
@@ -1983,16 +2144,15 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain, int tab)
                                                  &chain->shead, scan);
                                KKASSERT(scan->index >= 0 &&
                                         scan->index < count);
-                               hammer2_chain_ref(hmp, scan);
-                               lockmgr(&scan->lk, LK_EXCLUSIVE);
+                               hammer2_chain_lock(hmp, scan,
+                                                  HAMMER2_RESOLVE_NEVER);
                                base[scan->index] = scan->bref;
                                if (scan->flags & HAMMER2_CHAIN_MOVED) {
                                        atomic_clear_int(&scan->flags,
                                                 HAMMER2_CHAIN_MOVED);
                                        hammer2_chain_drop(hmp, scan);
                                }
-                               lockmgr(&scan->lk, LK_RELEASE);
-                               hammer2_chain_drop(hmp, scan);
+                               hammer2_chain_unlock(hmp, scan);
                        }
                }
        }
@@ -2067,30 +2227,57 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain, int tab)
                 * the vop_write code.
                 */
                break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               /*
+                * Indirect blocks may be in an INITIAL state.
+                */
+               break;
        default:
+               /*
+                * Embedded elements have to be flushed out.
+                */
                KKASSERT(chain->data != NULL);
                bref = &chain->bref;
 
-               pbase = bref->data_off & ~(hammer2_off_t)(chain->bytes - 1);
-               KKASSERT(pbase != 0);   /* not the root volume header */
+               KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
 
                if (chain->bp == NULL) {
                        /*
                         * The data is embedded, we have to acquire the
                         * buffer cache buffer and copy the data into it.
                         */
-                       bp = getblk(hmp->devvp, pbase, chain->bytes, 0, 0);
+                       if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+                               bbytes = HAMMER2_MINIOSIZE;
+                       pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
+                       boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+                       /*
+                        * The getblk() optimization can only be used if the
+                        * physical block size matches the request.
+                        */
+                       if (chain->bytes == bbytes) {
+                               bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+                               error = 0;
+                       } else {
+                               error = bread(hmp->devvp, pbase, bbytes, &bp);
+                               KKASSERT(error == 0);
+                       }
+                       bdata = (char *)bp->b_data + boff;
 
                        /*
                         * Copy the data to the buffer, mark the buffer
                         * dirty, and convert the chain to unmodified.
                         */
-                       bcopy(chain->data, bp->b_data, chain->bytes);
+                       bcopy(chain->data, bdata, chain->bytes);
                        bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
                        bp = NULL;
                        chain->bref.check.iscsi32.value =
                                hammer2_icrc32(chain->data, chain->bytes);
+                       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
+                               ++hammer2_iod_meta_write;
+                       else
+                               ++hammer2_iod_indr_write;
                } else {
                        chain->bref.check.iscsi32.value =
                                hammer2_icrc32(chain->data, chain->bytes);
@@ -2192,13 +2379,12 @@ hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        }
 
        /*
-        * We updating brefs but we have to call chain_modify() w/
+        * We are updating brefs but we have to call chain_modify() w/
         * setsubmod = TRUE because our caller is not a recursive
         * flush.
         */
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
-       hammer2_chain_modify(hmp, parent, 1);
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
+       hammer2_chain_modify(hmp, parent, 0);
 
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
@@ -2233,6 +2419,6 @@ hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                hammer2_chain_drop(hmp, chain);
        }
 
-       lockmgr(&parent->lk, LK_RELEASE);
-       hammer2_chain_put(hmp, parent);
+       lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
+       hammer2_chain_unlock(hmp, parent);
 }
index d458325..9f559c5 100644 (file)
 #define HAMMER2_KEY_RADIX      64      /* number of bits in key */
 
 /*
- * HAMMER2 utilizes 64K physical buffers and 16K logical filesystem buffers.
- * The smaller logical filesystem buffers reduce ram waste when the OS is
- * caching lots of small files.
+ * MINALLOCSIZE                - The minimum allocation size.  This can be smaller
+ *                       or larger than the minimum physical IO size.
+ *
+ *                       NOTE: Should not be larger than 1K since inodes
+ *                             are 1K.
+ *
+ * MINIOSIZE           - The minimum IO size.  This must be less than
+ *                       or equal to HAMMER2_PBUFSIZE.
+ *
+ *                       XXX currently must be set to MINALLOCSIZE until/if
+ *                           we deal with recursive buffer cache locks.
+ *
+ * HAMMER2_PBUFSIZE    - Topological block size used by files for all
+ *                       blocks except the block straddling EOF.
+ *
+ * HAMMER2_SEGSIZE     - Allocation map segment size (512KB here; typically 2MB)
  */
+
+#define HAMMER2_SEGSIZE                (65536 * 8)
+
 #define HAMMER2_PBUFRADIX      16      /* physical buf (1<<16) bytes */
 #define HAMMER2_PBUFSIZE       65536
 #define HAMMER2_LBUFRADIX      14      /* logical buf (1<<14) bytes */
 #define HAMMER2_LBUFSIZE       16384
-#define HAMMER2_MINIORADIX     10      /* minimum IO size for direct IO */
-#define HAMMER2_MINIOSIZE      1024
+
+#if 0
+#define HAMMER2_MINIORADIX     16      /* minimum physical IO size */
+#define HAMMER2_MINIOSIZE      65536
+#endif
+#define HAMMER2_MINIORADIX     HAMMER2_MINALLOCRADIX
+#define HAMMER2_MINIOSIZE      HAMMER2_MINALLOCSIZE
+
+#define HAMMER2_MINALLOCRADIX  10      /* minimum block allocation size */
+#define HAMMER2_MINALLOCSIZE   1024
 #define HAMMER2_IND_BYTES_MIN  4096    /* first indirect layer only */
 #define HAMMER2_IND_BYTES_MAX  HAMMER2_PBUFSIZE
 #define HAMMER2_IND_COUNT_MIN  (HAMMER2_IND_BYTES_MIN / \
 
 #define HAMMER2_PBUFMASK       (HAMMER2_PBUFSIZE - 1)
 #define HAMMER2_LBUFMASK       (HAMMER2_LBUFSIZE - 1)
+#define HAMMER2_SEGMASK                (HAMMER2_SEGSIZE - 1)
 
+#define HAMMER2_LBUFMASK64     ((hammer2_off_t)HAMMER2_LBUFMASK)
 #define HAMMER2_PBUFSIZE64     ((hammer2_off_t)HAMMER2_PBUFSIZE)
 #define HAMMER2_PBUFMASK64     ((hammer2_off_t)HAMMER2_PBUFMASK)
-#define HAMMER2_LBUFMASK64     ((hammer2_off_t)HAMMER2_LBUFMASK)
+#define HAMMER2_SEGSIZE64      ((hammer2_off_t)HAMMER2_SEGSIZE)
+#define HAMMER2_SEGMASK64      ((hammer2_off_t)HAMMER2_SEGMASK)
 
 #define HAMMER2_UUID_STRING    "5cbb9ad1-862d-11dc-a94d-01301bb8a9f5"
 
index 5cacf25..1f669c2 100644 (file)
@@ -57,6 +57,7 @@ hammer2_freemap_alloc(hammer2_mount_t *hmp, int type, size_t bytes)
 {
        hammer2_off_t data_off;
        hammer2_off_t data_next;
+       /*struct buf *bp;*/
        int radix;
        int fctype;
 
@@ -65,7 +66,7 @@ hammer2_freemap_alloc(hammer2_mount_t *hmp, int type, size_t bytes)
                fctype = HAMMER2_FREECACHE_INODE;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               fctype = HAMMER2_FREECACHE_INDIR;
+               fctype = HAMMER2_FREECACHE_INODE;
                break;
        case HAMMER2_BREF_TYPE_DATA:
                fctype = HAMMER2_FREECACHE_DATA;
@@ -81,34 +82,60 @@ hammer2_freemap_alloc(hammer2_mount_t *hmp, int type, size_t bytes)
        radix = hammer2_bytes_to_radix(bytes);
        bytes = 1 << radix;
 
+       lockmgr(&hmp->alloclk, LK_EXCLUSIVE);
        if (radix < HAMMER2_MAX_RADIX && hmp->freecache[fctype][radix]) {
                /*
                 * Allocate from our packing cache
                 */
                data_off = hmp->freecache[fctype][radix];
                hmp->freecache[fctype][radix] += bytes;
-               if ((hmp->freecache[fctype][radix] & HAMMER2_PBUFMASK) == 0)
+               if ((hmp->freecache[fctype][radix] & HAMMER2_SEGMASK) == 0)
                        hmp->freecache[fctype][radix] = 0;
        } else {
                /*
-                * Allocate from the allocation iterator using a PBUFSIZE
+                * Allocate from the allocation iterator using a SEGSIZE
                 * aligned block and reload the packing cache if possible.
                 */
                data_off = hmp->voldata.allocator_beg;
-               data_off = (data_off + HAMMER2_PBUFMASK64) &
-                          ~HAMMER2_PBUFMASK64;
+               data_off = (data_off + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
                data_next = data_off + bytes;
 
-               if ((data_next & HAMMER2_PBUFMASK) == 0) {
+               if ((data_next & HAMMER2_SEGMASK) == 0) {
                        hmp->voldata.allocator_beg = data_next;
                } else {
-                       KKASSERT(radix < HAMMER2_MAX_RADIX);
+                       KKASSERT(radix <= HAMMER2_MAX_RADIX);
                        hmp->voldata.allocator_beg =
-                                       (data_next + HAMMER2_PBUFMASK64) &
-                                       ~HAMMER2_PBUFMASK64;
+                                       (data_next + HAMMER2_SEGMASK64) &
+                                       ~HAMMER2_SEGMASK64;
                        hmp->freecache[fctype][radix] = data_next;
                }
        }
+       lockmgr(&hmp->alloclk, LK_RELEASE);
+
+#if 0
+       /*
+        * Allocations on-media are always in multiples of 64K but
+        * partial-block allocations can be tracked in-memory.
+        *
+        * We can reduce the need for read-modify-write IOs by
+        * telling the kernel that the contents of a new 64K block are
+        * initially good (before we use any of it).
+        *
+        * Worst case is the kernel evicts the buffer and causes HAMMER2's
+        * bread later on to actually issue a read I/O.
+        *
+        * XXX Maybe do this in SEGSIZE increments? Needs a lot of work.
+        *     Also watch out for buffer size mismatches.
+        */
+       if (bytes < HAMMER2_MINIOSIZE &&
+           (data_off & (HAMMER2_MINIOSIZE - 1)) == 0) {
+               bp = getblk(hmp->devvp, data_off, HAMMER2_MINIOSIZE, 0, 0);
+               bp->b_flags |= B_CACHE;
+               bp->b_resid = 0;
+               bqrelse(bp);
+       }
+#endif
+
        if (hammer2_debug & 0x0001) {
                kprintf("hammer2: allocate %d %016jx: %zd\n",
                        type, (intmax_t)data_off, bytes);
index 6e361ec..0530528 100644 (file)
@@ -218,8 +218,7 @@ hammer2_inode_create(hammer2_mount_t *hmp,
         * and iterate until we don't get one.
         */
        parent = &dip->chain;
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
 
        error = 0;
        while (error == 0) {
@@ -228,7 +227,7 @@ hammer2_inode_create(hammer2_mount_t *hmp,
                        break;
                if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
                        error = ENOSPC;
-               hammer2_chain_put(hmp, chain);
+               hammer2_chain_unlock(hmp, chain);
                chain = NULL;
                ++lhc;
        }
@@ -239,7 +238,7 @@ hammer2_inode_create(hammer2_mount_t *hmp,
                if (chain == NULL)
                        error = EIO;
        }
-       hammer2_chain_put(hmp, parent);
+       hammer2_chain_unlock(hmp, parent);
 
        /*
         * Handle the error case
@@ -305,8 +304,7 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *ip,
         * and iterate until we don't get one.
         */
        parent = &dip->chain;
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
 
        error = 0;
        while (error == 0) {
@@ -315,7 +313,7 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *ip,
                        break;
                if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
                        error = ENOSPC;
-               hammer2_chain_put(hmp, chain);
+               hammer2_chain_unlock(hmp, chain);
                chain = NULL;
                ++lhc;
        }
@@ -332,7 +330,7 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *ip,
                if (chain == NULL)
                        error = EIO;
        }
-       hammer2_chain_put(hmp, parent);
+       hammer2_chain_unlock(hmp, parent);
 
        /*
         * Handle the error case
@@ -348,7 +346,7 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *ip,
         */
        if (ip->ip_data.name_len != name_len ||
            bcmp(ip->ip_data.filename, name, name_len) != 0) {
-               hammer2_chain_modify(hmp, chain, 1);
+               hammer2_chain_modify(hmp, chain, 0);
                KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
                bcopy(name, ip->ip_data.filename, name_len);
                ip->ip_data.name_key = lhc;
@@ -410,13 +408,16 @@ hammer2_hardlink_create(hammer2_inode_t *ip, hammer2_inode_t *dip,
                 return error;
         }
         KKASSERT(nip->ip_data.type == HAMMER2_OBJTYPE_HARDLINK);
-        hammer2_chain_modify(&nip->chain, 1);
+        hammer2_chain_modify(&nip->chain, 0);
         nip->ip_data.inum = ip->ip_data.inum;
-       hammer2_chain_put(hmp, &nip->chain);
+       hammer2_chain_unlock(hmp, &nip->chain);
        /
 #endif
 }
 
+/*
+ * Calculate the allocation size for the file fragment straddling EOF
+ */
 int
 hammer2_inode_calc_alloc(hammer2_key_t filesize)
 {
@@ -425,7 +426,7 @@ hammer2_inode_calc_alloc(hammer2_key_t filesize)
 
        if (frag == 0)
                return(0);
-       for (radix = HAMMER2_MINIORADIX; frag > (1 << radix); ++radix)
+       for (radix = HAMMER2_MINALLOCRADIX; frag > (1 << radix); ++radix)
                ;
        return (radix);
 }
index 0a73741..bf7ec1c 100644 (file)
@@ -65,7 +65,7 @@
 void
 hammer2_inode_lock_ex(hammer2_inode_t *ip)
 {
-       hammer2_chain_lock(ip->hmp, &ip->chain);
+       hammer2_chain_lock(ip->hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
 }
 
 void
@@ -341,8 +341,8 @@ hammer2_calc_logical(hammer2_inode_t *ip, hammer2_off_t uoff,
        if (*lbasep == *leofp) {
                radix = hammer2_bytes_to_radix(
                                (size_t)(ip->ip_data.size - *leofp));
-               if (radix < HAMMER2_MINIORADIX)
-                       radix = HAMMER2_MINIORADIX;
+               if (radix < HAMMER2_MINALLOCRADIX)
+                       radix = HAMMER2_MINALLOCRADIX;
                *leofp += 1U << radix;
                return (1U << radix);
        } else {
index 3cc5d11..11bc088 100644 (file)
@@ -53,11 +53,56 @@ struct hammer2_sync_info {
 };
 
 int hammer2_debug;
+int hammer2_cluster_enable = 1;
+long hammer2_iod_file_read;
+long hammer2_iod_meta_read;
+long hammer2_iod_indr_read;
+long hammer2_iod_file_write;
+long hammer2_iod_meta_write;
+long hammer2_iod_indr_write;
+long hammer2_iod_volu_write;
+long hammer2_ioa_file_read;
+long hammer2_ioa_meta_read;
+long hammer2_ioa_indr_read;
+long hammer2_ioa_file_write;
+long hammer2_ioa_meta_write;
+long hammer2_ioa_indr_write;
+long hammer2_ioa_volu_write;
 
 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
 
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
+SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
+          &hammer2_cluster_enable, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
+          &hammer2_iod_file_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
+          &hammer2_iod_meta_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
+          &hammer2_iod_indr_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
+          &hammer2_iod_file_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
+          &hammer2_iod_meta_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
+          &hammer2_iod_indr_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
+          &hammer2_iod_volu_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
+          &hammer2_ioa_file_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
+          &hammer2_ioa_meta_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
+          &hammer2_ioa_indr_read, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
+          &hammer2_ioa_file_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
+          &hammer2_ioa_meta_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
+          &hammer2_ioa_indr_write, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
+          &hammer2_ioa_volu_write, 0, "");
 
 static int hammer2_vfs_init(struct vfsconf *conf);
 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
@@ -280,6 +325,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
        hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
        /* hmp->vchain.u.xxx is left NULL */
        lockinit(&hmp->vchain.lk, "volume", 0, LK_CANRECURSE);
+       lockinit(&hmp->alloclk, "h2alloc", 0, 0);
 
        /*
         * Install the volume header
@@ -308,19 +354,18 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
         */
        lhc = hammer2_dirhash(label, strlen(label));
        parent = &hmp->vchain;
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
        schain = hammer2_chain_lookup(hmp, &parent,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY, 0);
-       hammer2_chain_put(hmp, parent);
+       hammer2_chain_unlock(hmp, parent);
        if (schain == NULL) {
                kprintf("hammer2_mount: invalid super-root\n");
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EINVAL;
        }
 
+       hammer2_chain_ref(hmp, schain); /* for hmp->schain */
        parent = schain;
-       hammer2_chain_ref(hmp, parent); /* parent: lock+ref, schain: ref */
        rchain = hammer2_chain_lookup(hmp, &parent,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                      0);
@@ -334,14 +379,15 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                            lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                            0);
        }
-       hammer2_chain_put(hmp, parent);
+       hammer2_chain_unlock(hmp, parent);
        if (rchain == NULL) {
                kprintf("hammer2_mount: root label not found\n");
                hammer2_chain_drop(hmp, schain);
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EINVAL;
        }
-       hammer2_chain_unlock(hmp, rchain); /* rchain: ref */
+       hammer2_chain_ref(hmp, rchain); /* for hmp->rchain */
+       hammer2_chain_unlock(hmp, rchain);
 
        hmp->schain = schain;           /* left held & unlocked */
        hmp->rchain = rchain;           /* left held & unlocked */
@@ -572,7 +618,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                /* XXX */
        }
 #endif
-       hammer2_chain_lock(hmp, &hmp->vchain);
+       hammer2_chain_lock(hmp, &hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
        if (hmp->vchain.flags &
            (HAMMER2_CHAIN_MODIFIED1 | HAMMER2_CHAIN_SUBMODIFIED)) {
                hammer2_chain_flush(hmp, &hmp->vchain);
index a14b5ec..c49455a 100644 (file)
@@ -92,7 +92,7 @@ hammer2_vop_inactive(struct vop_inactive_args *ap)
        if (ip->chain.flags & HAMMER2_CHAIN_DIRTYEMBED) {
                hammer2_inode_lock_ex(ip);
                atomic_clear_int(&ip->chain.flags, HAMMER2_CHAIN_DIRTYEMBED);
-               hammer2_chain_modify(ip->hmp, &ip->chain, 1);
+               hammer2_chain_modify(ip->hmp, &ip->chain, 0);
                hammer2_inode_unlock_ex(ip);
        }
 
@@ -163,7 +163,7 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
         */
        if (ip->chain.flags & HAMMER2_CHAIN_DIRTYEMBED) {
                atomic_clear_int(&ip->chain.flags, HAMMER2_CHAIN_DIRTYEMBED);
-               hammer2_chain_modify(hmp, &ip->chain, 1);
+               hammer2_chain_modify(hmp, &ip->chain, 0);
        }
 
        /*
@@ -276,7 +276,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                                         ap->a_cred);
                if (error == 0) {
                        if (ip->ip_data.uflags != flags) {
-                               hammer2_chain_modify(hmp, &ip->chain, 1);
+                               hammer2_chain_modify(hmp, &ip->chain, 0);
                                ip->ip_data.uflags = flags;
                                doctime = 1;
                                kflags |= NOTE_ATTRIB;
@@ -406,10 +406,9 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
        lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;
 
        parent = &ip->chain;
-       hammer2_chain_ref(hmp, parent);
-       error = hammer2_chain_lock(hmp, parent);
+       error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
        if (error) {
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
                goto done;
        }
        chain = hammer2_chain_lookup(hmp, &parent, lkey, lkey, 0);
@@ -453,9 +452,9 @@ hammer2_vop_readdir(struct vop_readdir_args *ap)
                if (cookie_index == ncookies)
                        break;
        }
-       hammer2_chain_put(hmp, parent);
        if (chain)
-               hammer2_chain_put(hmp, chain);
+               hammer2_chain_unlock(hmp, chain);
+       hammer2_chain_unlock(hmp, parent);
 done:
        if (ap->a_eofflag)
                *ap->a_eofflag = (chain == NULL);
@@ -584,7 +583,7 @@ hammer2_vop_write(struct vop_write_args *ap)
         * might wind up being copied into the embedded data area.
         */
        hammer2_inode_lock_ex(ip);
-       hammer2_chain_modify(hmp, &ip->chain, 1);
+       hammer2_chain_modify(hmp, &ip->chain, 0);
        error = hammer2_write_file(ip, uio, ap->a_ioflag);
 
        hammer2_inode_unlock_ex(ip);
@@ -692,8 +691,9 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag)
                         *      the whole loop
                         */
                        hammer2_chain_unlock(ip->hmp, &ip->chain);
-                       bwillwrite(HAMMER2_LBUFSIZE);
-                       hammer2_chain_lock(ip->hmp, &ip->chain);
+                       bwillwrite(HAMMER2_PBUFSIZE);
+                       hammer2_chain_lock(ip->hmp, &ip->chain,
+                                          HAMMER2_RESOLVE_ALWAYS);
                }
 
                /* XXX bigwrite & signal check test */
@@ -785,7 +785,7 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag)
                 */
                hammer2_chain_unlock(ip->hmp, &ip->chain);
                error = uiomove(bp->b_data + loff, n, uio);
-               hammer2_chain_lock(ip->hmp, &ip->chain);
+               hammer2_chain_lock(ip->hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
                kflags |= NOTE_WRITE;
 
                if (error) {
@@ -802,10 +802,12 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag)
                if (ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
+                       bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
                } else if (ioflag & IO_ASYNC) {
                        bawrite(bp);
                } else {
+                       bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
                }
        }
@@ -846,8 +848,7 @@ hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
         * logical buffer cache buffer.
         */
        parent = &ip->chain;
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
 
        chain = hammer2_chain_lookup(hmp, &parent,
                                     lbase, lbase,
@@ -855,9 +856,10 @@ hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
 
        if (chain == NULL) {
                /*
-                * We found a hole, create a new chain entry.  No meta-data
-                * buffer or data pointer will be assigned (indicating
-                * new, unwritten storage).
+                * We found a hole, create a new chain entry.
+                *
+                * NOTE: DATA chains are created without device backing
+                *       store (nor do we want any).
                 */
                chain = hammer2_chain_create(hmp, parent, NULL,
                                             lbase, HAMMER2_PBUFRADIX,
@@ -881,7 +883,8 @@ hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
                                      "size mismatch %d/%d\n",
                                      lblksize, chain->bytes);
                        }
-                       hammer2_chain_modify_quick(hmp, chain);
+                       hammer2_chain_modify(hmp, chain,
+                                            HAMMER2_MODIFY_OPTDATA);
                        pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
                        break;
                default:
@@ -893,8 +896,8 @@ hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
        }
 
        if (chain)
-               hammer2_chain_put(hmp, chain);
-       hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, chain);
+       hammer2_chain_unlock(hmp, parent);
 
        return (pbase);
 }
@@ -923,7 +926,7 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        int oblksize;
        int nblksize;
 
-       hammer2_chain_modify(hmp, &ip->chain, 1);
+       hammer2_chain_modify(hmp, &ip->chain, 0);
        bp = NULL;
 
        /*
@@ -943,10 +946,9 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
         * Setup for lookup/search
         */
        parent = &ip->chain;
-       hammer2_chain_ref(hmp, parent);
-       error = hammer2_chain_lock(hmp, parent);
+       error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
        if (error) {
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
                /* XXX error reporting */
                return;
        }
@@ -972,14 +974,14 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
         */
        if (loff && bp) {
                chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
-                                            HAMMER2_LOOKUP_NOLOCK);
+                                            HAMMER2_LOOKUP_NODATA);
                if (chain) {
                        allocbuf(bp, nblksize);
                        switch(chain->bref.type) {
                        case HAMMER2_BREF_TYPE_DATA:
-                               hammer2_chain_resize_quick(hmp, chain,
-                                            hammer2_bytes_to_radix(nblksize));
-                               hammer2_chain_modify_quick(hmp, chain);
+                               hammer2_chain_resize(hmp, chain,
+                                            hammer2_bytes_to_radix(nblksize),
+                                            HAMMER2_MODIFY_OPTDATA);
                                bzero(bp->b_data + loff, nblksize - loff);
                                bp->b_bio2.bio_offset = chain->bref.data_off &
                                                        HAMMER2_OFF_MASK;
@@ -992,7 +994,8 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                                panic("hammer2_truncate_file: bad type");
                                break;
                        }
-                       hammer2_chain_drop(hmp, chain);
+                       hammer2_chain_unlock(hmp, chain);
+                       bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
                } else {
                        /*
@@ -1005,24 +1008,31 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                        bqrelse(bp);
                }
        } else if (loff) {
+               /*
+                * WARNING: This utilizes a device buffer for the data.
+                *
+                * XXX case should not occur
+                */
+               panic("hammer2_truncate_file: non-zero truncation, no-vnode");
                chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase, 0);
                if (chain) {
                        switch(chain->bref.type) {
                        case HAMMER2_BREF_TYPE_DATA:
                                hammer2_chain_resize(hmp, chain,
-                                            hammer2_bytes_to_radix(nblksize));
-                               hammer2_chain_modify(hmp, chain, 1);
+                                            hammer2_bytes_to_radix(nblksize),
+                                            0);
+                               hammer2_chain_modify(hmp, chain, 0);
                                bzero(chain->data->buf + loff, nblksize - loff);
                                break;
                        case HAMMER2_BREF_TYPE_INODE:
                                if (loff < HAMMER2_EMBEDDED_BYTES) {
-                                       hammer2_chain_modify(hmp, chain, 1);
+                                       hammer2_chain_modify(hmp, chain, 0);
                                        bzero(chain->data->ipdata.u.data + loff,
                                              HAMMER2_EMBEDDED_BYTES - loff);
                                }
                                break;
                        }
-                       hammer2_chain_put(hmp, chain);
+                       hammer2_chain_unlock(hmp, chain);
                }
        }
 
@@ -1043,7 +1053,7 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
        chain = hammer2_chain_lookup(hmp, &parent,
                                     lbase, (hammer2_key_t)-1,
-                                    HAMMER2_LOOKUP_NOLOCK);
+                                    HAMMER2_LOOKUP_NODATA);
        while (chain) {
                /*
                 * Degenerate embedded data case, nothing to loop on.
@@ -1060,9 +1070,9 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                /* XXX check parent if empty indirect block & delete */
                chain = hammer2_chain_next(hmp, &parent, chain,
                                           lbase, (hammer2_key_t)-1,
-                                          HAMMER2_LOOKUP_NOLOCK);
+                                          HAMMER2_LOOKUP_NODATA);
        }
-       hammer2_chain_put(hmp, parent);
+       hammer2_chain_unlock(hmp, parent);
 }
 
 /*
@@ -1090,7 +1100,7 @@ hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        KKASSERT(ip->vp);
        hmp = ip->hmp;
 
-       hammer2_chain_modify(hmp, &ip->chain, 1);
+       hammer2_chain_modify(hmp, &ip->chain, 0);
 
        /*
         * Nothing to do if the direct-data case is still intact
@@ -1160,15 +1170,14 @@ hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
         */
        if (((int)osize & HAMMER2_PBUFMASK)) {
                parent = &ip->chain;
-               hammer2_chain_ref(hmp, parent);
-               error = hammer2_chain_lock(hmp, parent);
+               error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
                KKASSERT(error == 0);
 
                nradix = hammer2_bytes_to_radix(nblksize);
 
                chain = hammer2_chain_lookup(hmp, &parent,
                                             obase, obase,
-                                            HAMMER2_LOOKUP_NOLOCK);
+                                            HAMMER2_LOOKUP_NODATA);
                if (chain == NULL) {
                        chain = hammer2_chain_create(hmp, parent, NULL,
                                                     obase, nblksize,
@@ -1176,14 +1185,15 @@ hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                                                     nradix);
                } else {
                        KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
-                       hammer2_chain_resize_quick(hmp, chain, nradix);
-                       hammer2_chain_modify_quick(hmp, chain);
+                       hammer2_chain_resize(hmp, chain, nradix,
+                                            HAMMER2_MODIFY_OPTDATA);
                }
                bp->b_bio2.bio_offset = chain->bref.data_off &
                                        HAMMER2_OFF_MASK;
-               hammer2_chain_drop(hmp, chain);
+               hammer2_chain_unlock(hmp, chain);
+               bp->b_flags |= B_CLUSTEROK;
                bdwrite(bp);
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
        }
 }
 
@@ -1213,8 +1223,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
         * Note: In DragonFly the kernel handles '.' and '..'.
         */
        parent = &dip->chain;
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
        chain = hammer2_chain_lookup(hmp, &parent,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     0);
@@ -1229,7 +1238,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                                           lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                           0);
        }
-       hammer2_chain_put(hmp, parent);
+       hammer2_chain_unlock(hmp, parent);
 
        if (chain) {
                vp = hammer2_igetv(chain->u.ip, &error);
@@ -1238,7 +1247,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
-               hammer2_chain_put(hmp, chain);
+               hammer2_chain_unlock(hmp, chain);
        } else {
                error = ENOENT;
                cache_setvp(ap->a_nch, NULL);
@@ -1262,10 +1271,9 @@ hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
                *ap->a_vpp = NULL;
                return ENOENT;
        }
-       hammer2_chain_ref(hmp, &ip->chain);
-       hammer2_chain_lock(hmp, &ip->chain);
+       hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
        *ap->a_vpp = hammer2_igetv(ip, &error);
-       hammer2_chain_put(hmp, &ip->chain);
+       hammer2_chain_unlock(hmp, &ip->chain);
 
        return error;
 }
@@ -1299,7 +1307,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
                return error;
        }
        *ap->a_vpp = hammer2_igetv(nip, &error);
-       hammer2_chain_put(hmp, &nip->chain);
+       hammer2_chain_unlock(hmp, &nip->chain);
 
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
@@ -1358,14 +1366,13 @@ hammer2_vop_bmap(struct vop_bmap_args *ap)
        loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
 
        parent = &ip->chain;
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
        chain = hammer2_chain_lookup(hmp, &parent,
                                     lbeg, lend,
-                                    HAMMER2_LOOKUP_NOLOCK);
+                                    HAMMER2_LOOKUP_NODATA);
        if (chain == NULL) {
                *ap->a_doffsetp = ZFOFFSET;
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
                return (0);
        }
 
@@ -1378,9 +1385,9 @@ hammer2_vop_bmap(struct vop_bmap_args *ap)
                }
                chain = hammer2_chain_next(hmp, &parent, chain,
                                           lbeg, lend,
-                                          HAMMER2_LOOKUP_NOLOCK);
+                                          HAMMER2_LOOKUP_NODATA);
        }
-       hammer2_chain_put(hmp, parent);
+       hammer2_chain_unlock(hmp, parent);
 
        /*
         * If the requested loffset is not mappable physically we can't
@@ -1508,7 +1515,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
                return error;
        }
        *ap->a_vpp = hammer2_igetv(nip, &error);
-       hammer2_chain_put(hmp, &nip->chain);
+       hammer2_chain_unlock(hmp, &nip->chain);
 
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
@@ -1583,7 +1590,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
                        error = 0;
                }
        }
-       hammer2_chain_put(hmp, &nip->chain);
+       hammer2_chain_unlock(hmp, &nip->chain);
 
        /*
         * Finalize namecache
@@ -1736,7 +1743,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
        /*
         * Reconnect ip to target directory.
         */
-       hammer2_chain_lock(hmp, &ip->chain);
+       hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
        error = hammer2_inode_connect(tdip, ip, tname, tname_len);
 
        if (error == 0) {
@@ -1744,7 +1751,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
        }
        hammer2_chain_unlock(hmp, &ip->chain);
 done:
-       hammer2_chain_drop(hmp, &ip->chain);
+       hammer2_chain_drop(hmp, &ip->chain);    /* from ref up top */
 
        return (error);
 }
@@ -1786,8 +1793,7 @@ hammer2_unlink_file(hammer2_inode_t *dip, const uint8_t *name, size_t name_len,
         * Search for the filename in the directory
         */
        parent = &dip->chain;
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
        chain = hammer2_chain_lookup(hmp, &parent,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     0);
@@ -1807,7 +1813,7 @@ hammer2_unlink_file(hammer2_inode_t *dip, const uint8_t *name, size_t name_len,
         * Not found or wrong type (isdir < 0 disables the type check).
         */
        if (chain == NULL) {
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
                return ENOENT;
        }
        if (chain->data->ipdata.type == HAMMER2_OBJTYPE_DIRECTORY &&
@@ -1829,18 +1835,17 @@ hammer2_unlink_file(hammer2_inode_t *dip, const uint8_t *name, size_t name_len,
        if (chain->data->ipdata.type == HAMMER2_OBJTYPE_DIRECTORY &&
            isdir >= 0) {
                dparent = chain;
-               hammer2_chain_ref(hmp, dparent);
-               hammer2_chain_lock(hmp, dparent);
+               hammer2_chain_lock(hmp, dparent, HAMMER2_RESOLVE_ALWAYS);
                dchain = hammer2_chain_lookup(hmp, &dparent,
                                              0, (hammer2_key_t)-1,
-                                             HAMMER2_LOOKUP_NOLOCK);
+                                             HAMMER2_LOOKUP_NODATA);
                if (dchain) {
-                       hammer2_chain_drop(hmp, dchain);
-                       hammer2_chain_put(hmp, dparent);
+                       hammer2_chain_unlock(hmp, dchain);
+                       hammer2_chain_unlock(hmp, dparent);
                        error = ENOTEMPTY;
                        goto done;
                }
-               hammer2_chain_put(hmp, dparent);
+               hammer2_chain_unlock(hmp, dparent);
                dparent = NULL;
                /* dchain NULL */
        }
@@ -1891,8 +1896,8 @@ hammer2_unlink_file(hammer2_inode_t *dip, const uint8_t *name, size_t name_len,
        error = 0;
 
 done:
-       hammer2_chain_put(hmp, chain);
-       hammer2_chain_put(hmp, parent);
+       hammer2_chain_unlock(hmp, chain);
+       hammer2_chain_unlock(hmp, parent);
 
        return error;
 }
@@ -1915,9 +1920,11 @@ hammer2_vop_strategy(struct vop_strategy_args *ap)
        switch(bp->b_cmd) {
        case BUF_CMD_READ:
                error = hammer2_strategy_read(ap);
+               ++hammer2_iod_file_read;
                break;
        case BUF_CMD_WRITE:
                error = hammer2_strategy_write(ap);
+               ++hammer2_iod_file_write;
                break;
        default:
                bp->b_error = error = EINVAL;
@@ -1961,16 +1968,10 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
         */
        if (nbio->bio_offset == NOOFFSET) {
                parent = &ip->chain;
-               hammer2_chain_ref(hmp, parent);
-               hammer2_chain_lock(hmp, parent);
+               hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
 
-               /*
-                * Specifying NOLOCK avoids unnecessary bread()s of the
-                * chain element's content.  We just need the block device
-                * offset.
-                */
                chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
-                                            HAMMER2_LOOKUP_NOLOCK);
+                                            HAMMER2_LOOKUP_NODATA);
                if (chain == NULL) {
                        /*
                         * Data is zero-fill
@@ -1981,7 +1982,7 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
                         * Data is embedded in the inode (do nothing)
                         */
                        KKASSERT(chain == parent);
-                       hammer2_chain_drop(hmp, chain);
+                       hammer2_chain_unlock(hmp, chain);
                } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                        /*
                         * Data is on-media
@@ -1989,12 +1990,12 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
                        KKASSERT(bp->b_bcount == chain->bytes);
                        nbio->bio_offset = chain->bref.data_off &
                                           HAMMER2_OFF_MASK;
-                       hammer2_chain_drop(hmp, chain);
+                       hammer2_chain_unlock(hmp, chain);
                        KKASSERT(nbio->bio_offset != 0);
                } else {
                        panic("hammer2_strategy_read: unknown bref type");
                }
-               hammer2_chain_put(hmp, parent);
+               hammer2_chain_unlock(hmp, parent);
        }
 
        if (nbio->bio_offset == ZFOFFSET) {