hammer2 - Reformulate strategy code and direct I/O
author Matthew Dillon <dillon@apollo.backplane.com>
Tue, 20 Mar 2012 22:12:02 +0000 (15:12 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Tue, 20 Mar 2012 22:24:13 +0000 (15:24 -0700)
* Use variable block sizes for both logical and physical buffers such
  that the logical buffer matches the physical buffer.

* Change the file data layout to use 64K blocks for all bulk data fully
  enclosed in the block, with a single variable-length block straddling
  the file EOF sized 1K - 64K in powers of 2.  The inode's 512 bytes of
  embedded data is still implemented for file sizes <= 512 bytes.

* Implement direct IO for both reading and writing.

* Reformulate the strategy write code such that bio2.bio_offset is always
  pre-calculated, allowing the strategy code to issue the I/O without
  requiring any further filesystem interactions.

* Fixes numerous deadlocks.

sys/vfs/hammer2/Makefile
sys/vfs/hammer2/TODO
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vnops.c

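diff --git a/sys/vfs/hammer2/Makefile b/sys/vfs/hammer2/Makefile
index 97d10b8..8ec7e56 100644
--- a/sys/vfs/hammer2/Makefile
+++ b/sys/vfs/hammer2/Makefile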
@@ -3,6 +3,7 @@
 #
 .PATH: ${.CURDIR}
 
+CFLAGS+= -DINVARIANTS
 KMOD=  hammer2
 SRCS=  hammer2_vfsops.c hammer2_vnops.c hammer2_inode.c
 SRCS+= hammer2_chain.c hammer2_freemap.c hammer2_subr.c hammer2_icrc.c
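diff --git a/sys/vfs/hammer2/TODO b/sys/vfs/hammer2/TODO
index f4e9ead..985058a 100644
--- a/sys/vfs/hammer2/TODO
+++ b/sys/vfs/hammer2/TODO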
@@ -1,4 +1,11 @@
 
+* Use bp->b_dep to interlock the buffer with the chain structure so the
+  strategy code can calculate the crc and assert that the chain is marked
+  modified (not yet flushed).
+
+* Deleted inode not reachable via tree for volume flush but still reachable
+  via fsync/inactive/reclaim.  Its tree can be destroyed at that point.
+
 * The direct write code needs to invalidate any underlying physical buffers.
   Direct write needs to be implemented.
 
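diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
index feaad80..dcb31c8 100644
--- a/sys/vfs/hammer2/hammer2.h
+++ b/sys/vfs/hammer2/hammer2.h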
@@ -123,7 +123,7 @@ int hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2);
 SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
 
 #define HAMMER2_CHAIN_MODIFIED1                0x00000001      /* active mods */
-#define HAMMER2_CHAIN_MODIFIED2                0x00000002      /* queued mods */
+#define HAMMER2_CHAIN_UNUSED02         0x00000002
 #define HAMMER2_CHAIN_DIRTYBP          0x00000004      /* dirty on unlock */
 #define HAMMER2_CHAIN_SUBMODIFIED      0x00000008      /* 1+ subs modified */
 #define HAMMER2_CHAIN_DELETED          0x00000010
@@ -137,6 +137,7 @@ SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
  * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
  */
 #define HAMMER2_LOOKUP_NOLOCK          0x00000001      /* ref only */
+#define HAMMER2_LOOKUP_NODATA          0x00000002      /* data left NULL */
 
 /*
  * Cluster different types of storage together for allocations
@@ -190,14 +191,9 @@ typedef struct hammer2_inode hammer2_inode_t;
 
 /*
  * A hammer2 indirect block
- *
- * If is_embedded != 0 the buffer is extended.  This is used for
- * indirect blocks which are not whole-physical-blocks.
  */
 struct hammer2_indblock {
        hammer2_chain_t         chain;
-       int                     is_embedded;
-       char                    buf[4];
 };
 
 typedef struct hammer2_indblock hammer2_indblock_t;
@@ -288,6 +284,9 @@ u_int32_t hammer2_to_unix_xid(uuid_t *uuid);
 hammer2_key_t hammer2_dirhash(const unsigned char *name, size_t len);
 int hammer2_bytes_to_radix(size_t bytes);
 
+int hammer2_calc_logical(hammer2_inode_t *ip, hammer2_off_t uoff,
+                        hammer2_key_t *lbasep, hammer2_key_t *leofp);
+
 /*
  * hammer2_inode.c
  */
@@ -322,6 +321,9 @@ int hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 void hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 void hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                                int nradix);
+void hammer2_chain_modify_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+void hammer2_chain_resize_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain,
+                               int nradix);
 void hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 hammer2_chain_t *hammer2_chain_find(hammer2_mount_t *hmp,
                                hammer2_chain_t *parent, int index);
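diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c
index 29a0530..622c361 100644
--- a/sys/vfs/hammer2/hammer2_chain.c
+++ b/sys/vfs/hammer2/hammer2_chain.c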
@@ -94,18 +94,7 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
                ip->hmp = hmp;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               if (bytes == HAMMER2_PBUFSIZE) {
-                       np = kmalloc(sizeof(*np),
-                                    hmp->mchain,
-                                    M_WAITOK | M_ZERO);
-                       np->is_embedded = 0;
-               } else {
-                       np = kmalloc(offsetof(struct hammer2_indblock,
-                                             buf[bytes]),
-                                    hmp->mchain,
-                                    M_WAITOK | M_ZERO);
-                       np->is_embedded = 1;
-               }
+               np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
                chain = &np->chain;
                chain->u.np = np;
                lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
@@ -141,6 +130,11 @@ hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 {
        void *mem;
 
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
+           chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
+               chain->data = NULL;
+       }
+
        KKASSERT(chain->bp == NULL);
        KKASSERT(chain->data == NULL);
        KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
@@ -239,15 +233,16 @@ hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
  * chain->data will be pointed either at the embedded data (e.g. for
  * inodes), in which case the buffer cache buffer is released, or will
  * point into the bp->b_data buffer with the bp left intact while locked.
+ *
+ * NOTE: Chain elements of type DATA do not instantiate a buffer or set
+ *      the data pointer.
  */
 int
 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 {
        hammer2_blockref_t *bref;
-       hammer2_off_t off_hi;
-       size_t off_lo;
+       hammer2_off_t pbase;
        int error;
-       void *data;
 
        /*
         * Lock the element.  Under certain conditions this might end up
@@ -271,36 +266,32 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                return (0);
 
        /*
+        * We do not instantiate a device buffer for DATA chain elements,
+        * as this would cause unnecessary double-buffering.
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+               return(0);
+
+       /*
         * If data is NULL we must issue I/O.  Any error returns the error
         * code but leaves the chain locked.
         *
         * If the chain was modified a new bref will have already been
         * allocated and its related bp is probably still sitting in the
         * buffer cache.
+        *
+        * The buffer cache buffer is variable-sized in powers of 2 down
+        * to HAMMER2_MINIOSIZE (typically 1K).
         */
        bref = &chain->bref;
 
-       off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
-       off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
-       KKASSERT(off_hi != 0);
-       error = cluster_read(hmp->devvp,
-                            hmp->voldata.volu_size, off_hi,
-                            HAMMER2_PBUFSIZE,
-                            HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE*8,
-                            &chain->bp);
-
-       /*
-        * Even though this can be synthesized from bref->data_off we
-        * store it in the in-memory chain structure for convenience.
-        */
-       if (chain->bytes !=
-           (1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))) {
-               panic("hammer2_chain_lock: chain->bytes mismatch");
-       }
+       pbase = bref->data_off & ~(hammer2_off_t)(chain->bytes - 1);
+       KKASSERT(pbase != 0);
+       error = bread(hmp->devvp, pbase, chain->bytes, &chain->bp);
 
        if (error) {
                kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
-                       (intmax_t)off_hi, error);
+                       (intmax_t)pbase, error);
                bqrelse(chain->bp);
                chain->bp = NULL;
                return (error);
@@ -308,7 +299,7 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 
        /*
         * Setup the data pointer, either pointing it to an embedded data
-        * structure and copying the data from the buffer, or pointint it
+        * structure and copying the data from the buffer, or pointing it
         * into the buffer.
         *
         * The buffer is not retained when copying to an embedded data
@@ -321,9 +312,9 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                 * Copy data from bp to embedded buffer
                 */
                KKASSERT(0);    /* not yet - have mount use this soon */
-               KKASSERT(off_hi == 0);
-               bcopy((char *)chain->bp->b_data + off_lo,
-                     &hmp->voldata, HAMMER2_PBUFSIZE);
+               KKASSERT(pbase == 0);
+               KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
+               bcopy(chain->bp->b_data, &hmp->voldata, chain->bytes);
                chain->data = (void *)&hmp->voldata;
                bqrelse(chain->bp);
                chain->bp = NULL;
@@ -332,34 +323,18 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                /*
                 * Copy data from bp to embedded buffer.
                 */
-               bcopy((char *)chain->bp->b_data + off_lo,
-                     &chain->u.ip->ip_data,
-                     HAMMER2_INODE_BYTES);
+               bcopy(chain->bp->b_data, &chain->u.ip->ip_data, chain->bytes);
                chain->data = (void *)&chain->u.ip->ip_data;
                bqrelse(chain->bp);
                chain->bp = NULL;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               /*
-                * Indirect node data may or may not be embedded depending
-                * on how much there is.
-                */
-               if (chain->u.np->is_embedded) {
-                       bcopy((char *)chain->bp->b_data + off_lo,
-                             chain->u.np->buf,
-                             chain->bytes);
-                       chain->data = (void *)&chain->u.np->buf[0];
-                       bqrelse(chain->bp);
-                       chain->bp = NULL;
-                       break;
-               }
-               /* fall through */
+       case HAMMER2_BREF_TYPE_DATA:
        default:
                /*
                 * Leave bp intact
                 */
-               data = (char *)chain->bp->b_data + off_lo;
-               chain->data = data;
+               chain->data = (void *)chain->bp->b_data;
                break;
        }
        return (0);
@@ -369,6 +344,8 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
  * Resize the chain's physical storage allocation.  Chains can be resized
  * smaller without reallocating the storage.  Resizing larger will reallocate
  * the storage.
+ *
+ * Must be passed a locked chain
  */
 void
 hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain, int nradix)
@@ -381,7 +358,7 @@ hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain, int nradix)
        int error;
 
        /*
-        * Only data blocks can be resized for now
+        * Only data and indirect blocks can be resized for now
         */
        KKASSERT(chain != &hmp->vchain);
        KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
@@ -395,45 +372,39 @@ hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain, int nradix)
        if (obytes == nbytes)
                return;
 
-       /*
-        * A deleted inode may still be active but unreachable via sync
-        * because it has been disconnected from the tree.  Do not allow
-        * deleted inodes to be marked as being modified because this will
-        * bump the refs and never get resolved by the sync, leaving the
-        * inode structure allocated after umount.
-        */
+#if 0
        if ((chain->flags & HAMMER2_CHAIN_DELETED) &&
            chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                KKASSERT(chain->data != NULL);
                return;
        }
+#endif
 
        /*
         * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
         * modified flags share the same ref.
         */
-       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
-       if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
+       if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0) {
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
                hammer2_chain_ref(hmp, chain);
+       }
 
        if (nbytes < obytes) {
                /*
                 * If we are making it smaller we don't have to reallocate
-                * the block.
+                * the block but we still need to resize it.
                 */
-               chain->bref.data_off &= ~ HAMMER2_OFF_MASK_RADIX;
+               chain->bref.data_off &= ~HAMMER2_OFF_MASK_RADIX;
                chain->bref.data_off |= (nradix & HAMMER2_OFF_MASK_RADIX);
                chain->bytes = nbytes;
+               allocbuf(chain->bp, nbytes);
        } else {
                /*
                 * Otherwise we do
                 */
-               if (chain != &hmp->vchain) {
-                       chain->bref.data_off =
-                         hammer2_freemap_alloc(hmp, chain->bref.type, nbytes);
-                       chain->bytes = nbytes;
-               }
-               /* XXX failed allocation */
+               chain->bref.data_off =
+                       hammer2_freemap_alloc(hmp, chain->bref.type, nbytes);
+               chain->bytes = nbytes;
 
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_VOLUME:          /* embedded */
@@ -454,34 +425,28 @@ hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain, int nradix)
                         * to embedded data.  Copy-on-write to new block.
                         */
                        KKASSERT(chain != &hmp->vchain);        /* safety */
-                       if (nbytes == HAMMER2_PBUFSIZE) {
-                               nbp = getblk(hmp->devvp,
-                                            chain->bref.data_off &
-                                             HAMMER2_OFF_MASK_HI,
-                                            HAMMER2_PBUFSIZE, 0, 0);
-                               vfs_bio_clrbuf(nbp);
-                               error = 0;
-                       } else {
-                               error = bread(hmp->devvp,
-                                            chain->bref.data_off &
-                                             HAMMER2_OFF_MASK_HI,
-                                            HAMMER2_PBUFSIZE, &nbp);
-                               KKASSERT(error == 0);/* XXX handle error */
-                       }
+                       nbp = getblk(hmp->devvp,
+                                    chain->bref.data_off &
+                                     ~(hammer2_off_t)(nbytes - 1),
+                                    nbytes, 0, 0);
+                       vfs_bio_clrbuf(nbp);
+                       error = 0;
 
                        /*
-                        * The new block may be smaller or larger than the
-                        * old block, only copy what fits.
+                        * The new block is larger than the old one, only
+                        * copy what fits.
                         */
-                       ndata = nbp->b_data + (chain->bref.data_off &
-                                              HAMMER2_OFF_MASK_LO);
+                       ndata = nbp->b_data;
                        if (chain->data) {
                                if (nbytes < obytes)
                                        bcopy(chain->data, ndata, nbytes);
                                else
                                        bcopy(chain->data, ndata, obytes);
                                KKASSERT(chain->bp != NULL);
-                               bqrelse(chain->bp);
+                       }
+                       if (chain->bp) {
+                               chain->bp->b_flags |= B_RELBUF;
+                               brelse(chain->bp);
                        }
                        chain->bp = nbp;
                        chain->data = ndata;
@@ -514,6 +479,84 @@ hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain, int nradix)
 }
 
 /*
+ * This is the same as hammer2_chain_resize() except the chain does NOT
+ * have to be locked and any underlying data is NOT copied to the new
+ * location.
+ */
+void
+hammer2_chain_resize_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain,
+                          int nradix)
+{
+       hammer2_chain_t *parent;
+       size_t obytes;
+       size_t nbytes;
+
+       /*
+        * Only data and indirect blocks can be resized for now
+        */
+       KKASSERT(chain != &hmp->vchain);
+       KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
+                chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
+
+       /*
+        * Nothing to do if the element is already the proper size
+        */
+       obytes = chain->bytes;
+       nbytes = 1 << nradix;
+       if (obytes == nbytes)
+               return;
+
+       lockmgr(&chain->lk, LK_EXCLUSIVE);
+       KKASSERT(chain->bp == NULL);
+
+       /*
+        * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
+        * modified flags share the same ref.
+        */
+       if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0) {
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
+               hammer2_chain_ref(hmp, chain);
+       }
+
+       if (nbytes < obytes) {
+               /*
+                * If we are making it smaller we don't have to reallocate
+                * the block but we still need to resize it.
+                */
+               chain->bref.data_off &= ~HAMMER2_OFF_MASK_RADIX;
+               chain->bref.data_off |= (nradix & HAMMER2_OFF_MASK_RADIX);
+               chain->bytes = nbytes;
+       } else {
+               /*
+                * Otherwise we do
+                */
+               chain->bref.data_off =
+                       hammer2_freemap_alloc(hmp, chain->bref.type, nbytes);
+               chain->bytes = nbytes;
+       }
+
+       /*
+        * Recursively mark the parent chain elements so flushes can find
+        * modified elements.
+        *
+        * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
+        *       during the flush recursion after clearing the parent's
+        *       SUBMODIFIED bit.  We don't want to re-set the parent's
+        *       SUBMODIFIED bit in this case!
+        */
+       if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
+               parent = chain->parent;
+               while (parent &&
+                      (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
+                       atomic_set_int(&parent->flags,
+                                      HAMMER2_CHAIN_SUBMODIFIED);
+                       parent = parent->parent;
+               }
+       }
+       lockmgr(&chain->lk, LK_RELEASE);
+}
+
+/*
  * Convert a locked chain that was retrieved read-only to read-write.
  *
  * If not already marked modified a new physical block will be allocated
@@ -525,15 +568,22 @@ hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain, int nradix)
  *
  * If the data is pointing into a bp it will be relocated to a new bp.
  * If the data is embedded we leave it alone for now.
+ *
+ * NOTE: Not used for DATA chain types, hammer2_chain_modify_quick() is
+ *      used instead.  We don't want to allocate a device buffer for
+ *      data that would interfere with the file's logical buffers.
  */
 void
 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 {
        hammer2_chain_t *parent;
+       hammer2_off_t pbase;
        struct buf *nbp;
        void *ndata;
        int error;
 
+       KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
+
        /*
         * Setting the DIRTYBP flag will cause the buffer to be dirtied or
         * written-out on unlock.  This bit is independent of the MODIFIED1
@@ -551,6 +601,7 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                return;
        }
 
+#if 0
        /*
         * A deleted inode may still be active but unreachable via sync
         * because it has been disconnected from the tree.  Do not allow
@@ -563,14 +614,14 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                KKASSERT(chain->data != NULL);
                return;
        }
+#endif
 
        /*
         * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
         * modified flags share the same ref.
         */
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
-       if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
-               hammer2_chain_ref(hmp, chain);
+       hammer2_chain_ref(hmp, chain);
 
        /*
         * We must allocate the copy-on-write block.
@@ -586,10 +637,13 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
         * through the copy-on-write steps except without the copying part.
         */
        if (chain != &hmp->vchain) {
-               if (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)
+               if ((hammer2_debug & 0x0001) &&
+                   (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
                        kprintf("Replace %d\n", chain->bytes);
+               }
                chain->bref.data_off =
-                   hammer2_freemap_alloc(hmp, chain->bref.type, chain->bytes);
+                       hammer2_freemap_alloc(hmp, chain->bref.type,
+                                             chain->bytes);
                /* XXX failed allocation */
        }
 
@@ -603,22 +657,6 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                error = 0;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
-               /*
-                * If the indirect data is embedded we just leave it
-                * in its embedded space, otherwise fall-through to
-                * the bp-handling code.
-                *
-                * If this is a newly allocated block chain->data will
-                * be NULL, so make sure it is properly assigned.  In
-                * this case the embedded space has already been zero'd
-                * by the kmalloc().
-                */
-               if (chain->u.np->is_embedded) {
-                       chain->data = (void *)&chain->u.np->buf[0];
-                       error = 0;
-                       break;
-               }
-               /* fallthrough */
        case HAMMER2_BREF_TYPE_DATA:
                /*
                 * data (if not NULL) points into original bp or to embedded
@@ -628,36 +666,27 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                 * storage must be zero'd.
                 */
                KKASSERT(chain != &hmp->vchain);        /* safety */
-               if (chain->bytes == HAMMER2_PBUFSIZE) {
-                       nbp = getblk(hmp->devvp,
-                                    chain->bref.data_off & HAMMER2_OFF_MASK_HI,
-                                    HAMMER2_PBUFSIZE, 0, 0);
-                       /*
-                        * XXX want to set B_CACHE but not bother to
-                        * zero because it will be zero'd below?
-                        */
-                       vfs_bio_clrbuf(nbp);
-                       error = 0;
-               } else {
-                       error = bread(hmp->devvp,
-                                    chain->bref.data_off & HAMMER2_OFF_MASK_HI,
-                                    HAMMER2_PBUFSIZE, &nbp);
-                       KKASSERT(error == 0);/* XXX handle error */
-               }
+               pbase = chain->bref.data_off &
+                        ~(hammer2_off_t)(chain->bytes - 1);
+               nbp = getblk(hmp->devvp, pbase, chain->bytes, 0, 0);
+               vfs_bio_clrbuf(nbp);    /* XXX */
+               error = 0;
 
                /*
                 * Copy or zero-fill on write depending on whether
                 * chain->data exists or not.
                 */
-               ndata = nbp->b_data + (chain->bref.data_off &
-                                      HAMMER2_OFF_MASK_LO);
+               ndata = nbp->b_data;
                if (chain->data) {
                        bcopy(chain->data, ndata, chain->bytes);
                        KKASSERT(chain->bp != NULL);
-                       bqrelse(chain->bp);
                } else {
                        bzero(ndata, chain->bytes);
                }
+               if (chain->bp) {
+                       chain->bp->b_flags |= B_RELBUF;
+                       brelse(chain->bp);
+               }
                chain->bp = nbp;
                chain->data = ndata;
                break;
@@ -688,6 +717,82 @@ hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 }
 
 /*
+ * Same as hammer2_chain_modify() except the chain does not have to be
+ * locked and the underlying data will NOT be copied to the new location.
+ */
+void
+hammer2_chain_modify_quick(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       hammer2_chain_t *parent;
+
+       /*
+        * Set the MODIFIED1 bit and handle degenerate cases.
+        *
+        * We do not set the DIRTYBP flag, we don't want the flush code to
+        * read-modify-write the underlying physical buffer because it
+        * is probably aliased against a logical buffer.
+        *
+        * We must lock the chain but not instantiate its data.
+        *
+        * If the chain is already marked MODIFIED1 we can just return,
+        * but must interlock a failed test to avoid races.
+        */
+       if (chain->flags & HAMMER2_CHAIN_MODIFIED1)
+               return;
+       lockmgr(&chain->lk, LK_EXCLUSIVE);
+       if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
+               lockmgr(&chain->lk, LK_RELEASE);
+               return;
+       }
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
+       hammer2_chain_ref(hmp, chain);  /* ref for MODIFIED1 bit */
+
+       /*
+        * We must allocate the copy-on-write block.
+        *
+        * If the data is embedded no other action is required.
+        *
+        * If the data is not embedded we acquire and clear the
+        * new block.  If chain->data is not NULL we then do the
+        * copy-on-write.  chain->data will then be repointed to the new
+        * buffer and the old buffer will be released.
+        *
+        * For newly created elements with no prior allocation we go
+        * through the copy-on-write steps except without the copying part.
+        */
+       if (chain != &hmp->vchain) {
+               if ((hammer2_debug & 0x0001) &&
+                   (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
+                       kprintf("Replace %d\n", chain->bytes);
+               }
+               chain->bref.data_off =
+                       hammer2_freemap_alloc(hmp, chain->bref.type,
+                                             chain->bytes);
+               /* XXX failed allocation */
+       }
+
+       /*
+        * Recursively mark the parent chain elements so flushes can find
+        * modified elements.
+        *
+        * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
+        *       during the flush recursion after clearing the parent's
+        *       SUBMODIFIED bit.  We don't want to re-set the parent's
+        *       SUBMODIFIED bit in this case!
+        */
+       if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
+               parent = chain->parent;
+               while (parent &&
+                      (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
+                       atomic_set_int(&parent->flags,
+                                      HAMMER2_CHAIN_SUBMODIFIED);
+                       parent = parent->parent;
+               }
+       }
+       lockmgr(&chain->lk, LK_RELEASE);
+}
+
+/*
  * Unlock a chain element without dropping its reference count.
  * (see hammer2_chain_put() to do both).
  *
@@ -708,6 +813,7 @@ hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                                chain->bp->b_flags |= B_RELBUF;
                                bawrite(chain->bp);
                        } else {
+                               chain->bp->b_flags |= B_CLUSTEROK;
                                bdwrite(chain->bp);
                        }
                } else {
@@ -811,7 +917,7 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        chain->index = index;
        if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
                panic("hammer2_chain_link: collision");
-       KKASSERT(parent->refs > 1);
+       KKASSERT(parent->refs > 0);
        atomic_add_int(&parent->refs, 1);       /* for splay entry */
 
        /*
@@ -1232,16 +1338,7 @@ hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                        chain->data = (void *)&chain->u.ip->ip_data;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
-                       /*
-                        * May or may not be embedded (chain->data may or
-                        * may not be NULL)
-                        */
-                       if (chain->u.np->is_embedded) {
-                               chain->data = (void *)&chain->u.np->buf[0];
-                       } else {
-                               KKASSERT(chain->data == NULL);
-                       }
-                       break;
+               case HAMMER2_BREF_TYPE_DATA:
                default:
                        /* leave chain->data NULL */
                        KKASSERT(chain->data == NULL);
@@ -1357,20 +1454,27 @@ again:
        }
 
        /*
-        * Mark the newly created or previously disconnected chain element
-        * as modified and fully resolve the chain->data pointer.  The
-        * WAS_MODIFIED bit will be set in both cases.
+        * WAS_MODIFIED indicates that this is a newly-created chain element
+        * rather than a renamed chain element.  In this situation we want
+        * to mark non-data chain elements as modified in order to resolve
+        * the data pointer.
+        *
+        * data chain elements are marked modified but WITHOUT resolving the
+        * data pointer, as a device buffer would interfere otherwise.
         *
         * Chain elements with embedded data will not issue I/O at this time.
         * A new block will be allocated for the buffer but not instantiated.
         *
-        * Chain elements which do not use embedded data will allocate
-        * the new block AND instantiate its buffer cache buffer, pointing
-        * the data at the bp.
+        * NON-DATA chain elements which do not use embedded data will
+        * allocate the new block AND instantiate its buffer cache buffer,
+        * pointing the data at the bp.
         */
        if (chain->flags & HAMMER2_CHAIN_WAS_MODIFIED) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
-               hammer2_chain_modify(hmp, chain);
+               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+                       hammer2_chain_modify_quick(hmp, chain);
+               else
+                       hammer2_chain_modify(hmp, chain);
        }
 
 done:
@@ -1791,8 +1895,7 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
-               if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
-                       hammer2_chain_drop(hmp, chain);
+               hammer2_chain_drop(hmp, chain);
        }
 }
 
@@ -1802,12 +1905,16 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
  *
  * This cannot be called with the volume header's vchain (yet).
  *
- * PASS1 - clear the MODIFIED1 bit (and set the MODIFIED2 bit XXX)
- *
+ * PASS1 - clear the MODIFIED1 bit.
  */
 static void
 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 {
+       hammer2_blockref_t *bref;
+       hammer2_off_t pbase;
+       struct buf *bp;
+       int error;
+
        /*
         * Flush any children of this chain entry.
         */
@@ -1836,7 +1943,6 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                 */
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
-                       KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
                        base = &chain->data->ipdata.u.blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
@@ -1845,7 +1951,6 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                        count = chain->bytes / sizeof(hammer2_blockref_t);
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
-                       KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
                        base = &hmp->voldata.sroot_blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
@@ -1863,26 +1968,27 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                while ((scan = next) != NULL) {
                        next = SPLAY_NEXT(hammer2_chain_splay, &chain->shead,
                                          scan);
+                       if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                                           HAMMER2_CHAIN_MODIFIED1 |
+                                           HAMMER2_CHAIN_MOVED)) == 0) {
+                               continue;
+                       }
+                       KKASSERT(scan->index >= 0 && scan->index < count);
+                       hammer2_chain_ref(hmp, scan);
+                       hammer2_chain_lock(hmp, scan);
+                       hammer2_chain_flush_pass1(hmp, scan);
                        if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
-                                          HAMMER2_CHAIN_MODIFIED1 |
-                                          HAMMER2_CHAIN_MOVED)) {
-                               hammer2_chain_ref(hmp, scan);
-                               hammer2_chain_lock(hmp, scan);
-                               hammer2_chain_flush_pass1(hmp, scan);
-                               if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
-                                                  HAMMER2_CHAIN_MODIFIED1)) {
-                                       submodified = 1;
-                               } else {
-                                       KKASSERT(scan->index < count);
-                                       base[scan->index] = scan->bref;
-                                       if (scan->flags & HAMMER2_CHAIN_MOVED) {
-                                               atomic_clear_int(&scan->flags,
-                                                        HAMMER2_CHAIN_MOVED);
-                                               hammer2_chain_drop(hmp, scan);
-                                       }
+                                          HAMMER2_CHAIN_MODIFIED1)) {
+                               submodified = 1;
+                       } else {
+                               base[scan->index] = scan->bref;
+                               if (scan->flags & HAMMER2_CHAIN_MOVED) {
+                                       atomic_clear_int(&scan->flags,
+                                                HAMMER2_CHAIN_MOVED);
+                                       hammer2_chain_drop(hmp, scan);
                                }
-                               hammer2_chain_put(hmp, scan);
                        }
+                       hammer2_chain_put(hmp, scan);
                }
                if (submodified) {
                        atomic_set_int(&chain->flags,
@@ -1903,12 +2009,10 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
         */
        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
        if (chain->flags & HAMMER2_CHAIN_MOVED) {
-               if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
-                       hammer2_chain_drop(hmp, chain);
+               hammer2_chain_drop(hmp, chain);
        } else {
+               /* inherit ref from the MODIFIED1 we cleared */
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-               if (chain->flags & HAMMER2_CHAIN_MODIFIED2)
-                       hammer2_chain_ref(hmp, chain);
        }
 
        /*
@@ -1917,21 +2021,26 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
         *
         * This will never be a volume header.
         */
-       if (chain != &hmp->vchain) {
-               hammer2_blockref_t *bref;
-               hammer2_off_t off_hi;
-               struct buf *bp;
-               size_t off_lo;
-               size_t bytes;
-               int error;
-
+       switch(chain->bref.type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+               /*
+                * The volume header is flushed manually by the syncer, not
+                * here.
+                */
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               /*
+                * Data elements have already been flushed via the logical
+                * file buffer cache.  Their hash was set in the bref by
+                * the vop_write code.
+                */
+               break;
+       default:
                KKASSERT(chain->data != NULL);
                bref = &chain->bref;
 
-               off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
-               off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
-               bytes = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
-               KKASSERT(off_hi != 0);  /* not the root volume header */
+               pbase = bref->data_off & ~(hammer2_off_t)(chain->bytes - 1);
+               KKASSERT(pbase != 0);   /* not the root volume header */
 
                if (chain->bp == NULL) {
                        /*
@@ -1939,49 +2048,51 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                         * buffer cache buffer and copy the data into it.
                         */
                        bp = NULL;
-                       error = bread(hmp->devvp, off_hi,
-                                     HAMMER2_PBUFSIZE, &bp);
+                       error = bread(hmp->devvp, pbase, chain->bytes, &bp);
                        KKASSERT(error == 0); /* XXX */
 
                        /*
                         * Copy the data to the buffer, mark the buffer
                         * dirty, and convert the chain to unmodified.
                         */
-                       bcopy(chain->data, (char *)bp->b_data + off_lo, bytes);
+                       bcopy(chain->data, bp->b_data, chain->bytes);
+                       bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
                        bp = NULL;
-
                        chain->bref.check.iscsi32.value =
-                                       hammer2_icrc32(chain->data, bytes);
+                               hammer2_icrc32(chain->data, chain->bytes);
+               } else {
+                       chain->bref.check.iscsi32.value =
+                               hammer2_icrc32(chain->data, chain->bytes);
                }
        }
-       {
-               hammer2_blockref_t *bref;
 
-               bref = &chain->bref;
+       /*
+        * Special handling
+        */
+       bref = &chain->bref;
 
-               switch(bref->type) {
-               case HAMMER2_BREF_TYPE_VOLUME:
-                       KKASSERT(chain->data != NULL);
-                       KKASSERT(chain->bp == NULL);
-
-                       hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
-                               hammer2_icrc32(
-                                       (char *)&hmp->voldata +
-                                        HAMMER2_VOLUME_ICRC1_OFF,
-                                       HAMMER2_VOLUME_ICRC1_SIZE);
-                       hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
-                               hammer2_icrc32(
-                                       (char *)&hmp->voldata +
-                                        HAMMER2_VOLUME_ICRC0_OFF,
-                                       HAMMER2_VOLUME_ICRC0_SIZE);
-                       hmp->voldata.icrc_volheader =
-                               hammer2_icrc32(
-                                       (char *)&hmp->voldata +
-                                        HAMMER2_VOLUME_ICRCVH_OFF,
-                                       HAMMER2_VOLUME_ICRCVH_SIZE);
-                       break;
-               }
+       switch(bref->type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+               KKASSERT(chain->data != NULL);
+               KKASSERT(chain->bp == NULL);
+
+               hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
+                       hammer2_icrc32(
+                               (char *)&hmp->voldata +
+                                HAMMER2_VOLUME_ICRC1_OFF,
+                               HAMMER2_VOLUME_ICRC1_SIZE);
+               hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
+                       hammer2_icrc32(
+                               (char *)&hmp->voldata +
+                                HAMMER2_VOLUME_ICRC0_OFF,
+                               HAMMER2_VOLUME_ICRC0_SIZE);
+               hmp->voldata.icrc_volheader =
+                       hammer2_icrc32(
+                               (char *)&hmp->voldata +
+                                HAMMER2_VOLUME_ICRCVH_OFF,
+                               HAMMER2_VOLUME_ICRCVH_SIZE);
+               break;
        }
 }
 
index 3e7e7b6..0a73741 100644 (file)
@@ -328,3 +328,24 @@ hammer2_bytes_to_radix(size_t bytes)
                ++radix;
        return (radix);
 }
+
+/*
+ * Calculate the logical block covering file offset (uoff).
+ *
+ * *lbasep is set to (uoff) rounded down to a HAMMER2_PBUFSIZE boundary.
+ *
+ * If that base lies below the HAMMER2_PBUFSIZE-aligned base of the file
+ * EOF (ip->ip_data.size), the block is a full HAMMER2_PBUFSIZE block and
+ * *leofp is left at the aligned EOF base.
+ *
+ * If the base equals the aligned EOF base, the block is the one at the
+ * file EOF.  Its size is the power of 2 returned by
+ * hammer2_bytes_to_radix() for the bytes between the block base and the
+ * EOF, but never less than (1 << HAMMER2_MINIORADIX), and *leofp is
+ * advanced to the end of that block.
+ *
+ * Returns the logical block size in bytes.  (uoff) must not lie beyond
+ * the block containing the EOF (asserted).
+ */
+int
+hammer2_calc_logical(hammer2_inode_t *ip, hammer2_off_t uoff,
+                    hammer2_key_t *lbasep, hammer2_key_t *leofp)
+{
+       int radix;
+
+       /*
+        * Use the 64-bit mask for both offsets so the complement is
+        * formed at full width and does not depend on how a narrower
+        * mask constant happens to be promoted.
+        */
+       *lbasep = uoff & ~HAMMER2_PBUFMASK64;
+       *leofp = ip->ip_data.size & ~HAMMER2_PBUFMASK64;
+       KKASSERT(*lbasep <= *leofp);
+       if (*lbasep == *leofp) {
+               /*
+                * This is the block at the EOF.  Size it for the bytes
+                * it has to hold, but no smaller than
+                * (1 << HAMMER2_MINIORADIX).
+                */
+               radix = hammer2_bytes_to_radix(
+                               (size_t)(ip->ip_data.size - *leofp));
+               if (radix < HAMMER2_MINIORADIX)
+                       radix = HAMMER2_MINIORADIX;
+               *leofp += 1U << radix;
+               return (1U << radix);
+       } else {
+               return (HAMMER2_PBUFSIZE);
+       }
+}
index b09edd2..a1d0920 100644 (file)
@@ -52,8 +52,9 @@
 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
                                int seqcount);
 static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag);
-static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize,
-                               int trivial);
+static hammer2_off_t hammer2_assign_physical(hammer2_inode_t *ip,
+                               hammer2_key_t lbase, int lblksize, int *errorp);
+static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize);
 static void hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize);
 static int hammer2_unlink_file(hammer2_inode_t *dip,
                                const uint8_t *name, size_t name_len,
@@ -272,13 +273,9 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
                        if (vap->va_size == ip->ip_data.size)
                                break;
                        if (vap->va_size < ip->ip_data.size) {
-                               hammer2_chain_modify(hmp, &ip->chain);
                                hammer2_truncate_file(ip, vap->va_size);
-                               ip->ip_data.size = vap->va_size;
                        } else {
-                               hammer2_chain_modify(hmp, &ip->chain);
-                               hammer2_extend_file(ip, vap->va_size, 0);
-                               ip->ip_data.size = vap->va_size;
+                               hammer2_extend_file(ip, vap->va_size);
                        }
                        domtime = 1;
                        break;
@@ -552,6 +549,9 @@ hammer2_vop_write(struct vop_write_args *ap)
        /*
         * ip must be locked if extending the file.
         * ip must be locked to avoid racing a truncation.
+        *
+        * ip must be marked modified, particularly because the write
+        * might wind up being copied into the embedded data area.
         */
        hammer2_inode_lock_ex(ip);
        hammer2_chain_modify(hmp, &ip->chain);
@@ -578,29 +578,29 @@ hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
         * UIO read loop
         */
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ip_data.size) {
-               hammer2_key_t off_hi;
-               int off_lo;
+               hammer2_key_t lbase;
+               hammer2_key_t leof;
+               int lblksize;
+               int loff;
                int n;
 
-               off_hi = uio->uio_offset & ~HAMMER2_LBUFMASK64;
-               off_lo = (int)(uio->uio_offset & HAMMER2_LBUFMASK64);
-
-               /* XXX bigread & signal check test */
+               lblksize = hammer2_calc_logical(ip, uio->uio_offset,
+                                               &lbase, &leof);
 
-               error = cluster_read(ip->vp,
-                                    ip->ip_data.size, off_hi,
-                                    HAMMER2_LBUFSIZE,
+               error = cluster_read(ip->vp, leof, lbase, lblksize,
                                     uio->uio_resid, seqcount * BKVASIZE,
                                     &bp);
+
                if (error)
                        break;
-               n = HAMMER2_LBUFSIZE - off_lo;
+               loff = (int)(uio->uio_offset - lbase);
+               n = lblksize - loff;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ip_data.size - uio->uio_offset)
                        n = (int)(ip->ip_data.size - uio->uio_offset);
                bp->b_flags |= B_AGE;
-               uiomove((char *)bp->b_data + off_lo, n, uio);
+               uiomove((char *)bp->b_data + loff, n, uio);
                bqrelse(bp);
        }
        return (error);
@@ -614,6 +614,7 @@ static
 int
 hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag)
 {
+       hammer2_key_t old_eof;
        struct buf *bp;
        int kflags;
        int error;
@@ -627,30 +628,29 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag)
        error = 0;
 
        /*
+        * Extend the file if necessary.  If the write fails at some point
+        * we will truncate it back down to cover as much as we were able
+        * to write.
+        *
+        * Doing this now makes it easier to calculate buffer sizes in
+        * the loop.
+        */
+       old_eof = ip->ip_data.size;
+       if (uio->uio_offset + uio->uio_resid > ip->ip_data.size) {
+               hammer2_extend_file(ip, uio->uio_offset + uio->uio_resid);
+               kflags |= NOTE_EXTEND;
+       }
+
+       /*
         * UIO write loop
         */
        while (uio->uio_resid > 0) {
-               hammer2_key_t nsize;
-               hammer2_key_t off_hi;
-               int fixsize;
-               int off_lo;
-               int n;
+               hammer2_key_t lbase;
+               hammer2_key_t leof;
                int trivial;
-               int endofblk;
-
-               off_hi = uio->uio_offset & ~HAMMER2_LBUFMASK64;
-               off_lo = (int)(uio->uio_offset & HAMMER2_LBUFMASK64);
-
-               n = HAMMER2_LBUFSIZE - off_lo;
-               if (n > uio->uio_resid) {
-                       n = uio->uio_resid;
-                       endofblk = 0;
-               } else {
-                       endofblk = 1;
-               }
-               nsize = uio->uio_offset + n;
-
-               /* XXX bigwrite & signal check test */
+               int lblksize;
+               int loff;
+               int n;
 
                /*
                 * Don't allow the buffer build to blow out the buffer
@@ -666,22 +666,35 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag)
                        hammer2_chain_lock(ip->hmp, &ip->chain);
                }
 
+               /* XXX bigwrite & signal check test */
+
                /*
-                * Extend the size of the file as needed
-                * XXX lock.
+                * This nominally tells us how much we can cluster and
+                * what the logical buffer size needs to be.  Currently
+                * we don't try to cluster the write and just handle one
+                * block at a time.
                 */
-               if (nsize > ip->ip_data.size) {
-                       if (uio->uio_offset > ip->ip_data.size)
-                               trivial = 0;
-                       else
+               lblksize = hammer2_calc_logical(ip, uio->uio_offset,
+                                               &lbase, &leof);
+               loff = (int)(uio->uio_offset - lbase);
+
+               /*
+                * Calculate bytes to copy this transfer and whether the
+                * copy completely covers the buffer or not.
+                */
+               trivial = 0;
+               n = lblksize - loff;
+               if (n > uio->uio_resid) {
+                       n = uio->uio_resid;
+                       if (uio->uio_offset + n == ip->ip_data.size)
                                trivial = 1;
-                       hammer2_extend_file(ip, nsize, trivial);
-                       kflags |= NOTE_EXTEND;
-                       fixsize = 1;
-               } else {
-                       fixsize = 0;
+               } else if (loff == 0) {
+                       trivial = 1;
                }
 
+               /*
+                * Get the buffer
+                */
                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
@@ -690,81 +703,178 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag)
                         *
                         * This case is used by vop_stdputpages().
                         */
-                       bp = getblk(ip->vp, off_hi,
-                                   HAMMER2_LBUFSIZE, GETBLK_BHEAVY, 0);
+                       bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
-                               error = bread(ip->vp, off_hi,
-                                             HAMMER2_LBUFSIZE, &bp);
+                               error = bread(ip->vp, lbase, lblksize, &bp);
                        }
-               } else if (off_lo == 0 && uio->uio_resid >= HAMMER2_LBUFSIZE) {
+               } else if (trivial) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
-                       bp = getblk(ip->vp, off_hi,
-                                   HAMMER2_LBUFSIZE, GETBLK_BHEAVY, 0);
+                       bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
-               } else if (off_hi >= ip->ip_data.size) {
-                       /*
-                        * If the base offset of the buffer is beyond the
-                        * file EOF, we don't have to issue a read.
-                        */
-                       bp = getblk(ip->vp, off_hi,
-                                   HAMMER2_LBUFSIZE, GETBLK_BHEAVY, 0);
-                       vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
+                        *
+                        * (The strategy code will detect zero-fill physical
+                        * blocks for this case).
                         */
-                       error = bread(ip->vp, off_hi, HAMMER2_LBUFSIZE, &bp);
+                       error = bread(ip->vp, lbase, lblksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
 
-               if (error == 0) {
-                       /* release lock */
-                       error = uiomove(bp->b_data + off_lo, n, uio);
-                       /* acquire lock */
+               if (error) {
+                       brelse(bp);
+                       break;
                }
 
+               /*
+                * We have to assign physical storage to the buffer we intend
+                * to dirty or write now to avoid deadlocks in the strategy
+                * code later.
+                *
+                * This can return NOOFFSET for inode-embedded data.  The
+                * strategy code will take care of it in that case.
+                */
+               bp->b_bio2.bio_offset =
+                       hammer2_assign_physical(ip, lbase, lblksize, &error);
                if (error) {
                        brelse(bp);
-                       if (fixsize)
-                               hammer2_truncate_file(ip, ip->ip_data.size);
                        break;
                }
+
+               /*
+                * Ok, copy the data in
+                */
+               hammer2_chain_unlock(ip->hmp, &ip->chain);
+               error = uiomove(bp->b_data + loff, n, uio);
+               hammer2_chain_lock(ip->hmp, &ip->chain);
                kflags |= NOTE_WRITE;
-               if (ip->ip_data.size < uio->uio_offset)
-                       ip->ip_data.size = uio->uio_offset;
+
+               if (error) {
+                       brelse(bp);
+                       break;
+               }
+
                /* XXX update ino_data.mtime */
 
                /*
                 * Once we dirty a buffer any cached offset becomes invalid.
                 */
-               bp->b_bio2.bio_offset = NOOFFSET;
                bp->b_flags |= B_AGE;
                if (ioflag & IO_SYNC) {
                        bwrite(bp);
-               } else if ((ioflag & IO_DIRECT) && endofblk) {
-                       bawrite(bp);
+               } else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
+                       bdwrite(bp);
                } else if (ioflag & IO_ASYNC) {
                        bawrite(bp);
                } else {
                        bdwrite(bp);
                }
        }
+
+       /*
+        * Cleanup.  If we extended the file EOF but failed to write through,
+        * the entire write is a failure and we have to back up.
+        */
+       if (error && ip->ip_data.size != old_eof)
+               hammer2_truncate_file(ip, old_eof);
        /* hammer2_knote(ip->vp, kflags); */
        return error;
 }
 
 /*
- * Truncate the size of a file.  The inode must be locked and marked
- * for modification.  The caller will set ip->ip_data.size after we
- * return, we do not do it ourselves.
+ * Assign physical storage to a logical block.
+ *
+ * NOOFFSET is returned if the data is inode-embedded.  In this case the
+ * strategy code will simply bcopy() the data into the inode.
+ *
+ * ip       - inode whose chain (ip->chain) is used as the lookup parent.
+ * lbase    - logical base offset of the block, used as the lookup key.
+ * lblksize - logical block size in bytes.  An existing DATA chain must
+ *            have exactly this size or we panic.
+ * errorp   - set to 0 on entry.  Nothing in this function sets it to
+ *            anything else at present.
+ *
+ * Returns the chain's bref.data_off with the HAMMER2_OFF_MASK_RADIX bits
+ * masked off, or NOOFFSET for inode-embedded data.  hammer2_write_file()
+ * stores the result in bp->b_bio2.bio_offset.
+ */
+static
+hammer2_off_t
+hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
+                       int lblksize, int *errorp)
+{
+       hammer2_mount_t *hmp;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       hammer2_off_t pbase;
+
+       *errorp = 0;
+       hmp = ip->hmp;
+
+       /*
+        * Locate the chain associated with lbase, return a locked chain.
+        * However, do not instantiate any data reference (which utilizes a
+        * device buffer) because we will be using direct IO via the
+        * logical buffer cache buffer.
+        *
+        * The parent is ref'd and locked here and released by the
+        * hammer2_chain_put() at the bottom.  The lookup is passed
+        * &parent, so it may presumably replace the parent pointer
+        * (TODO confirm against hammer2_chain_lookup()).
+        */
+       parent = &ip->chain;
+       hammer2_chain_ref(hmp, parent);
+       hammer2_chain_lock(hmp, parent);
+
+       chain = hammer2_chain_lookup(hmp, &parent,
+                                    lbase, lbase,
+                                    HAMMER2_LOOKUP_NODATA);
+
+       if (chain == NULL) {
+               /*
+                * We found a hole, create a new chain entry.  No meta-data
+                * buffer or data pointer will be assigned (indicating
+                * new, unwritten storage).
+                *
+                * NOTE(review): the result of hammer2_chain_create() is
+                * dereferenced without a NULL check, yet the put below
+                * is guarded by if (chain).  Confirm that create cannot
+                * return NULL here.
+                */
+               chain = hammer2_chain_create(hmp, parent, NULL,
+                                            lbase, HAMMER2_PBUFRADIX,
+                                            HAMMER2_BREF_TYPE_DATA,
+                                            lblksize);
+               pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
+       } else {
+               switch (chain->bref.type) {
+               case HAMMER2_BREF_TYPE_INODE:
+                       /*
+                        * The data is embedded in the inode
+                        */
+                       hammer2_chain_modify(hmp, chain);
+                       pbase = NOOFFSET;
+                       break;
+               case HAMMER2_BREF_TYPE_DATA:
+                       if (chain->bytes != lblksize) {
+                               panic("hammer2_assign_physical: "
+                                     "size mismatch %d/%d\n",
+                                     lblksize, chain->bytes);
+                       }
+                       hammer2_chain_modify_quick(hmp, chain); /* presumably marks modified without resolving chain->data - confirm */
+                       pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
+                       break;
+               default:
+                       panic("hammer2_assign_physical: bad type");
+                       /* NOT REACHED */
+                       pbase = NOOFFSET;
+                       break;
+               }
+       }
+
+       if (chain)
+               hammer2_chain_put(hmp, chain);
+       hammer2_chain_put(hmp, parent);
+
+       return (pbase);
+}
+
+/*
+ * Truncate the size of a file.
+ *
+ * This routine adjusts ip->ip_data.size smaller, destroying any related
+ * data beyond the new EOF and potentially resizing the block straddling
+ * the EOF.
+ *
+ * The inode must be locked.
  */
 static
 void
@@ -773,22 +883,32 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_mount_t *hmp = ip->hmp;
-       hammer2_key_t pkey;
+       hammer2_key_t lbase;
+       hammer2_key_t leof;
+       struct buf *bp;
+       int loff;
        int error;
-       int nradix;
-       int nbytes;
+       int oblksize;
+       int nblksize;
+
+       hammer2_chain_modify(hmp, &ip->chain);
+       bp = NULL;
 
        /*
-        * Destroy any logical buffer cache buffers beyond the file EOF
-        * and partially clean out any straddling buffer.
+        * Destroy any logical buffer cache buffers beyond the file EOF.
+        *
+        * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
+        * around with the buffer straddling EOF, because we need to assign
+        * a new physical offset to it.
         */
        if (ip->vp) {
                nvtruncbuf(ip->vp, nsize,
-                          HAMMER2_LBUFSIZE, nsize & HAMMER2_LBUFMASK);
+                          HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
+                          1);
        }
 
        /*
-        * Setup for lookup/next
+        * Setup for lookup/search
         */
        parent = &ip->chain;
        hammer2_chain_ref(hmp, parent);
@@ -800,47 +920,97 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        }
 
        /*
-        * Any straddling logical block will have been dirtied, so the
-        * zeroing of any partial logical data is already handled.
-        *
-        * However any data beyond the logical block size present for the
-        * straddling physical block has to be erased.
-        *
-        * This can be done simply by rewriting the blockref to indicate a
-        * smaller physical allocation.  We do not have to reallocate the
-        * physical block.
+        * Handle the case where a chain/logical-buffer straddles the new
+        * EOF.  We told nvtruncbuf() above not to mess with the logical
+        * buffer straddling the EOF because we need to reassign its storage
+        * and can't let the strategy code do it for us.
         */
-       pkey = nsize & ~HAMMER2_PBUFMASK64;
+       loff = (int)nsize & HAMMER2_PBUFMASK;
+       if (loff && ip->vp) {
+               oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
+               error = bread(ip->vp, lbase, oblksize, &bp);
+               KKASSERT(error == 0);
+       }
+       ip->ip_data.size = nsize;
+       nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
 
        /*
-        * Calculate the physical block allocation for the old block
-        * and the new block.
-        *
-        * A radix of 0 is returned if nsize is 0.
-        * The returned radix may be smaller than HAMMER_LBUFSIZE.
+        * Fixup the chain element.  If we have a logical buffer in-hand
+        * we don't want to create a conflicting device buffer.
         */
-       nradix = hammer2_inode_calc_alloc(nsize);
-       nbytes = 1 << nradix;
-
-       if (nbytes) {
-               chain = hammer2_chain_lookup(hmp, &parent, pkey, pkey,
+       if (loff && bp) {
+               chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
                                             HAMMER2_LOOKUP_NOLOCK);
-               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
-                   chain->bytes > nbytes) {
-                       hammer2_chain_lock(hmp, chain);
-                       hammer2_chain_resize(hmp, chain, nradix);
-                       hammer2_chain_unlock(hmp, chain);
+               if (chain) {
+                       allocbuf(bp, nblksize);
+                       switch(chain->bref.type) {
+                       case HAMMER2_BREF_TYPE_DATA:
+                               hammer2_chain_resize_quick(hmp, chain,
+                                            hammer2_bytes_to_radix(nblksize));
+                               hammer2_chain_modify_quick(hmp, chain);
+                               bzero(bp->b_data + loff, nblksize - loff);
+                               bp->b_bio2.bio_offset = chain->bref.data_off &
+                                                       HAMMER2_OFF_MASK;
+                               break;
+                       case HAMMER2_BREF_TYPE_INODE:
+                               bzero(bp->b_data + loff, nblksize - loff);
+                               bp->b_bio2.bio_offset = NOOFFSET;
+                               break;
+                       default:
+                               panic("hammer2_truncate_file: bad type");
+                               break;
+                       }
+                       hammer2_chain_drop(hmp, chain);
+                       bdwrite(bp);
+               } else {
+                       /*
+                        * Destroy clean buffer w/ wrong buffer size.  Retain
+                        * backing store.
+                        */
+                       bp->b_flags |= B_RELBUF;
+                       KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
+                       KKASSERT((bp->b_flags & B_DIRTY) == 0);
+                       bqrelse(bp);
+               }
+       } else if (loff) {
+               chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase, 0);
+               if (chain) {
+                       switch(chain->bref.type) {
+                       case HAMMER2_BREF_TYPE_DATA:
+                               hammer2_chain_resize(hmp, chain,
+                                            hammer2_bytes_to_radix(nblksize));
+                               hammer2_chain_modify(hmp, chain);
+                               bzero(chain->data->buf + loff, nblksize - loff);
+                               break;
+                       case HAMMER2_BREF_TYPE_INODE:
+                               if (loff < HAMMER2_EMBEDDED_BYTES) {
+                                       hammer2_chain_modify(hmp, chain);
+                                       bzero(chain->data->ipdata.u.data + loff,
+                                             HAMMER2_EMBEDDED_BYTES - loff);
+                               }
+                               break;
+                       }
+                       hammer2_chain_put(hmp, chain);
                }
-               hammer2_chain_drop(hmp, chain);
        }
 
        /*
-        * Destroy any physical blocks after the new EOF point.
+        * Clean up any fragmentary VM pages now that we have properly
+        * resized the straddling buffer.  These pages are no longer
+        * part of the buffer.
         */
-       pkey = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
+       if (ip->vp) {
+               nvtruncbuf(ip->vp, nsize,
+                          nblksize, (int)nsize & (nblksize - 1),
+                          1);
+       }
 
+       /*
+        * Destroy any physical blocks after the new EOF point.
+        */
+       lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
        chain = hammer2_chain_lookup(hmp, &parent,
-                                    pkey, (hammer2_key_t)-1,
+                                    lbase, (hammer2_key_t)-1,
                                     HAMMER2_LOOKUP_NOLOCK);
        while (chain) {
                /*
@@ -857,55 +1027,131 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                }
                /* XXX check parent if empty indirect block & delete */
                chain = hammer2_chain_next(hmp, &parent, chain,
-                                          pkey, (hammer2_key_t)-1,
+                                          lbase, (hammer2_key_t)-1,
                                           HAMMER2_LOOKUP_NOLOCK);
        }
        hammer2_chain_put(hmp, parent);
 }
 
 /*
- * Extend the size of a file.  The inode must be locked and marked
- * for modification.  The caller will set ip->ip_data.size after we
- * return, we do not do it ourselves.
+ * Extend the size of a file.  The inode must be locked.
  *
- * We don't bother resizing the block straddling EOF until the
- * strategy write commits a related buffer.  In otherwords,
- * zero-fill is implied.
+ * We may have to resize the block straddling the old EOF.
  */
 static
 void
-hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize, int trivial)
+hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
 {
+       hammer2_mount_t *hmp;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
        struct buf *bp;
+       hammer2_key_t osize;
+       hammer2_key_t obase;
+       hammer2_key_t nbase;
+       hammer2_key_t leof;
+       int oblksize;
+       int nblksize;
+       int nradix;
        int error;
 
+       KKASSERT(ip->vp);
+       hmp = ip->hmp;
+
+       hammer2_chain_modify(hmp, &ip->chain);
+
        /*
-        * Disable direct-data mode if necessary.  It's better to do this
-        * here than to try to code it in the strategy routine.
-        *
-        * No other action on the physical blockmap for the file is required
-        * when simply resizing a file.  All other actions will be handled by
-        * any necessary block reallocation in the strategy write code.
+        * Nothing to do if the direct-data case is still intact
         */
        if ((ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
-           nsize > HAMMER2_EMBEDDED_BYTES) {
-               error = bread(ip->vp, 0, HAMMER2_LBUFSIZE, &bp);
+           nsize <= HAMMER2_EMBEDDED_BYTES) {
+               ip->ip_data.size = nsize;
+               return;
+       }
+
+       /*
+        * Calculate the blocksize at the original EOF and resize the block
+        * if necessary.  Adjust the file size in the inode.
+        */
+       osize = ip->ip_data.size;
+       oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
+       ip->ip_data.size = nsize;
+       nblksize = hammer2_calc_logical(ip, osize, &nbase, &leof);
+
+       /*
+        * Do all required vnode operations, but do not mess with the
+        * buffer straddling the original EOF.
+        */
+       nvextendbuf(ip->vp,
+                   ip->ip_data.size, nsize,
+                   0, nblksize,
+                   0, (int)nsize & HAMMER2_PBUFMASK,
+                   1);
+
+       /*
+        * Early return if we have no more work to do.
+        */
+       if (obase == nbase && oblksize == nblksize &&
+           (ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
+               return;
+       }
+
+       /*
+        * We have work to do, including possibly resizing the buffer
+        * at the EOF point and turning off DIRECTDATA mode.
+        */
+       bp = NULL;
+       if (((int)osize & HAMMER2_PBUFMASK)) {
+               error = bread(ip->vp, obase, oblksize, &bp);
                KKASSERT(error == 0);
+
+               if (obase != nbase) {
+                       allocbuf(bp, HAMMER2_PBUFSIZE);
+               } else {
+                       allocbuf(bp, nblksize);
+               }
+               vfs_bio_clrbuf(bp);
+       }
+
+       /*
+        * Disable direct-data mode by loading up a buffer cache buffer
+        * with the data, then converting the inode data area into the
+        * inode indirect block array area.
+        */
+       if (ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
                ip->ip_data.op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
-               bzero(&ip->ip_data.u.blockset,
-                     sizeof(ip->ip_data.u.blockset));
-               bdwrite(bp);
+               bzero(&ip->ip_data.u.blockset, sizeof(ip->ip_data.u.blockset));
        }
 
        /*
-        * This will fix up the logical buffers
+        * Resize the chain element at the old EOF.
         */
-       if (ip->vp) {
-               nvextendbuf(ip->vp, ip->ip_data.size, nsize,
-                           HAMMER2_LBUFSIZE, HAMMER2_LBUFSIZE,
-                           (int)(ip->ip_data.size & HAMMER2_LBUFMASK),
-                           (int)(nsize & HAMMER2_LBUFMASK),
-                           trivial);
+       if (((int)osize & HAMMER2_PBUFMASK)) {
+               parent = &ip->chain;
+               hammer2_chain_ref(hmp, parent);
+               error = hammer2_chain_lock(hmp, parent);
+               KKASSERT(error == 0);
+
+               nradix = hammer2_bytes_to_radix(nblksize);
+
+               chain = hammer2_chain_lookup(hmp, &parent,
+                                            obase, obase,
+                                            HAMMER2_LOOKUP_NOLOCK);
+               if (chain == NULL) {
+                       chain = hammer2_chain_create(hmp, parent, NULL,
+                                                    obase, nblksize,
+                                                    HAMMER2_BREF_TYPE_DATA,
+                                                    nradix);
+               } else {
+                       KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
+                       hammer2_chain_resize_quick(hmp, chain, nradix);
+                       hammer2_chain_modify_quick(hmp, chain);
+               }
+               bp->b_bio2.bio_offset = chain->bref.data_off &
+                                       HAMMER2_OFF_MASK;
+               hammer2_chain_drop(hmp, chain);
+               bdwrite(bp);
+               hammer2_chain_put(hmp, parent);
        }
 }
 
@@ -1659,11 +1905,7 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
        hammer2_inode_t *ip;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
-       hammer2_key_t pkey;
-       int poff;
-       int ddlen = 0;          /* direct data shortcut */
-       char *ddata = NULL;
-       int didlock;
+       hammer2_key_t lbase;
 
        bio = ap->a_bio;
        bp = bio->bio_buf;
@@ -1671,11 +1913,9 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
        hmp = ip->hmp;
        nbio = push_bio(bio);
 
-       KKASSERT((bio->bio_offset & HAMMER2_LBUFMASK64) == 0);
-       pkey = bio->bio_offset & HAMMER2_OFF_MASK_HI;
-       poff = bio->bio_offset & HAMMER2_OFF_MASK_LO;
+       lbase = bio->bio_offset;
        chain = NULL;
-       didlock = 0;
+       KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
 
        /*
         * We must characterize the logical->physical translation if it
@@ -1694,7 +1934,7 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
                 * chain element's content.  We just need the block device
                 * offset.
                 */
-               chain = hammer2_chain_lookup(hmp, &parent, pkey, pkey,
+               chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
                                             HAMMER2_LOOKUP_NOLOCK);
                if (chain == NULL) {
                        /*
@@ -1703,43 +1943,19 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
                        nbio->bio_offset = ZFOFFSET;
                } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        /*
-                        * Data is embedded in the inode
-                        *
-                        * leave nbio->bio_offset set to NOOFFSET
+                        * Data is embedded in the inode (do nothing)
                         */
-                       ddata = chain->data->ipdata.u.data;
-                       ddlen = HAMMER2_EMBEDDED_BYTES;
                        KKASSERT(chain == parent);
+                       hammer2_chain_drop(hmp, chain);
                } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                        /*
                         * Data is on-media
-                        *
-                        * We can set nbio->bio_offset only if the entire
-                        * request can be handled by the chain element.  That
-                        * is, the chain element's storage size is at least
-                        * the size of the IO.  Hopefully this will also
-                        * read-ahead.
-                        *
-                        * For now double-buffer reads for small files in
-                        * order to take advantage of clustering in the
-                        * physical device layer.
                         */
-                       if (chain->bytes >= bp->b_bcount &&
-                           ip->ip_data.size >= HAMMER2_PBUFSIZE / 2) {
-                               /*
-                                * Direct read
-                                */
-                               nbio->bio_offset = (chain->bref.data_off &
-                                                   HAMMER2_OFF_MASK) + poff;
-                       } else {
-                               /*
-                                * Double-buffer through physical buffer
-                                */
-                               hammer2_chain_lock(hmp, chain);
-                               ddata = chain->data->buf;
-                               ddlen = chain->bytes;
-                               didlock = 1;
-                       }
+                       KKASSERT(bp->b_bcount == chain->bytes);
+                       nbio->bio_offset = chain->bref.data_off &
+                                          HAMMER2_OFF_MASK;
+                       hammer2_chain_drop(hmp, chain);
+                       KKASSERT(nbio->bio_offset != 0);
                } else {
                        panic("hammer2_strategy_read: unknown bref type");
                }
@@ -1752,60 +1968,24 @@ hammer2_strategy_read(struct vop_strategy_args *ap)
                 */
                bp->b_resid = 0;
                bp->b_error = 0;
-               vfs_bio_clrbuf(bp);
+               bzero(bp->b_data, bp->b_bcount);
                biodone(nbio);
        } else if (nbio->bio_offset != NOOFFSET) {
                /*
-                * Direct IO is possible
+                * Forward direct IO to the device
                 */
-               if (chain) {
-                       hammer2_chain_drop(hmp, chain);
-                       chain = NULL;
-               }
                vn_strategy(hmp->devvp, nbio);
-       } else if (ddata) {
+       } else {
                /*
-                * We can immediately supply the data
+                * Data is embedded in inode.
                 */
-               if (poff >= ddlen) {
-                       bzero(bp->b_data, bp->b_bcount);
-               } else {
-                       ddlen -= poff;
-                       ddata += poff;
-
-                       if (ddlen > bp->b_bcount)
-                               ddlen = bp->b_bcount;
-                       bcopy(ddata, bp->b_data, ddlen);
-                       if (ddlen < bp->b_bcount)
-                               bzero(bp->b_data + ddlen, bp->b_bcount - ddlen);
-               }
+               bcopy(chain->data->ipdata.u.data, bp->b_data,
+                     HAMMER2_EMBEDDED_BYTES);
+               bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
+                     bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
                bp->b_resid = 0;
                bp->b_error = 0;
                biodone(nbio);
-       } else {
-               panic("hammer2_strategy_read: illegal state");
-       }
-
-       /*
-        * Clean up the chain.
-        *
-        * We set CHAIN_IOFLUSH to try to get rid of excess double-buffered
-        * data.  If the chain isn't dirty no flush will actually occur, but
-        * the underlying bp will be released.
-        */
-       if (chain) {
-               if (didlock) {
-#if 1
-                       if (((int)bio->bio_offset & HAMMER2_PBUFMASK) ==
-                           HAMMER2_PBUFSIZE - HAMMER2_LBUFSIZE) {
-                               atomic_set_int(&chain->flags,
-                                              HAMMER2_CHAIN_IOFLUSH);
-                       }
-#endif
-                       hammer2_chain_unlock(hmp, chain);
-               }
-               hammer2_chain_drop(hmp, chain);
-               chain = NULL;
        }
        return (0);
 }
@@ -1819,14 +1999,6 @@ hammer2_strategy_write(struct vop_strategy_args *ap)
        struct bio *nbio;
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
-       hammer2_chain_t *parent;
-       hammer2_chain_t *chain;
-       hammer2_key_t pkey;
-       hammer2_key_t pkey_eof;
-       int poff;
-       int radix;
-       char *ddata = NULL;
-       size_t ddlen = 0;
 
        bio = ap->a_bio;
        bp = bio->bio_buf;
@@ -1834,111 +2006,24 @@ hammer2_strategy_write(struct vop_strategy_args *ap)
        hmp = ip->hmp;
        nbio = push_bio(bio);
 
-       /*
-        * Our bmap doesn't support writes atm, and a vop_write should
-        * clear the physical disk offset cache for the copy-on-write
-        * operation.
-        */
-       KKASSERT((bio->bio_offset & HAMMER2_LBUFMASK64) == 0);
-       KKASSERT(nbio->bio_offset == NOOFFSET);
-
-       pkey = bio->bio_offset & HAMMER2_OFF_MASK_HI;
-       pkey_eof = ip->ip_data.size & HAMMER2_OFF_MASK_HI;
-       poff = bio->bio_offset & HAMMER2_OFF_MASK_LO;
-
-       if (pkey < pkey_eof)
-               radix = HAMMER2_PBUFRADIX;
-       else
-               radix = hammer2_inode_calc_alloc(ip->ip_data.size);
+       KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
+       KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);
 
-       /*
-        * Locate the physical block
-        */
-       parent = &ip->chain;
-       hammer2_chain_ref(hmp, parent);
-       hammer2_chain_lock(hmp, parent);
-
-       /*
-        * XXX implement NODATA flag to avoid instantiating bp if
-        * it isn't already present for direct-write implementation.
-        */
-       chain = hammer2_chain_lookup(hmp, &parent, pkey, pkey, 0);
-
-       /*
-        * Allocate a zero-fill block, resize an existing block if necessary.
-        * Modify the chain element to reallocate the block.
-        */
-       if (chain == NULL) {
-               /*
-                * Allocate a new chain as necessary
-                */
-               KKASSERT(radix > 0);
-               chain = hammer2_chain_create(hmp, parent, NULL,
-                                            pkey, HAMMER2_PBUFRADIX,
-                                            HAMMER2_BREF_TYPE_DATA,
-                                            (size_t)1 << radix);
-               ddata = chain->data->buf;
-               ddlen = chain->bytes;
-       } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
+       if (nbio->bio_offset == NOOFFSET) {
                /*
-                * The data is embedded in the inode
+                * Must be embedded in the inode.
                 */
-               hammer2_chain_modify(hmp, chain);
-               ddata = chain->data->ipdata.u.data;
-               ddlen = HAMMER2_EMBEDDED_BYTES;
-       } else if (chain->bytes != 1 << radix) {
+               KKASSERT(bio->bio_offset == 0);
+               bcopy(bp->b_data, ip->ip_data.u.data, HAMMER2_EMBEDDED_BYTES);
+               bp->b_resid = 0;
+               bp->b_error = 0;
+               biodone(nbio);
+       } else {
                /*
-                * If the existing physical allocation is not properly
-                * sized we have to resize it.
+                * Forward direct IO to the device
                 */
-               hammer2_chain_resize(hmp, chain, radix);
-               hammer2_chain_modify(hmp, chain);
-               ddata = chain->data->buf;
-               ddlen = chain->bytes;
-       } else {
-               hammer2_chain_modify(hmp, chain);
-               ddata = chain->data->buf;
-               ddlen = chain->bytes;
-       }
-
-       if (poff >= ddlen) {
-               /* lbuf beyond file EOF?  Do nothing */
-       } else {
-               ddlen -= poff;
-               ddata += poff;
-
-               if (ddlen > bp->b_bcount)       /* urmm.  partial beyond EOF? */
-                       ddlen = bp->b_bcount;
-               bcopy(bp->b_data, ddata, ddlen);
-               if (ddlen < bp->b_bcount)
-                       ;/* urmm.  Shouldn't be possible, LBUF < PBUF */
-       }
-
-       /*
-        * We set CHAIN_IOFLUSH to try to get rid of excess double-buffered
-        * data.  This will cause a bawrite() to be issued instead of a
-        * bdwrite() when the last chunk in a physical buffer is being put
-        * away.
-        *
-        * Only do this for data elements.  There isn't much point doing
-        * this for other types (e.g. embedded data in inode) because the
-        * hashes in the blockref's are not likely to be updated yet.
-        */
-#if 1
-       if (chain->bref.type == HAMMER2_BREF_TYPE_DATA &&
-           poff + bp->b_bcount == HAMMER2_PBUFSIZE) {
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_IOFLUSH);
+               vn_strategy(hmp->devvp, nbio);
        }
-#endif
-
-       hammer2_chain_put(hmp, chain);
-       hammer2_chain_put(hmp, parent);
-
-       bp->b_flags |= B_RELBUF;
-       bp->b_resid = 0;
-       bp->b_error = 0;
-       biodone(nbio);
-
        return (0);
 }