HAMMER 61F/Many: Stabilization w/ simultaneous pruning and reblocking
author Matthew Dillon <dillon@dragonflybsd.org>
Mon, 14 Jul 2008 03:20:49 +0000 (03:20 +0000)
committer Matthew Dillon <dillon@dragonflybsd.org>
Mon, 14 Jul 2008 03:20:49 +0000 (03:20 +0000)
* BUG FIX: When doing a direct-read, check whether any device buffers
  are aliasing the disk block and flush any we find that are dirty.
  This ensures that reblocked data gets to disk before a direct-read
  tries to read it FROM the disk.

* BUG FIX: Fix a bug introduced in a recent commit where the flusher
  will not always completely flush the UNDO FIFO or completely flush
  all meta-data, resulting in a rollback after a normal umount/mount.

* BUG FIX: Direct-writes queue I/O independent of the in-memory record.
  When the backend flusher flushes the record, making it available in the
  B-Tree, make sure that the independent I/O has completed.  Otherwise
  a later reblocking operation might read the media before the direct-write
  has actually completed.  (See the interlock sketch after this list.)

* BUG FIX: In-memory records are not subject to direct-IO, since their
  data is not yet on the media.

* BUG FIX: Do not allow mount to succeed unless all volumes have been found.
  (Reported-by: Sascha Wildner <saw@online.de>)

* BUG FIX: The bd_heatup() call in the reblocker was in the wrong place,
  potentially causing the cursor to shift unexpectedly.

* Rework some of the buffer invalidation code by enhancing
  the reservation code.

* Add read CRC verification logic for some direct-reads, but comment it out
  because the VM system's bogus-page replacement breaks it.
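
For reference, the direct-write fixes above hinge on one interlock: a record
with RECF_DIRECT_IO set may not be committed to the B-Tree or destroyed until
its in-flight I/O completes.  A minimal userspace sketch of that handshake,
with pthreads standing in for the kernel's tsleep()/wakeup() and a bare
struct standing in for hammer_record (all names below are illustrative
stand-ins, not the kernel API):

#include <pthread.h>

#define RECF_DIRECT_IO   0x0200  /* direct I/O still in flight */
#define RECF_DIRECT_WAIT 0x0400  /* a committer is waiting on it */

struct record {
        int             flags;
        pthread_mutex_t lock;
        pthread_cond_t  cv;
};

/* I/O completion side: clear DIRECT_IO and wake any waiter. */
static void direct_write_complete(struct record *rec)
{
        pthread_mutex_lock(&rec->lock);
        rec->flags &= ~RECF_DIRECT_IO;
        if (rec->flags & RECF_DIRECT_WAIT) {
                rec->flags &= ~RECF_DIRECT_WAIT;
                pthread_cond_broadcast(&rec->cv);
        }
        pthread_mutex_unlock(&rec->lock);
}

/* Commit/destroy side: block until the related direct I/O has
 * finished, so the B-Tree never references data that is not yet
 * on the media. */
static void direct_wait(struct record *rec)
{
        pthread_mutex_lock(&rec->lock);
        while (rec->flags & RECF_DIRECT_IO) {
                rec->flags |= RECF_DIRECT_WAIT;
                pthread_cond_wait(&rec->cv, &rec->lock);
        }
        pthread_mutex_unlock(&rec->lock);
}

In the kernel the same handshake is open-coded with tsleep()/wakeup() on
&record->flags; see hammer_io_direct_write_complete() and
hammer_io_direct_wait() in the hammer_io.c diff below.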

12 files changed:
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_blockmap.c
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_mount.h
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_prune.c
sys/vfs/hammer/hammer_reblock.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 0fee1f1..5da87ba 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.116 2008/07/13 09:32:48 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.117 2008/07/14 03:20:49 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -409,10 +409,12 @@ typedef struct hammer_record *hammer_record_t;
 #define HAMMER_RECF_ONRBTREE           0x0002
 #define HAMMER_RECF_DELETED_FE         0x0004  /* deleted (frontend) */
 #define HAMMER_RECF_DELETED_BE         0x0008  /* deleted (backend) */
-#define HAMMER_RECF_UNUSED0010         0x0010
+#define HAMMER_RECF_COMMITTED          0x0010  /* committed to the B-Tree */
 #define HAMMER_RECF_INTERLOCK_BE       0x0020  /* backend interlock */
 #define HAMMER_RECF_WANTED             0x0040  /* wanted by the frontend */
-#define HAMMER_RECF_CONVERT_DELETE     0x0100 /* special case */
+#define HAMMER_RECF_CONVERT_DELETE     0x0100  /* special case */
+#define HAMMER_RECF_DIRECT_IO          0x0200  /* related direct I/O running*/
+#define HAMMER_RECF_DIRECT_WAIT                0x0400  /* waiting on direct I/O*/
 
 /*
  * hammer_delete_at_cursor() flags
@@ -618,6 +620,7 @@ struct hammer_reserve {
        int             flags;
        int             refs;
        int             zone;
+       int             bytes_freed;
        int             append_off;
        hammer_off_t    zone_offset;
 };
@@ -768,7 +771,7 @@ extern int hammer_debug_btree;
 extern int hammer_debug_tid;
 extern int hammer_debug_recover;
 extern int hammer_debug_recover_faults;
-extern int hammer_debug_cluster_enable;
+extern int hammer_cluster_enable;
 extern int hammer_count_fsyncs;
 extern int hammer_count_inodes;
 extern int hammer_count_iqueued;
@@ -798,6 +801,7 @@ extern int hammer_limit_iqueued;
 extern int hammer_limit_recs;
 extern int hammer_bio_count;
 extern int hammer_verify_zone;
+extern int hammer_verify_data;
 extern int hammer_write_mode;
 extern int64_t hammer_contention_count;
 
@@ -821,6 +825,7 @@ int hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused);
 
 int    hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused);
 int    hammer_install_volume(hammer_mount_t hmp, const char *volname);
+int    hammer_mountcheck_volumes(hammer_mount_t hmp);
 
 int    hammer_ip_lookup(hammer_cursor_t cursor);
 int    hammer_ip_first(hammer_cursor_t cursor);
@@ -947,7 +952,10 @@ hammer_volume_t    hammer_get_volume(hammer_mount_t hmp,
                        int32_t vol_no, int *errorp);
 hammer_buffer_t        hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
                        int bytes, int isnew, int *errorp);
-void           hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset,
+void           hammer_sync_buffers(hammer_mount_t hmp,
+                       hammer_off_t base_offset, int bytes);
+void           hammer_del_buffers(hammer_mount_t hmp,
+                       hammer_off_t base_offset,
                        hammer_off_t zone2_offset, int bytes);
 
 int            hammer_ref_volume(hammer_volume_t volume);
@@ -994,6 +1002,8 @@ hammer_off_t hammer_blockmap_alloc(hammer_transaction_t trans, int zone,
                        int bytes, int *errorp);
 hammer_reserve_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone,
                        int bytes, hammer_off_t *zone_offp, int *errorp);
+void hammer_blockmap_reserve_undo(hammer_reserve_t resv,
+                       hammer_off_t zone_offset, int bytes);
 void hammer_blockmap_reserve_complete(hammer_mount_t hmp,
                        hammer_reserve_t resv);
 void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv);
@@ -1074,11 +1084,14 @@ int hammer_io_new(struct vnode *devvp, struct hammer_io *io);
 void hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset);
 struct buf *hammer_io_release(struct hammer_io *io, int flush);
 void hammer_io_flush(struct hammer_io *io);
+void hammer_io_wait(struct hammer_io *io);
 void hammer_io_waitdep(struct hammer_io *io);
 void hammer_io_wait_all(hammer_mount_t hmp, const char *ident);
-int hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio);
-int hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
-                         struct bio *bio);
+int hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
+                       hammer_btree_leaf_elm_t leaf);
+int hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record,
+                       struct bio *bio);
+void hammer_io_direct_wait(hammer_record_t record);
 void hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf);
 void hammer_io_write_interlock(hammer_io_t io);
 void hammer_io_done_interlock(hammer_io_t io);
@@ -1123,6 +1136,8 @@ int  hammer_flusher_meta_halflimit(hammer_mount_t hmp);
 int  hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter);
 void hammer_flusher_clean_loose_ios(hammer_mount_t hmp);
 void hammer_flusher_finalize(hammer_transaction_t trans, int final);
+int  hammer_flusher_haswork(hammer_mount_t hmp);
+
 
 int hammer_recover(hammer_mount_t hmp, hammer_volume_t rootvol);
 void hammer_recover_flush_buffers(hammer_mount_t hmp,
diff --git a/sys/vfs/hammer/hammer_blockmap.c b/sys/vfs/hammer/hammer_blockmap.c
index 47037f4..6d117d6 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.23 2008/07/03 04:24:51 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.24 2008/07/14 03:20:49 dillon Exp $
  */
 
 /*
@@ -532,6 +532,17 @@ failed:
        return(resv);
 }
 
+/*
+ * Backend function - undo a portion of a reservation.
+ */
+void
+hammer_blockmap_reserve_undo(hammer_reserve_t resv,
+                        hammer_off_t zone_offset, int bytes)
+{
+       resv->bytes_freed += bytes;
+}
+
+
 /*
  * A record with a storage reservation calls this function when it is
  * being freed.  The storage may or may not have actually been allocated.
@@ -542,9 +553,37 @@ failed:
 void
 hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv)
 {
+       hammer_off_t zone2_offset;
+
        KKASSERT(resv->refs > 0);
        if (--resv->refs == 0) {
                KKASSERT((resv->flags & HAMMER_RESF_ONDELAY) == 0);
+
+               zone2_offset = (resv->zone_offset & ~HAMMER_OFF_ZONE_MASK) |
+                               HAMMER_ZONE_RAW_BUFFER;
+
+               /*
+                * If we are releasing a zone and all of its reservations
+                * were undone we have to clean out all hammer and device
+                * buffers associated with the big block.
+                *
+                * Any direct allocations will cause this test to fail
+                * (bytes_freed will never reach append_off), which is
+                * the behavior we desire.  Once the zone has been assigned
+                * to the big-block the only way to allocate from it in the
+                * future is if the reblocker can completely clean it out,
+                * and that will also properly call hammer_del_buffers().
+                *
+                * If we don't we risk all sorts of buffer cache aliasing
+                * effects, including overlapping buffers with different
+                * sizes.
+                */
+               if (resv->bytes_freed == resv->append_off) {
+                       kprintf("U");
+                       hammer_del_buffers(hmp, resv->zone_offset,
+                                          zone2_offset,
+                                          HAMMER_LARGEBLOCK_SIZE);
+               }
                RB_REMOVE(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
                kfree(resv, M_HAMMER);
                --hammer_count_reservations;
@@ -710,9 +749,14 @@ again:
                         * the next flush cycle so potentially undoable
                         * data is not overwritten.
                         */
-                       if (hammer_reserve_setdelay(hmp, resv, base_off))
+                       if (hammer_reserve_setdelay(hmp, NULL, base_off))
                                goto again;
                        KKASSERT(layer2->zone == zone);
+                       /*
+                        * XXX maybe incorporate this del call in the
+                        * release code by setting base_offset, bytes_freed,
+                        * etc.
+                        */
                        hammer_del_buffers(hmp,
                                           zone_offset &
                                              ~HAMMER_LARGEBLOCK_MASK64,
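
To make the bytes_freed test in hammer_blockmap_reserve_complete() above
concrete: append_off records how far allocations have advanced into the
big-block, and hammer_blockmap_reserve_undo() adds the length of every
uncommitted record back to bytes_freed.  A toy model of the release-time
decision (simplified fields, not kernel code):

#include <stdio.h>

struct reserve {
        int refs;
        int append_off;   /* total bytes handed out from the big-block */
        int bytes_freed;  /* total bytes subsequently undone */
};

/* Called as each record holding part of the reservation is
 * destroyed without having been committed to the B-Tree. */
static void reserve_undo(struct reserve *resv, int bytes)
{
        resv->bytes_freed += bytes;
}

/* On final release: if every allocated byte was undone, nothing in
 * the big-block ever reached the B-Tree, so all aliased hammer and
 * device buffers covering it can be invalidated (the
 * hammer_del_buffers() call in the patch). */
static int reserve_complete(struct reserve *resv)
{
        if (--resv->refs == 0)
                return (resv->bytes_freed == resv->append_off);
        return 0;
}

int main(void)
{
        struct reserve resv = { .refs = 2, .append_off = 0, .bytes_freed = 0 };

        resv.append_off += 16384;       /* two 16K direct-write records */
        resv.append_off += 16384;

        reserve_undo(&resv, 16384);     /* both destroyed uncommitted */
        reserve_complete(&resv);
        reserve_undo(&resv, 16384);
        printf("del_buffers: %d\n", reserve_complete(&resv)); /* prints 1 */
        return 0;
}

Any record that was committed never calls the undo, so bytes_freed stays
short of append_off and the buffers are left alone, which is the desired
behavior described in the comment above.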
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index aad7b3c..ec5a92b 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.39 2008/07/13 09:32:48 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.40 2008/07/14 03:20:49 dillon Exp $
  */
 /*
  * HAMMER dependency flusher thread
@@ -353,10 +353,10 @@ hammer_flusher_flush(hammer_mount_t hmp)
        }
 
        /*
-        * We may have pure meta-data to flush, even if there were no
-        * flush groups.
+        * We may have pure meta-data to flush, or we may have to finish
+        * cycling the UNDO FIFO, even if there were no flush groups.
         */
-       if (count == 0 && hmp->locked_dirty_space) {
+       if (count == 0 && hammer_flusher_haswork(hmp)) {
                hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
                hammer_flusher_finalize(&hmp->flusher.trans, 1);
                hammer_done_transaction(&hmp->flusher.trans);
@@ -673,7 +673,12 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
         */
        if (final) {
                cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
-               cundomap->first_offset = cundomap->next_offset;
+               if (cundomap->first_offset == cundomap->next_offset) {
+                       hmp->hflags &= ~HMNT_UNDO_DIRTY;
+               } else {
+                       cundomap->first_offset = cundomap->next_offset;
+                       hmp->hflags |= HMNT_UNDO_DIRTY;
+               }
                hammer_clear_undo_history(hmp);
        }
 
@@ -701,6 +706,12 @@ hammer_flusher_meta_limit(hammer_mount_t hmp)
        return(0);
 }
 
+/*
+ * Return non-zero if too many dirty meta-data buffers have built up.
+ *
+ * This version is used by background operations (mirror, prune, reblock)
+ * to leave room for foreground operations.
+ */
 int
 hammer_flusher_meta_halflimit(hammer_mount_t hmp)
 {
@@ -711,3 +722,21 @@ hammer_flusher_meta_halflimit(hammer_mount_t hmp)
        return(0);
 }
 
+/*
+ * Return non-zero if the flusher still has something to flush.
+ */
+int
+hammer_flusher_haswork(hammer_mount_t hmp)
+{
+       if (TAILQ_FIRST(&hmp->flush_group_list) ||      /* dirty inodes */
+           TAILQ_FIRST(&hmp->volu_list) ||             /* dirty buffers */
+           TAILQ_FIRST(&hmp->undo_list) ||
+           TAILQ_FIRST(&hmp->data_list) ||
+           TAILQ_FIRST(&hmp->meta_list) ||
+           (hmp->hflags & HMNT_UNDO_DIRTY)             /* UNDO FIFO sync */
+       ) {
+               return(1);
+       }
+       return(0);
+}
+
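
Why the HMNT_UNDO_DIRTY flag makes unmount converge: the finalization pass
that catches the UNDO FIFO up also rewrites first_offset, and that rewrite
itself still has to reach the media, so one more pass is required before
hammer_flusher_haswork() can report idle.  A toy simulation of the
convergence (simplified stand-ins, not the kernel structures):

#include <stdio.h>

struct undo_fifo {
        long first_offset;   /* oldest undo still covered */
        long next_offset;    /* append point */
        int  undo_dirty;     /* HMNT_UNDO_DIRTY analog */
};

/* One finalization pass, mirroring the patched logic above. */
static void finalize(struct undo_fifo *u)
{
        if (u->first_offset == u->next_offset) {
                u->undo_dirty = 0;      /* FIFO fully caught up */
        } else {
                u->first_offset = u->next_offset;
                u->undo_dirty = 1;      /* updated header not yet on disk */
        }
}

static int haswork(const struct undo_fifo *u)
{
        return u->undo_dirty;   /* plus the dirty-buffer lists in-kernel */
}

int main(void)
{
        struct undo_fifo u = { .first_offset = 0, .next_offset = 4096,
                               .undo_dirty = 0 };
        int passes = 0;

        while (passes == 0 || haswork(&u)) {  /* like the umount loop */
                finalize(&u);
                ++passes;
        }
        printf("clean after %d passes\n", passes);  /* 2 if undos pending */
        return 0;
}

This is what the old fixed pair of hammer_flusher_sync() calls in
hammer_free_hmp() implicitly relied on; the patch replaces them with an
explicit loop over hammer_flusher_haswork() (see the hammer_vfsops.c diff).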
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index 05e1c0f..bcdbb39 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.102 2008/07/13 09:32:48 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.103 2008/07/14 03:20:49 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1041,7 +1041,7 @@ retry:
                 * destroy it whether we succeed or fail.
                 */
                record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
-               record->flags |= HAMMER_RECF_DELETED_FE;
+               record->flags |= HAMMER_RECF_DELETED_FE | HAMMER_RECF_COMMITTED;
                record->flush_state = HAMMER_FST_IDLE;
                hammer_rel_mem_record(record);
 
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index 5705b7b..30099a3 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.48 2008/06/29 07:50:40 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.49 2008/07/14 03:20:49 dillon Exp $
  */
 /*
  * IO Primitives and buffer cache management
 
 static void hammer_io_modify(hammer_io_t io, int count);
 static void hammer_io_deallocate(struct buf *bp);
+#if 0
+static void hammer_io_direct_read_complete(struct bio *nbio);
+#endif
+static void hammer_io_direct_write_complete(struct bio *nbio);
 static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
 
 /*
@@ -117,7 +121,7 @@ hammer_io_disassociate(hammer_io_structure_t iou)
 /*
  * Wait for any physical IO to complete
  */
-static void
+void
 hammer_io_wait(hammer_io_t io)
 {
        if (io->running) {
@@ -937,14 +941,22 @@ struct bio_ops hammer_bioops = {
 
 /*
  * Read a buffer associated with a front-end vnode directly from the
- * disk media.  The bio may be issued asynchronously.
+ * disk media.  The bio may be issued asynchronously.  If leaf is non-NULL
+ * we validate the CRC.
  *
  * A second-level bio already resolved to a zone-2 offset (typically by
  * the BMAP code, or by a previous hammer_io_direct_write()), is passed. 
+ *
+ * We must check for the presence of a HAMMER buffer to handle the case
+ * where the reblocker has rewritten the data (which it does via the HAMMER
+ * buffer system, not via the high-level vnode buffer cache), but not yet
+ * committed the buffer to the media. 
  */
 int
-hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio)
+hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
+                     hammer_btree_leaf_elm_t leaf)
 {
+       hammer_off_t buf_offset;
        hammer_off_t zone2_offset;
        hammer_volume_t volume;
        struct buf *bp;
@@ -952,34 +964,61 @@ hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio)
        int vol_no;
        int error;
 
-       zone2_offset = bio->bio_offset;
+       buf_offset = bio->bio_offset;
+       KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
+                HAMMER_ZONE_LARGE_DATA);
+
+       /*
+        * The buffer cache may have an aliased buffer (the reblocker can
+        * write them).  If it does we have to sync any dirty data before
+        * we can build our direct-read.  This is a non-critical code path.
+        */
+       bp = bio->bio_buf;
+       hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
 
+       /*
+        * Resolve to a zone-2 offset.  The conversion just requires
+        * munging the top 4 bits but we want to abstract it anyway
+        * so the blockmap code can verify the zone assignment.
+        */
+       zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
+       if (error)
+               goto done;
        KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
                 HAMMER_ZONE_RAW_BUFFER);
 
+       /*
+        * Resolve volume and raw-offset for 3rd level bio.  The
+        * offset will be specific to the volume.
+        */
        vol_no = HAMMER_VOL_DECODE(zone2_offset);
        volume = hammer_get_volume(hmp, vol_no, &error);
        if (error == 0 && zone2_offset >= volume->maxbuf_off)
                error = EIO;
 
-       /*
-        * Third level bio - raw offset specific to the
-        * correct volume.
-        */
        if (error == 0) {
                zone2_offset &= HAMMER_OFF_SHORT_MASK;
 
                nbio = push_bio(bio);
                nbio->bio_offset = volume->ondisk->vol_buf_beg +
                                   zone2_offset;
+#if 0
+               /*
+                * XXX disabled - our CRC check doesn't work if the OS
+                * does bogus_page replacement on the direct-read.
+                */
+               if (leaf && hammer_verify_data) {
+                       nbio->bio_done = hammer_io_direct_read_complete;
+                       nbio->bio_caller_info1.uvalue32 = leaf->data_crc;
+               }
+#endif
                vn_strategy(volume->devvp, nbio);
        }
        hammer_rel_volume(volume, 0);
-
+done:
        if (error) {
                kprintf("hammer_direct_read: failed @ %016llx\n",
                        zone2_offset);
-               bp = bio->bio_buf;
                bp->b_error = error;
                bp->b_flags |= B_ERROR;
                biodone(bio);
@@ -987,14 +1026,45 @@ hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio)
        return(error);
 }
 
+#if 0
+/*
+ * On completion of the BIO this callback must check the data CRC
+ * and chain to the previous bio.
+ */
+static
+void
+hammer_io_direct_read_complete(struct bio *nbio)
+{
+       struct bio *obio;
+       struct buf *bp;
+       u_int32_t rec_crc = nbio->bio_caller_info1.uvalue32;
+
+       bp = nbio->bio_buf;
+       if (crc32(bp->b_data, bp->b_bufsize) != rec_crc) {
+               kprintf("HAMMER: data_crc error @%016llx/%d\n",
+                       nbio->bio_offset, bp->b_bufsize);
+               if (hammer_debug_debug)
+                       Debugger("");
+               bp->b_flags |= B_ERROR;
+               bp->b_error = EIO;
+       }
+       obio = pop_bio(nbio);
+       biodone(obio);
+}
+#endif
+
 /*
  * Write a buffer associated with a front-end vnode directly to the
  * disk media.  The bio may be issued asynchronously.
+ *
+ * The BIO is associated with the specified record and RECF_DIRECT_IO
+ * is set.
  */
 int
-hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
+hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record,
                       struct bio *bio)
 {
+       hammer_btree_leaf_elm_t leaf = &record->leaf;
        hammer_off_t buf_offset;
        hammer_off_t zone2_offset;
        hammer_volume_t volume;
@@ -1028,11 +1098,18 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
                        KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
                        hammer_del_buffers(hmp, buf_offset,
                                           zone2_offset, bp->b_bufsize);
+
                        /*
                         * Second level bio - cached zone2 offset.
+                        *
+                        * (We can put our bio_done function in either the
+                        *  2nd or 3rd level).
                         */
                        nbio = push_bio(bio);
                        nbio->bio_offset = zone2_offset;
+                       nbio->bio_done = hammer_io_direct_write_complete;
+                       nbio->bio_caller_info1.ptr = record;
+                       record->flags |= HAMMER_RECF_DIRECT_IO;
 
                        /*
                         * Third level bio - raw offset specific to the
@@ -1046,7 +1123,11 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
                }
                hammer_rel_volume(volume, 0);
        } else {
-               /* must fit in a standard HAMMER buffer */
+               /* 
+                * Must fit in a standard HAMMER buffer.  In this case all
+                * consumers use the HAMMER buffer system and RECF_DIRECT_IO
+                * does not need to be set-up.
+                */
                KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0);
                buffer = NULL;
                ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
@@ -1073,6 +1154,48 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
        return(error);
 }
 
+/*
+ * On completion of the BIO this callback must disconnect
+ * it from the hammer_record and chain to the previous bio.
+ */
+static
+void
+hammer_io_direct_write_complete(struct bio *nbio)
+{
+       struct bio *obio;
+       hammer_record_t record = nbio->bio_caller_info1.ptr;
+
+       obio = pop_bio(nbio);
+       biodone(obio);
+       KKASSERT(record != NULL && (record->flags & HAMMER_RECF_DIRECT_IO));
+       record->flags &= ~HAMMER_RECF_DIRECT_IO;
+       if (record->flags & HAMMER_RECF_DIRECT_WAIT) {
+               record->flags &= ~HAMMER_RECF_DIRECT_WAIT;
+               wakeup(&record->flags);
+       }
+}
+
+
+/*
+ * This is called before a record is either committed to the B-Tree
+ * or destroyed, to resolve any associated direct-IO.  We must
+ * ensure that the data is available on-media to other consumers
+ * such as the reblocker or mirroring code.
+ *
+ * Note that other consumers might access the data via the block
+ * device's buffer cache and not the high level vnode's buffer cache.
+ */
+void
+hammer_io_direct_wait(hammer_record_t record)
+{
+       crit_enter();
+       while (record->flags & HAMMER_RECF_DIRECT_IO) {
+               record->flags |= HAMMER_RECF_DIRECT_WAIT;
+               tsleep(&record->flags, 0, "hmdiow", 0);
+       }
+       crit_exit();
+}
+
 /*
  * This is called to remove the second-level cached zone-2 offset from
  * frontend buffer cache buffers, now stale due to a data relocation.
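
On the "munging the top 4 bits" comment in hammer_io_direct_read() above:
HAMMER encodes the zone number in the top 4 bits of a 64-bit offset, so the
unverified conversion from a zone-X offset to a raw zone-2 buffer offset is
a single mask-and-or, as done in hammer_blockmap_reserve_complete().  A
sketch assuming that encoding (the constant values below illustrate the
scheme and are not copied from the headers):

#include <stdio.h>
#include <stdint.h>

/* Assumed encoding: zone number in the top 4 bits of the offset. */
#define OFF_ZONE_MASK   0xF000000000000000ULL
#define ZONE_RAW_BUFFER (0x2ULL << 60)  /* zone-2: raw device buffers */
#define ZONE_LARGE_DATA (0xAULL << 60)  /* large-data zone */

/* The cheap conversion: keep the low bits, swap the zone field. */
static uint64_t to_zone2(uint64_t zone_offset)
{
        return (zone_offset & ~OFF_ZONE_MASK) | ZONE_RAW_BUFFER;
}

int main(void)
{
        uint64_t data_off = ZONE_LARGE_DATA | 0x0000000012340000ULL;

        printf("%016llx -> %016llx\n",
               (unsigned long long)data_off,
               (unsigned long long)to_zone2(data_off));
        return 0;
}

The direct-read path itself deliberately goes through
hammer_blockmap_lookup() instead, so the blockmap code can also verify the
zone assignment rather than just rewriting the bits.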
diff --git a/sys/vfs/hammer/hammer_mount.h b/sys/vfs/hammer/hammer_mount.h
index f9f7877..a301291 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_mount.h,v 1.8 2008/07/02 21:57:54 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_mount.h,v 1.9 2008/07/14 03:20:49 dillon Exp $
  */
 
 #ifndef _SYS_TYPES_H_
@@ -58,6 +58,7 @@ struct hammer_mount_info {
 #define HMNT_NOHISTORY 0x00000001
 #define HMNT_MASTERID  0x00000002      /* masterid field set */
 #define HMNT_EXPORTREQ 0x00000004
+#define HMNT_UNDO_DIRTY        0x00000008
 
 #define HMNT_USERFLAGS (HMNT_NOHISTORY | HMNT_MASTERID)
 
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c
index f0988e8..a8baab1 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.89 2008/07/13 09:32:48 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.90 2008/07/14 03:20:49 dillon Exp $
  */
 
 #include "hammer.h"
@@ -340,7 +340,10 @@ hammer_flush_record_done(hammer_record_t record, int error)
 void
 hammer_rel_mem_record(struct hammer_record *record)
 {
-       hammer_inode_t ip, target_ip;
+       hammer_mount_t hmp;
+       hammer_reserve_t resv;
+       hammer_inode_t ip;
+       hammer_inode_t target_ip;
 
        hammer_unref(&record->lock);
 
@@ -354,6 +357,7 @@ hammer_rel_mem_record(struct hammer_record *record)
                 * might possibly block.  hammer_test_inode() can block!
                 */
                ip = record->ip;
+               hmp = ip->hmp;
 
                /*
                 * Upon release of the last reference a record marked deleted
@@ -380,9 +384,9 @@ hammer_rel_mem_record(struct hammer_record *record)
                                          &record->ip->rec_tree,
                                          record);
                                KKASSERT(ip->rsv_recs > 0);
-                               --ip->hmp->rsv_recs;
+                               --hmp->rsv_recs;
                                --ip->rsv_recs;
-                               ip->hmp->rsv_databytes -= record->leaf.data_len;
+                               hmp->rsv_databytes -= record->leaf.data_len;
                                record->flags &= ~HAMMER_RECF_ONRBTREE;
 
                                if (RB_EMPTY(&record->ip->rec_tree)) {
@@ -392,6 +396,14 @@ hammer_rel_mem_record(struct hammer_record *record)
                                }
                        }
 
+                       /*
+                        * We must wait for any direct-IO to complete before
+                        * we can destroy the record.
+                        */
+                       if (record->flags & HAMMER_RECF_DIRECT_IO)
+                               hammer_io_direct_wait(record);
+
+
                        /*
                         * Do this test after removing record from the B-Tree.
                         */
@@ -405,9 +417,20 @@ hammer_rel_mem_record(struct hammer_record *record)
                                kfree(record->data, M_HAMMER);
                                record->flags &= ~HAMMER_RECF_ALLOCDATA;
                        }
-                       if (record->resv) {
-                               hammer_blockmap_reserve_complete(ip->hmp,
-                                                                record->resv);
+
+                       /*
+                        * Release the reservation.  If the record was not
+                        * committed return the reservation before
+                        * releasing it.
+                        */
+                       if ((resv = record->resv) != NULL) {
+                               if ((record->flags & HAMMER_RECF_COMMITTED) == 0) {
+                                       hammer_blockmap_reserve_undo(
+                                               resv,
+                                               record->leaf.data_offset,
+                                               record->leaf.data_len);
+                               }
+                               hammer_blockmap_reserve_complete(hmp, resv);
                                record->resv = NULL;
                        }
                        record->data = NULL;
@@ -1060,6 +1083,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
                        if (error == 0) {
                                record->flags |= HAMMER_RECF_DELETED_FE;
                                record->flags |= HAMMER_RECF_DELETED_BE;
+                               record->flags |= HAMMER_RECF_COMMITTED;
                        }
                }
                goto done;
@@ -1133,6 +1157,13 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
                record->leaf.data_crc = 0;
        }
 
+       /*
+        * If the record's data was direct-written we cannot insert
+        * it until the direct-IO has completed.
+        */
+       if (record->flags & HAMMER_RECF_DIRECT_IO)
+               hammer_io_direct_wait(record);
+
        error = hammer_btree_insert(cursor, &record->leaf, &doprop);
        if (hammer_debug_inode && error)
                kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n", error, cursor->node->node_offset, cursor->index, record->leaf.base.key);
@@ -1161,6 +1192,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        record->flags |= HAMMER_RECF_DELETED_BE;
                }
+               record->flags |= HAMMER_RECF_COMMITTED;
        } else {
                if (record->leaf.data_offset) {
                        hammer_blockmap_free(trans, record->leaf.data_offset,
@@ -1885,7 +1917,6 @@ int
 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip,
                        hammer_tid_t tid)
 {
-       hammer_off_t zone2_offset;
        hammer_record_t iprec;
        hammer_btree_elm_t elm;
        hammer_mount_t hmp;
@@ -1902,26 +1933,14 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip,
         * the interlock.
         *
         * An in-memory record may be deleted before being committed to disk,
-        * but could have been accessed in the mean time.  The backing store
-        * may never been marked allocated and so hammer_blockmap_free() may
-        * never get called on it.  Because of this we have to make sure that
-        * we've gotten rid of any related hammer_buffer or buffer cache
-        * buffer.
+        * but could have been accessed in the mean time.  The reservation
+        * code will deal with the case.
         */
        if (hammer_cursor_inmem(cursor)) {
                iprec = cursor->iprec;
                KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0);
                iprec->flags |= HAMMER_RECF_DELETED_FE;
                iprec->flags |= HAMMER_RECF_DELETED_BE;
-
-               if (iprec->leaf.data_offset && iprec->leaf.data_len) {
-                       zone2_offset = hammer_blockmap_lookup(hmp, iprec->leaf.data_offset, &error);
-                       KKASSERT(error == 0);
-                       hammer_del_buffers(hmp,
-                                          iprec->leaf.data_offset,
-                                          zone2_offset,
-                                          iprec->leaf.data_len);
-               }
                return(0);
        }
 
diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c
index 32d5d9b..8df8525 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.68 2008/07/12 23:04:50 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.69 2008/07/14 03:20:49 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -204,6 +204,7 @@ hammer_install_volume(struct hammer_mount *hmp, const char *volname)
         */
        if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
                hmp->rootvol = volume;
+               hmp->nvolumes = ondisk->vol_count;
                if (bp) {
                        brelse(bp);
                        bp = NULL;
@@ -454,6 +455,20 @@ hammer_rel_volume(hammer_volume_t volume, int flush)
        crit_exit();
 }
 
+int
+hammer_mountcheck_volumes(struct hammer_mount *hmp)
+{
+       hammer_volume_t vol;
+       int i;
+
+       for (i = 0; i < hmp->nvolumes; ++i) {
+               vol = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, i);
+               if (vol == NULL)
+                       return(EINVAL);
+       }
+       return(0);
+}
+
 /************************************************************************
  *                             BUFFERS                                 *
  ************************************************************************
@@ -608,6 +623,44 @@ found:
        return(buffer);
 }
 
+/*
+ * This is used by the direct-read code to deal with large-data buffers
+ * created by the reblocker and mirror-write code.  The direct-read code
+ * bypasses the HAMMER buffer subsystem and so any aliased dirty hammer
+ * buffers must be fully synced to disk before we can issue the direct-read.
+ *
+ * This code path is not considered critical as only the reblocker and
+ * mirror-write code will create large-data buffers via the HAMMER buffer
+ * subsystem.  They do that because they operate at the B-Tree level and
+ * do not access the vnode/inode structures.
+ */
+void
+hammer_sync_buffers(hammer_mount_t hmp, hammer_off_t base_offset, int bytes)
+{
+       hammer_buffer_t buffer;
+       int error;
+
+       KKASSERT((base_offset & HAMMER_OFF_ZONE_MASK) ==
+                HAMMER_ZONE_LARGE_DATA);
+
+       while (bytes > 0) {
+               buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
+                                  base_offset);
+               if (buffer && buffer->io.modified) {
+                       error = hammer_ref_buffer(buffer);
+                       if (error == 0 && buffer->io.modified) {
+                               hammer_io_write_interlock(&buffer->io);
+                               hammer_io_flush(&buffer->io);
+                               hammer_io_done_interlock(&buffer->io);
+                               hammer_io_wait(&buffer->io);
+                               hammer_rel_buffer(buffer, 0);
+                       }
+               }
+               base_offset += HAMMER_BUFSIZE;
+               bytes -= HAMMER_BUFSIZE;
+       }
+}
+
 /*
  * Destroy all buffers covering the specified zoneX offset range.  This
  * is called when the related blockmap layer2 entry is freed or when
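
Note the check/reference/re-check pattern in hammer_sync_buffers() above:
the first io.modified test is a cheap unreferenced peek, and it must be
repeated after hammer_ref_buffer() because the buffer can be flushed or
reused while the reference is being acquired.  Schematically (illustrative
stand-ins, not the kernel types):

#include <stddef.h>

struct buf_obj { int modified; int refs; };

static int buf_ref(struct buf_obj *b)   /* 0 on success */
{
        ++b->refs;
        return 0;
}

static void buf_rel(struct buf_obj *b) { --b->refs; }

static void sync_one(struct buf_obj *b)
{
        if (b != NULL && b->modified) {          /* cheap peek */
                if (buf_ref(b) == 0) {
                        if (b->modified) {       /* authoritative re-check */
                                /* write-interlock, flush, wait for I/O */
                                b->modified = 0;
                        }
                        buf_rel(b);
                }
        }
}

The unreferenced peek is only an optimization to skip clean buffers
cheaply; only the test made while holding the reference is authoritative.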
diff --git a/sys/vfs/hammer/hammer_prune.c b/sys/vfs/hammer/hammer_prune.c
index 29fa2af..0c807a5 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.17 2008/07/13 01:12:41 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.18 2008/07/14 03:20:49 dillon Exp $
  */
 
 #include "hammer.h"
@@ -314,8 +314,10 @@ prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
                      elm->base.localization & HAMMER_LOCALIZE_PSEUDOFS_MASK,
                      0, &error);
        if (ip) {
-               kprintf("pruning disconnected inode %016llx\n",
-                       elm->base.obj_id);
+               if (hammer_debug_general & 0x0001) {
+                       kprintf("pruning disconnected inode %016llx\n",
+                               elm->base.obj_id);
+               }
                hammer_rel_inode(ip, 0);
        } else {
                kprintf("unable to prune disconnected inode %016llx\n",
diff --git a/sys/vfs/hammer/hammer_reblock.c b/sys/vfs/hammer/hammer_reblock.c
index c62892c..727bdea 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.31 2008/07/13 09:32:48 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.32 2008/07/14 03:20:49 dillon Exp $
  */
 /*
  * HAMMER reblocker - This code frees up fragmented physical space
@@ -145,18 +145,6 @@ retry:
                        seq = hammer_flusher_async(trans->hmp, NULL);
                }
 
-               /*
-                * We allocate data buffers, which atm we don't track
-                * dirty levels for because we allow the kernel to write
-                * them.  But if we allocate too many we can still deadlock
-                * the buffer cache.
-                */
-               if (bd_heatup()) {
-                       hammer_unlock_cursor(&cursor, 0);
-                       bwillwrite(HAMMER_BUFSIZE);
-                       hammer_lock_cursor(&cursor, 0);
-               }
-
                /*
                 * Acquiring the sync_lock prevents the operation from
                 * crossing a synchronization boundary.
@@ -174,8 +162,28 @@ retry:
                        hammer_lock_cursor(&cursor, 0);
                        seq = hammer_flusher_async_one(trans->hmp);
                }
+
+               /*
+                * Setup for iteration, our cursor flags may be modified by
+                * other threads while we are unlocked.
+                */
+               cursor.flags |= HAMMER_CURSOR_ATEDISK;
+
+               /*
+                * We allocate data buffers, which atm we don't track
+                * dirty levels for because we allow the kernel to write
+                * them.  But if we allocate too many we can still deadlock
+                * the buffer cache.
+                *
+                * (The cursor's node and element may change!)
+                */
+               if (bd_heatup()) {
+                       hammer_unlock_cursor(&cursor, 0);
+                       bwillwrite(HAMMER_XBUFSIZE);
+                       hammer_lock_cursor(&cursor, 0);
+               }
+
                if (error == 0) {
-                       cursor.flags |= HAMMER_CURSOR_ATEDISK;
                        error = hammer_btree_iterate(&cursor);
                }
 
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index 39a765b..8cee5f1 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.62 2008/07/12 23:04:50 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.63 2008/07/14 03:20:49 dillon Exp $
  */
 
 #include <sys/param.h>
 
 int hammer_debug_io;
 int hammer_debug_general;
-int hammer_debug_debug;
+int hammer_debug_debug = 1;            /* medium-error panics */ 
 int hammer_debug_inode;
 int hammer_debug_locks;
 int hammer_debug_btree;
 int hammer_debug_tid;
 int hammer_debug_recover;              /* -1 will disable, +1 will force */
 int hammer_debug_recover_faults;
-int hammer_debug_cluster_enable = 1;   /* enable read clustering by default */
+int hammer_cluster_enable = 1;         /* enable read clustering by default */
 int hammer_count_fsyncs;
 int hammer_count_inodes;
 int hammer_count_iqueued;
@@ -86,6 +86,7 @@ int hammer_limit_recs;                        /* as a whole XXX */
 int hammer_limit_iqueued;              /* per-mount */
 int hammer_bio_count;
 int hammer_verify_zone;
+int hammer_verify_data = 1;
 int hammer_write_mode;
 int64_t hammer_contention_count;
 int64_t hammer_zone_limit;
@@ -109,8 +110,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_recover, CTLFLAG_RW,
           &hammer_debug_recover, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_recover_faults, CTLFLAG_RW,
           &hammer_debug_recover_faults, 0, "");
-SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_cluster_enable, CTLFLAG_RW,
-          &hammer_debug_cluster_enable, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, cluster_enable, CTLFLAG_RW,
+          &hammer_cluster_enable, 0, "");
 
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
           &hammer_limit_dirtybufspace, 0, "");
@@ -173,6 +174,8 @@ SYSCTL_QUAD(_vfs_hammer, OID_AUTO, contention_count, CTLFLAG_RW,
           &hammer_contention_count, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, verify_zone, CTLFLAG_RW,
           &hammer_verify_zone, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, verify_data, CTLFLAG_RW,
+          &hammer_verify_data, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, write_mode, CTLFLAG_RW,
           &hammer_write_mode, 0, "");
 
@@ -377,7 +380,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
         * Load volumes
         */
        path = objcache_get(namei_oc, M_WAITOK);
-       hmp->nvolumes = info.nvolumes;
+       hmp->nvolumes = -1;
        for (i = 0; i < info.nvolumes; ++i) {
                error = copyin(&info.volumes[i], &upath, sizeof(char *));
                if (error == 0)
@@ -396,6 +399,15 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                kprintf("hammer_mount: No root volume found!\n");
                error = EINVAL;
        }
+
+       /*
+        * Check that all required volumes are available
+        */
+       if (error == 0 && hammer_mountcheck_volumes(hmp)) {
+               kprintf("hammer_mount: Missing volumes, cannot mount!\n");
+               error = EINVAL;
+       }
+
        if (error) {
                hammer_free_hmp(mp);
                return (error);
@@ -554,6 +566,7 @@ static void
 hammer_free_hmp(struct mount *mp)
 {
        struct hammer_mount *hmp = (void *)mp->mnt_data;
+       int count;
 
 #if 0
        /*
@@ -564,8 +577,24 @@ hammer_free_hmp(struct mount *mp)
                hmp->rootvp = NULL;
        }
 #endif
-       hammer_flusher_sync(hmp);
-       hammer_flusher_sync(hmp);
+       count = 0;
+       while (hammer_flusher_haswork(hmp)) {
+               hammer_flusher_sync(hmp);
+               ++count;
+               if (count >= 5) {
+                       if (count == 5)
+                               kprintf("HAMMER: umount flushing.");
+                       else
+                               kprintf(".");
+                       tsleep(hmp, 0, "hmrufl", hz);
+               }
+               if (count == 30) {
+                       kprintf("giving up\n");
+                       break;
+               }
+       }
+       if (count >= 5 && count < 30)
+               kprintf("\n");
        hammer_flusher_destroy(hmp);
 
        KKASSERT(RB_EMPTY(&hmp->rb_inos_root));
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index 750e131..d63d834 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.90 2008/07/12 23:55:22 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.91 2008/07/14 03:20:49 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -238,7 +238,7 @@ hammer_vop_read(struct vop_read_args *ap)
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;
 
-               if (hammer_debug_cluster_enable) {
+               if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
@@ -498,6 +498,14 @@ hammer_vop_write(struct vop_write_args *ap)
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(ip, flags);
 
+               /*
+                * Once we dirty the buffer any cached zone-X offset
+                * becomes invalid.  HAMMER NOTE: no-history mode cannot 
+                * allow overwriting over the same data sector unless
+                * we provide UNDOs for the old data, which we don't.
+                */
+               bp->b_bio2.bio_offset = NOOFFSET;
+
                /*
                 * Final buffer disposition.
                 */
@@ -1766,6 +1774,8 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                                if (error == 0) {
                                        bzero(bp->b_data + offset,
                                              blksize - offset);
+                                       /* must de-cache direct-io offset */
+                                       bp->b_bio2.bio_offset = NOOFFSET;
                                        bdwrite(bp);
                                } else {
                                        kprintf("ERROR %d\n", error);
@@ -2055,8 +2065,8 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
         */
        nbio = push_bio(bio);
        if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
-           HAMMER_ZONE_RAW_BUFFER) {
-               error = hammer_io_direct_read(ip->hmp, nbio);
+           HAMMER_ZONE_LARGE_DATA) {
+               error = hammer_io_direct_read(ip->hmp, nbio, NULL);
                return (error);
        }
 
@@ -2191,16 +2201,15 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                 * truncation point, but may not be for any synthesized
                 * truncation point from above.
                 */
+               disk_offset = cursor.leaf->data_offset + roff;
                if (boff == 0 && n == bp->b_bufsize &&
-                   ((cursor.leaf->data_offset + roff) & HAMMER_BUFMASK) == 0) {
-                       disk_offset = hammer_blockmap_lookup(
-                                               trans.hmp,
-                                               cursor.leaf->data_offset + roff,
-                                               &error);
-                       if (error)
-                               break;
+                   hammer_cursor_ondisk(&cursor) &&
+                   (disk_offset & HAMMER_BUFMASK) == 0) {
+                       KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
+                                HAMMER_ZONE_LARGE_DATA);
                        nbio->bio_offset = disk_offset;
-                       error = hammer_io_direct_read(trans.hmp, nbio);
+                       error = hammer_io_direct_read(trans.hmp, nbio,
+                                                     cursor.leaf);
                        goto done;
                } else if (n) {
                        error = hammer_ip_resolve_data(&cursor);
@@ -2388,21 +2397,20 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
                 * block reset base_offset unless we are already beyond the
                 * requested offset.  If we are, that's it, we stop.
                 */
-               disk_offset = hammer_blockmap_lookup(trans.hmp,
-                                                    cursor.leaf->data_offset,
-                                                    &error);
                if (error)
                        break;
-               if (rec_offset != last_offset ||
-                   disk_offset != last_disk_offset) {
-                       if (rec_offset > ap->a_loffset)
-                               break;
-                       base_offset = rec_offset;
-                       base_disk_offset = disk_offset;
+               if (hammer_cursor_ondisk(&cursor)) {
+                       disk_offset = cursor.leaf->data_offset;
+                       if (rec_offset != last_offset ||
+                           disk_offset != last_disk_offset) {
+                               if (rec_offset > ap->a_loffset)
+                                       break;
+                               base_offset = rec_offset;
+                               base_disk_offset = disk_offset;
+                       }
+                       last_offset = rec_offset + rec_len;
+                       last_disk_offset = disk_offset + rec_len;
                }
-               last_offset = rec_offset + rec_len;
-               last_disk_offset = disk_offset + rec_len;
-
                error = hammer_ip_next(&cursor);
        }
 
@@ -2451,14 +2459,22 @@ hammer_vop_bmap(struct vop_bmap_args *ap)
         */
        disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
 
-       /*
-        * If doffsetp is not aligned or the forward run size does
-        * not cover a whole buffer, disallow the direct I/O.
-        */
-       if ((disk_offset & HAMMER_BUFMASK) ||
-           (last_offset - ap->a_loffset) < blksize) {
+       if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
+               /*
+                * Only large-data zones can be direct-IOd
+                */
+               error = EOPNOTSUPP;
+       } else if ((disk_offset & HAMMER_BUFMASK) ||
+                  (last_offset - ap->a_loffset) < blksize) {
+               /*
+                * doffsetp is not aligned or the forward run size does
+                * not cover a whole buffer, disallow the direct I/O.
+                */
                error = EOPNOTSUPP;
        } else {
+               /*
+                * We're good.
+                */
                *ap->a_doffsetp = disk_offset;
                if (ap->a_runb) {
                        *ap->a_runb = ap->a_loffset - base_offset;
@@ -2544,7 +2560,7 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
        record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
                                    bytes, &error);
        if (record) {
-               hammer_io_direct_write(hmp, &record->leaf, bio);
+               hammer_io_direct_write(hmp, record, bio);
                hammer_rel_mem_record(record);
                if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
                        hammer_flush_inode(ip, 0);