HAMMER VFS - Implement async I/O for double-buffer strategy case
author    Matthew Dillon <dillon@apollo.backplane.com>
          Sun, 10 Apr 2011 15:46:04 +0000 (08:46 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
          Sun, 10 Apr 2011 15:46:04 +0000 (08:46 -0700)
* When vfs.hammer.double_buffer is enabled the HAMMER strategy code
  was running synchronously.  This created numerous problems, including
  extra stalls when read-ahead is issued.

* Use the new breadcb() function to allow nominal double_buffer strategy
  operations to run asynchronously.  Essentially the original buffer and
  CRC are recorded in the device bio and the copyback is made in the
  callback (sketched below).

* This improves performance when vfs.hammer.double_buffer is enabled.
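
The callback handoff works roughly as follows.  This is a minimal
userspace sketch of the pattern, not the kernel code: orig_req,
crc32_sim() and read_callback() are illustrative stand-ins for the
original bio, the kernel's crc32() and the breadcb() callback.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct orig_req {                       /* stands in for the original bio */
        uint8_t         data[64];
        uint32_t        expect_crc;     /* cf. bio_caller_info1.uvalue32 */
        int             check_crc;      /* cf. bio_caller_info2.index */
        int             error;
};

static uint32_t
crc32_sim(const uint8_t *p, size_t n)   /* toy stand-in for crc32() */
{
        uint32_t crc = 0;

        while (n--)
                crc = (crc << 5) + crc + *p++;
        return(crc);
}

/*
 * Completion callback: verify the recorded CRC, then copy the device
 * data back into the original buffer (the "copyback").
 */
static void
read_callback(struct orig_req *oreq, const uint8_t *devbuf, size_t n)
{
        if (oreq->check_crc && oreq->expect_crc != crc32_sim(devbuf, n))
                oreq->error = 5;        /* EIO analogue */
        else
                memcpy(oreq->data, devbuf, n);
}

int
main(void)
{
        uint8_t device[64] = "data from the device buffer cache";
        struct orig_req oreq = { .check_crc = 1 };

        oreq.expect_crc = crc32_sim(device, sizeof(device));
        /* a real breadcb() would fire this from I/O completion instead */
        read_callback(&oreq, device, sizeof(device));
        printf("error=%d data=%s\n", oreq.error, oreq.data);
        return(0);
}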

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_vnops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index a0bb98e..11f20b4 100644
@@ -1428,6 +1428,8 @@ void hammer_io_waitdep(struct hammer_io *io);
 void hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush);
 int hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
                        hammer_btree_leaf_elm_t leaf);
+int hammer_io_indirect_read(hammer_mount_t hmp, struct bio *bio,
+                       hammer_btree_leaf_elm_t leaf);
 int hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio,
                        hammer_record_t record);
 void hammer_io_direct_wait(hammer_record_t record);
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index 38f689d..de02111 100644
@@ -56,6 +56,7 @@
 
 static void hammer_io_modify(hammer_io_t io, int count);
 static void hammer_io_deallocate(struct buf *bp);
+static void hammer_indirect_callback(struct bio *bio);
 #if 0
 static void hammer_io_direct_read_complete(struct bio *nbio);
 #endif
@@ -1483,6 +1484,147 @@ done:
        return(error);
 }
 
+/*
+ * This works similarly to hammer_io_direct_read() except that instead
+ * of reading directly from the device into the bio, we read indirectly
+ * through the device's buffer cache and then copy the data into the
+ * bio.
+ *
+ * If leaf is non-NULL and validation is enabled, the CRC will be checked.
+ *
+ * This routine also executes asynchronously, so hammer strategy calls
+ * remain asynchronous in double_buffer mode just as they are in normal
+ * mode.
+ */
+int
+hammer_io_indirect_read(hammer_mount_t hmp, struct bio *bio,
+                       hammer_btree_leaf_elm_t leaf)
+{
+       hammer_off_t buf_offset;
+       hammer_off_t zone2_offset;
+       hammer_volume_t volume;
+       struct buf *bp;
+       int vol_no;
+       int error;
+
+       buf_offset = bio->bio_offset;
+       KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
+                HAMMER_ZONE_LARGE_DATA);
+
+       /*
+        * The buffer cache may have an aliased buffer (the reblocker can
+        * write them).  If it does we have to sync any dirty data before
+        * we can build our direct-read.  This is a non-critical code path.
+        */
+       bp = bio->bio_buf;
+       hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
+
+       /*
+        * Resolve to a zone-2 offset.  The conversion just requires
+        * munging the top 4 bits but we want to abstract it anyway
+        * so the blockmap code can verify the zone assignment.
+        */
+       zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
+       if (error)
+               goto done;
+       KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
+                HAMMER_ZONE_RAW_BUFFER);
+
+       /*
+        * Resolve volume and raw-offset for 3rd level bio.  The
+        * offset will be specific to the volume.
+        */
+       vol_no = HAMMER_VOL_DECODE(zone2_offset);
+       volume = hammer_get_volume(hmp, vol_no, &error);
+       if (error == 0 && zone2_offset >= volume->maxbuf_off)
+               error = EIO;
+
+       if (error == 0) {
+               /*
+                * Convert to the raw volume->devvp offset and acquire
+                * the buf, issuing async I/O if necessary.
+                */
+               buf_offset = volume->ondisk->vol_buf_beg +
+                            (zone2_offset & HAMMER_OFF_SHORT_MASK);
+
+               if (leaf && hammer_verify_data) {
+                       bio->bio_caller_info1.uvalue32 = leaf->data_crc;
+                       bio->bio_caller_info2.index = 1;
+               } else {
+                       bio->bio_caller_info2.index = 0;
+               }
+               breadcb(volume->devvp, buf_offset, bp->b_bufsize,
+                       hammer_indirect_callback, bio);
+       }
+       hammer_rel_volume(volume, 0);
+done:
+       if (error) {
+               kprintf("hammer_direct_read: failed @ %016llx\n",
+                       (long long)zone2_offset);
+               bp->b_error = error;
+               bp->b_flags |= B_ERROR;
+               biodone(bio);
+       }
+       return(error);
+}
+
+/*
+ * Indirect callback on completion.  bio/bp specify the device-backed
+ * buffer.  bio->bio_caller_info1.ptr holds obio.
+ *
+ * obio/obp is the original regular file buffer.  obio->bio_caller_info*
+ * contains the crc specification.
+ *
+ * We are responsible for calling bpdone() and bqrelse() on bio/bp, and
+ * for calling biodone() on obio.
+ */
+static void
+hammer_indirect_callback(struct bio *bio)
+{
+       struct buf *bp = bio->bio_buf;
+       struct buf *obp;
+       struct bio *obio;
+
+       /*
+        * If BIO_DONE is already set the device buffer was already
+        * fully valid (B_CACHE).  If it is not set then I/O was issued
+        * and we have to run I/O completion as the last bio.
+        *
+        * Nobody is waiting for our device I/O to complete, so we are
+        * responsible for bqrelse()ing it, which means we also have to
+        * do the equivalent of biowait() and clear BIO_DONE (which
+        * breadcb() may have set).
+        *
+        * Any preexisting device buffer should match the requested size,
+        * but due to bigblock recycling and other factors there is some
+        * fragility there, so we assert that the device buffer covers
+        * the request.
+        */
+       if ((bio->bio_flags & BIO_DONE) == 0)
+               bpdone(bp, 0);
+       bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
+
+       obio = bio->bio_caller_info1.ptr;
+       obp = obio->bio_buf;
+
+       if (bp->b_flags & B_ERROR) {
+               obp->b_flags |= B_ERROR;
+               obp->b_error = bp->b_error;
+       } else if (obio->bio_caller_info2.index &&
+                  obio->bio_caller_info1.uvalue32 !=
+                   crc32(bp->b_data, bp->b_bufsize)) {
+               obp->b_flags |= B_ERROR;
+               obp->b_error = EIO;
+       } else {
+               KKASSERT(bp->b_bufsize >= obp->b_bufsize);
+               bcopy(bp->b_data, obp->b_data, obp->b_bufsize);
+               obp->b_resid = 0;
+               obp->b_flags |= B_AGE;
+       }
+       biodone(obio);
+       bqrelse(bp);
+}
+
 #if 0
 /*
  * On completion of the BIO this callback must check the data CRC
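
A subtlety in hammer_indirect_callback() above: a breadcb()-style
callback can be invoked either on a cache hit (buffer already valid,
done-flag already set) or from real I/O completion, in which case the
callback is the last stage and must finish the buffer itself; either
way it then consumes the done state because nobody will biowait().  A
stripped-down sketch of that discipline, with hypothetical names (DONE
mirrors BIO_DONE, finish() mirrors bpdone(); this is not kernel code):

#include <assert.h>

#define DONE    0x1                     /* mirrors BIO_DONE */

struct dreq {
        int     flags;
};

static void
finish(struct dreq *r)                  /* mirrors bpdone() */
{
        r->flags |= DONE;
}

static void
completion_callback(struct dreq *r)
{
        /*
         * Cache hit: the helper already completed the request before
         * invoking us.  I/O path: we must run completion ourselves.
         */
        if ((r->flags & DONE) == 0)
                finish(r);

        /*
         * Nobody will wait on this request, so consume the done state
         * before releasing the buffer.
         */
        r->flags &= ~DONE;
}

int
main(void)
{
        struct dreq cached = { .flags = DONE }; /* B_CACHE-style hit */
        struct dreq fresh  = { .flags = 0 };    /* real I/O completion */

        completion_callback(&cached);
        completion_callback(&fresh);
        assert(cached.flags == 0 && fresh.flags == 0);
        return(0);
}
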
diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c
index de931f3..01d1fa1 100644
@@ -101,6 +101,9 @@ RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
  * code on failure.  Volumes must be loaded at mount time, get_volume() will
  * not load a new volume.
  *
+ * The passed devvp is vref()'d but not locked.  This function consumes the
+ * ref (typically by associating it with the volume structure).
+ *
  * Calls made to hammer_load_volume() or single-threaded
  */
 int
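
The new comment above documents a reference-consumption contract: the
caller vref()s devvp, and the volume-load path owns that reference
from then on, even on error.  A generic sketch of such a contract
(hypothetical names, not HAMMER code):

#include <assert.h>

struct node {
        int     refs;
};

static void
node_ref(struct node *n)
{
        n->refs++;
}

static void
node_unref(struct node *n)
{
        assert(n->refs > 0);
        n->refs--;
}

/*
 * Consumes the caller's reference: on success the ref is handed to
 * *slot, on failure it is dropped here.  Either way the caller must
 * not release it afterwards.
 */
static int
node_install(struct node **slot, struct node *n, int fail)
{
        if (fail) {
                node_unref(n);          /* error path still eats the ref */
                return(-1);
        }
        *slot = n;                      /* ref now owned by *slot */
        return(0);
}

int
main(void)
{
        struct node n = { .refs = 0 };
        struct node *slot = 0;

        node_ref(&n);                   /* caller takes the ref... */
        if (node_install(&slot, &n, 0) == 0)
                assert(slot->refs == 1);/* ...and the callee now owns it */
        return(0);
}
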
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index df6b893..183ff53 100644
@@ -2700,13 +2700,25 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
         * device.
         */
        nbio = push_bio(bio);
-       if (hammer_double_buffer == 0 &&
-           (nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
+       if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
            HAMMER_ZONE_LARGE_DATA) {
-               lwkt_gettoken(&hmp->fs_token);
-               error = hammer_io_direct_read(hmp, nbio, NULL);
-               lwkt_reltoken(&hmp->fs_token);
-               return (error);
+               if (hammer_double_buffer == 0) {
+                       lwkt_gettoken(&hmp->fs_token);
+                       error = hammer_io_direct_read(hmp, nbio, NULL);
+                       lwkt_reltoken(&hmp->fs_token);
+                       return (error);
+               }
+
+               /*
+                * Try to shortcut requests for double_buffer mode too.
+                * Since this mode runs through the device buffer cache,
+                * only compatible buffer sizes (meaning those generated
+                * by normal filesystem buffers) are legal.
+                */
+               if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
+                       error = hammer_io_indirect_read(hmp, nbio, NULL);
+                       return (error);
+               }
        }
 
        /*
@@ -2851,6 +2863,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                 * The buffer on-disk should be zerod past any real
                 * truncation point, but may not be for any synthesized
                 * truncation point from above.
+                *
+                * NOTE: disk_offset is only valid if the cursor data is
+                *       on-disk.
                 */
                disk_offset = cursor.leaf->data_offset + roff;
                isdedupable = (boff == 0 && n == bp->b_bufsize &&
@@ -2858,6 +2873,9 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                               ((int)disk_offset & HAMMER_BUFMASK) == 0);
 
                if (isdedupable && hammer_double_buffer == 0) {
+                       /*
+                        * Direct read case
+                        */
                        KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
                                 HAMMER_ZONE_LARGE_DATA);
                        nbio->bio_offset = disk_offset;
@@ -2865,6 +2883,21 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                        if (hammer_live_dedup && error == 0)
                                hammer_dedup_cache_add(ip, cursor.leaf);
                        goto done;
+               } else if (isdedupable) {
+                       /*
+                        * Async I/O case for reading from backing store
+                        * and copying the data to the filesystem buffer.
+                        * live-dedup has to verify the data anyway if it
+                        * gets a hit later so we can just add the entry
+                        * now.
+                        */
+                       KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
+                                HAMMER_ZONE_LARGE_DATA);
+                       nbio->bio_offset = disk_offset;
+                       if (hammer_live_dedup)
+                               hammer_dedup_cache_add(ip, cursor.leaf);
+                       error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
+                       goto done;
                } else if (n) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error == 0) {
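
Taken together, the front-end shortcut added to
hammer_vop_strategy_read() amounts to the following decision.  This is
a simplified sketch with illustrative names only; the real function
also handles small-data, dedup-cache and synthesized-truncation cases:

#include <stdio.h>

enum read_path { READ_DIRECT, READ_INDIRECT, READ_BUFFERED };

/*
 * Large-data zone buffers bypass the HAMMER buffer cache entirely when
 * double buffering is off; with double buffering on, compatible
 * requests take the new async indirect path; everything else falls
 * back to the full B-Tree lookup loop.
 */
static enum read_path
choose_read_path(int is_large_data, int double_buffer, int live_dedup,
                 int paging_buf)
{
        if (!is_large_data)
                return(READ_BUFFERED);
        if (double_buffer == 0)
                return(READ_DIRECT);    /* hammer_io_direct_read() */
        if (live_dedup == 0 && paging_buf == 0)
                return(READ_INDIRECT);  /* hammer_io_indirect_read() */
        return(READ_BUFFERED);
}

int
main(void)
{
        printf("%d %d %d\n",
               choose_read_path(1, 0, 0, 0),    /* direct */
               choose_read_path(1, 1, 0, 0),    /* indirect */
               choose_read_path(1, 1, 1, 0));   /* buffered */
        return(0);
}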