HAMMER VFS - REDO implementation base code part 4/many
author     Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 13 Jan 2010 00:01:00 +0000 (16:01 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 13 Jan 2010 00:46:12 +0000 (16:46 -0800)
* Wait for BIOs to finish before issuing the media sync.  Previously we
  issued the media sync concurrently, which worked only because the
  operation was serialized by the disk driver.

  This change has no effect on performance.

* For fsync mode 3, wait for BIOs to finish so the data is guaranteed
  to at least be in the drive cache.

* Collapse hammer_io_wait_firm() into hammer_io_wait_all().

* Pipeline hammer_io_wait_all().  Instead of waiting for the running_io
  count to hit 0, which can cause us to wait longer than necessary when
  multiple entities are dirtying buffers, we now place all running I/Os
  on a list along with a dummy entry for the waiter.  When the dummy entry
  becomes the head of the list the waiter returns.

  This way new I/Os initiated during the wait do not contribute to
  the wait (see the sketch after this change list).

  In particular, this will improve fsync() operations, which can flush the
  UNDO/REDO FIFO in parallel with a full meta-data flush.
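
The placemarker mechanism is easiest to see in isolation.  Below is a
minimal userland sketch of the idea, assuming the <sys/queue.h> TAILQ
macros; a pthread mutex/condvar stand in for the kernel's
crit_enter()/tsleep()/wakeup(), and the names (struct io, io_start(),
io_complete(), io_wait_all()) are illustrative, not HAMMER's own:

/*
 * Minimal sketch of the placemarker-based wait used to pipeline
 * hammer_io_wait_all() (userland approximation, not kernel code).
 */
#include <sys/queue.h>
#include <pthread.h>
#include <stddef.h>

enum io_type { IO_REAL, IO_DUMMY };

struct io {
    enum io_type    type;
    TAILQ_ENTRY(io) entry;          /* iorun list linkage */
};

static TAILQ_HEAD(, io) iorun_list = TAILQ_HEAD_INITIALIZER(iorun_list);
static pthread_mutex_t  iorun_mtx  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t   iorun_cv   = PTHREAD_COND_INITIALIZER;

/* A write has been issued: track it on the in-flight list. */
static void
io_start(struct io *io)
{
    io->type = IO_REAL;
    pthread_mutex_lock(&iorun_mtx);
    TAILQ_INSERT_TAIL(&iorun_list, io, entry);
    pthread_mutex_unlock(&iorun_mtx);
}

/* Completion path: drop the entry; if a placemarker is now at the
 * head of the list, wake its waiter. */
static void
io_complete(struct io *io)
{
    struct io *head;

    pthread_mutex_lock(&iorun_mtx);
    TAILQ_REMOVE(&iorun_list, io, entry);
    head = TAILQ_FIRST(&iorun_list);
    if (head != NULL && head->type == IO_DUMMY)
        pthread_cond_broadcast(&iorun_cv);
    pthread_mutex_unlock(&iorun_mtx);
}

/* Wait only for I/Os already in flight at the time of the call.
 * I/Os queued after our placemarker land behind it and do not
 * extend the wait. */
static void
io_wait_all(void)
{
    struct io marker = { .type = IO_DUMMY };
    struct io *next;

    pthread_mutex_lock(&iorun_mtx);
    if (TAILQ_EMPTY(&iorun_list)) {
        pthread_mutex_unlock(&iorun_mtx);
        return;
    }
    TAILQ_INSERT_TAIL(&iorun_list, &marker, entry);
    while (TAILQ_FIRST(&iorun_list) != &marker)
        pthread_cond_wait(&iorun_cv, &iorun_mtx);
    TAILQ_REMOVE(&iorun_list, &marker, entry);

    /* Chain: if another waiter's placemarker is now at the head,
     * wake it so stacked waiters drain in FIFO order. */
    next = TAILQ_FIRST(&iorun_list);
    if (next != NULL && next->type == IO_DUMMY)
        pthread_cond_broadcast(&iorun_cv);
    pthread_mutex_unlock(&iorun_mtx);
}

The point of the marker is that a waiter is bounded by the I/Os that
were in flight when it queued the marker; anything issued afterwards
sits behind the marker and cannot delay it, and stacked waiters drain
in FIFO order through the chained wakeup.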

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_recover.c
sys/vfs/hammer/hammer_vfsops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 29c231a..a2e0446 100644
@@ -536,7 +536,8 @@ typedef enum hammer_io_type {
        HAMMER_STRUCTURE_VOLUME,
        HAMMER_STRUCTURE_META_BUFFER,
        HAMMER_STRUCTURE_UNDO_BUFFER,
-       HAMMER_STRUCTURE_DATA_BUFFER
+       HAMMER_STRUCTURE_DATA_BUFFER,
+       HAMMER_STRUCTURE_DUMMY
 } hammer_io_type_t;
 
 union hammer_io_structure;
@@ -556,6 +557,7 @@ struct hammer_io {
        struct hammer_mount     *hmp;
        struct hammer_volume    *volume;
        TAILQ_ENTRY(hammer_io)  mod_entry; /* list entry if modified */
+       TAILQ_ENTRY(hammer_io)  iorun_entry; /* iorun_list */
        hammer_io_list_t        mod_list;
        struct buf              *bp;
        int64_t                 offset;    /* zone-2 offset */
@@ -840,6 +842,7 @@ struct hammer_mount {
        hammer_flush_group_t    next_flush_group;
        TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
        TAILQ_HEAD(, hammer_reclaim) reclaim_list;
+       TAILQ_HEAD(, hammer_io) iorun_list;
 };
 
 typedef struct hammer_mount    *hammer_mount_t;
@@ -1258,7 +1261,7 @@ struct buf *hammer_io_release(struct hammer_io *io, int flush);
 void hammer_io_flush(struct hammer_io *io, int reclaim);
 void hammer_io_wait(struct hammer_io *io);
 void hammer_io_waitdep(struct hammer_io *io);
-void hammer_io_wait_all(hammer_mount_t hmp, const char *ident);
+void hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush);
 int hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
                        hammer_btree_leaf_elm_t leaf);
 int hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record,
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index 0e0194d..6066a5d 100644
@@ -646,8 +646,11 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
        }
 
        /*
-        * Flush UNDOs.  This also waits for I/Os to complete and flushes
-        * the cache on the target disk.
+        * Flush UNDOs.  This can occur concurrently with the data flush
+        * because data writes never overwrite.
+        *
+        * This also waits for I/Os to complete and flushes the cache on
+        * the target disk.
         *
         * Record the UNDO append point as this can continue to change
         * after we have flushed the UNDOs.
@@ -730,7 +733,7 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
         */
        hammer_flusher_clean_loose_ios(hmp);
        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
-               hammer_io_wait_all(hmp, "hmrfl2");
+               hammer_io_wait_all(hmp, "hmrfl3", 1);
 
        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                goto failed;
@@ -855,7 +858,9 @@ hammer_flusher_flush_undos(hammer_mount_t hmp, int mode)
        hammer_flusher_clean_loose_ios(hmp);
        if (mode == HAMMER_FLUSH_UNDOS_FORCED ||
            (mode == HAMMER_FLUSH_UNDOS_AUTO && count)) {
-               hammer_io_wait_all(hmp, "hmrfl1");
+               hammer_io_wait_all(hmp, "hmrfl1", 1);
+       } else {
+               hammer_io_wait_all(hmp, "hmrfl2", 0);
        }
 }
 
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index ad18127..40f6ef7 100644
@@ -119,6 +119,9 @@ hammer_io_disassociate(hammer_io_structure_t iou)
        case HAMMER_STRUCTURE_UNDO_BUFFER:
                iou->buffer.ondisk = NULL;
                break;
+       case HAMMER_STRUCTURE_DUMMY:
+               panic("hammer_io_disassociate: bad io type");
+               break;
        }
 }
 
@@ -145,18 +148,50 @@ hammer_io_wait(hammer_io_t io)
 }
 
 /*
- * Wait for all hammer_io-initated write I/O's to complete.  This is not
- * supposed to count direct I/O's but some can leak through (for
- * non-full-sized direct I/Os).
+ * Wait for all currently queued HAMMER-initiated I/Os to complete.
+ *
+ * This is not supposed to count direct I/O's but some can leak
+ * through (for non-full-sized direct I/Os).
  */
 void
-hammer_io_wait_all(hammer_mount_t hmp, const char *ident)
+hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush)
 {
-       hammer_io_flush_sync(hmp);
+       struct hammer_io iodummy;
+       hammer_io_t io;
+
+       /*
+        * Degenerate case, no I/O is running
+        */
        crit_enter();
-       while (hmp->io_running_space)
-               tsleep(&hmp->io_running_space, 0, ident, 0);
+       if (TAILQ_EMPTY(&hmp->iorun_list)) {
+               crit_exit();
+               if (doflush)
+                       hammer_io_flush_sync(hmp);
+               return;
+       }
+       bzero(&iodummy, sizeof(iodummy));
+       iodummy.type = HAMMER_STRUCTURE_DUMMY;
+
+       /*
+        * Add placemarker and then wait until it becomes the head of
+        * the list.
+        */
+       TAILQ_INSERT_TAIL(&hmp->iorun_list, &iodummy, iorun_entry);
+       while (TAILQ_FIRST(&hmp->iorun_list) != &iodummy) {
+               tsleep(&iodummy, 0, ident, 0);
+       }
+
+       /*
+        * Chain in case several placemarkers are present.
+        */
+       TAILQ_REMOVE(&hmp->iorun_list, &iodummy, iorun_entry);
+       io = TAILQ_FIRST(&hmp->iorun_list);
+       if (io && io->type == HAMMER_STRUCTURE_DUMMY)
+               wakeup(io);
        crit_exit();
+
+       if (doflush)
+               hammer_io_flush_sync(hmp);
 }
 
 /*
@@ -560,6 +595,7 @@ hammer_io_flush(struct hammer_io *io, int reclaim)
         */
        io->running = 1;
        io->hmp->io_running_space += io->bytes;
+       TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry);
        hammer_count_io_running_write += io->bytes;
        bawrite(bp);
        hammer_io_flush_mark(io->volume);
@@ -811,6 +847,9 @@ hammer_io_set_modlist(struct hammer_io *io)
        case HAMMER_STRUCTURE_DATA_BUFFER:
                io->mod_list = &hmp->data_list;
                break;
+       case HAMMER_STRUCTURE_DUMMY:
+               panic("hammer_io_disassociate: bad io type");
+               break;
        }
        TAILQ_INSERT_TAIL(io->mod_list, io, mod_entry);
 }
@@ -840,6 +879,7 @@ static void
 hammer_io_complete(struct buf *bp)
 {
        union hammer_io_structure *iou = (void *)LIST_FIRST(&bp->b_dep);
+       struct hammer_io *ionext;
 
        KKASSERT(iou->io.released == 1);
 
@@ -885,10 +925,18 @@ hammer_io_complete(struct buf *bp)
                hammer_stats_disk_write += iou->io.bytes;
                hammer_count_io_running_write -= iou->io.bytes;
                iou->io.hmp->io_running_space -= iou->io.bytes;
-               if (iou->io.hmp->io_running_space == 0)
-                       wakeup(&iou->io.hmp->io_running_space);
                KKASSERT(iou->io.hmp->io_running_space >= 0);
                iou->io.running = 0;
+
+               /*
+                * Remove from iorun list and wakeup any multi-io waiter(s).
+                */
+               if (TAILQ_FIRST(&iou->io.hmp->iorun_list) == &iou->io) {
+                       ionext = TAILQ_NEXT(&iou->io, iorun_entry);
+                       if (ionext && ionext->type == HAMMER_STRUCTURE_DUMMY)
+                               wakeup(ionext);
+               }
+               TAILQ_REMOVE(&iou->io.hmp->iorun_list, &iou->io, iorun_entry);
        } else {
                hammer_stats_disk_read += iou->io.bytes;
        }
@@ -1038,6 +1086,7 @@ hammer_io_checkwrite(struct buf *bp)
        KKASSERT(io->running == 0);
        io->running = 1;
        io->hmp->io_running_space += io->bytes;
+       TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry);
        hammer_count_io_running_write += io->bytes;
        return(0);
 }
diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c
index f03a45f..cd7d082 100644
@@ -894,7 +894,7 @@ hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
          */
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);
-       hammer_io_wait_all(hmp, "hmrrcw");
+       hammer_io_wait_all(hmp, "hmrrcw", 1);
        RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
                hammer_recover_flush_buffer_callback, &final);
 
@@ -914,13 +914,11 @@ hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
         * Finalize the root volume header.
         */
        if (root_volume && root_volume->io.recovered && final > 0) {
-               crit_enter();
-               while (hmp->io_running_space > 0)
-                       tsleep(&hmp->io_running_space, 0, "hmrflx", 0);
-               crit_exit();
+               hammer_io_wait_all(hmp, "hmrflx", 1);
                root_volume->io.recovered = 0;
                hammer_io_flush(&root_volume->io, 0);
                hammer_rel_volume(root_volume, 0);
+               hammer_io_wait_all(hmp, "hmrfly", 1);
        }
 }
 
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index d5c6ddf..31c0c57 100644
@@ -504,6 +504,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
        TAILQ_INIT(&hmp->data_list);
        TAILQ_INIT(&hmp->meta_list);
        TAILQ_INIT(&hmp->lose_list);
+       TAILQ_INIT(&hmp->iorun_list);
 
        /*
         * Load volumes