HAMMER VFS - Version 4 part 1/many - UNDO FIFO layout work.
author Matthew Dillon <dillon@apollo.backplane.com>
Mon, 2 Nov 2009 00:35:41 +0000 (16:35 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Mon, 2 Nov 2009 00:35:41 +0000 (16:35 -0800)
These changes only apply to HAMMER version 4+ filesystems.  HAMMER
versions less than 4 only implement some of these changes and do not
use the new features during crash recovery.

* Add a sequence number to the UNDO FIFO media record format.  The field
  already existed for just this purpose so no media structures changed
  size.

* Change the alignment boundary for HAMMER UNDO records from 16K to 512
  bytes.  This coupled with the sequence number virtually guarantees that
  the recovery code can detect uninterrupted sequences of UNDO records
  without having to rely on the FIFO last_offset field in the volume
  header.

  This isn't as bad as it sounds.  It just means that large UNDO blocks
  are broken up into smaller on-media structures in order to ensure a
  record header occurs on every 512 byte boundary.

* Add HAMMER_HEAD_TYPE_DUMMY and HAMMER_HEAD_TYPE_REDO (Redo is not yet
  used).  The DUMMY type is a dummy record used solely to identify a
  sequence number.  PAD records cannot have sequence numbers so we need
  a DUMMY record for it.

  Remove unused UNDO FIFO record types.

* Adjust the version upgrade code to completely reinitialize the UNDO FIFO
  space when moving from version < 4 to version >= 4.  This puts all blocks
  in the UNDO FIFO in a deterministic state with deterministic sequence
  numbers on 512 byte boundaries.

* Refactor the flush code.  In versions less than 4 the flush code had to
  flush dirty UNDO buffers, synchronize disk, then flush the volume header
  and synchronize disk again, then flush the meta data.  For HAMMER
  versions >= 4 the flush code removes the second disk synchronization
  operation.

* Refactor the crash recovery code.  For versions < 4 the crash recovery
  code relied on the UNDO FIFO first_offset and next_offset indexes in
  the volume header to calculate the UNDO space that needed to be run.
  For versions >= 4 the crash recovery code uses first_offset for the
  beginning of the UNDO space and proactively scans the UNDO FIFO to
  find the end of the space.  This takes longer but allows HAMMER to
  remove one of the two disk sync operations in the flush code.

* Split the crash recovery code into stage 1 and stage 2.  Stage 2 will
  be used to run REDO operations (REDO is not yet implemented).

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_disk.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_ioctl.c
sys/vfs/hammer/hammer_mount.h
sys/vfs/hammer/hammer_recover.c
sys/vfs/hammer/hammer_undo.c
sys/vfs/hammer/hammer_vfsops.c

index 65ca1f7..35393ec 100644 (file)
@@ -773,6 +773,7 @@ struct hammer_mount {
        hammer_tid_t    flush_tid1;             /* flusher tid sequencing */
        hammer_tid_t    flush_tid2;             /* flusher tid sequencing */
        int64_t copy_stat_freebigblocks;        /* number of free bigblocks */
+       u_int32_t       undo_seqno;             /* UNDO/REDO FIFO seqno */
 
        struct netexport export;
        struct hammer_lock sync_lock;
@@ -1089,8 +1090,9 @@ void *hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
                        struct hammer_buffer **data_bufferp,
                        hammer_off_t hint, int *errorp);
 
-int hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
+int hammer_generate_undo(hammer_transaction_t trans,
                        hammer_off_t zone1_offset, void *base, int len);
+int hammer_upgrade_undo_4(hammer_transaction_t trans);
 
 void hammer_put_volume(struct hammer_volume *volume, int flush);
 void hammer_put_buffer(struct hammer_buffer *buffer, int flush);
@@ -1254,7 +1256,8 @@ void hammer_flusher_finalize(hammer_transaction_t trans, int final);
 int  hammer_flusher_haswork(hammer_mount_t hmp);
 
 
-int hammer_recover(hammer_mount_t hmp, hammer_volume_t rootvol);
+int hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t rootvol);
+int hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t rootvol);
 void hammer_recover_flush_buffers(hammer_mount_t hmp,
                        hammer_volume_t root_volume, int final);
 
index 5a6f3d7..85d2c10 100644 (file)
@@ -340,7 +340,6 @@ typedef struct hammer_blockmap_layer2 *hammer_blockmap_layer2_t;
  * may be reserved.  The size of the undo fifo is usually set a newfs time
  * but can be adjusted if the filesystem is taken offline.
  */
-
 #define HAMMER_UNDO_LAYER2     128     /* max layer2 undo mapping entries */
 
 /*
@@ -365,6 +364,28 @@ typedef struct hammer_blockmap_layer2 *hammer_blockmap_layer2_t;
  * with a single atomic operation.  A larger transactional operation, such
  * as a remove(), may consist of several smaller atomic operations
  * representing raw meta-data operations.
+ *
+ *                             HAMMER VERSION 4 CHANGES
+ *
+ * In HAMMER version 4 the undo structure alignment is reduced from 16384
+ * to 512 bytes in order to ensure that each 512 byte sector begins with
+ * a header.  The reserved01 field in the header is now a 32 bit sequence
+ * number.  This allows the recovery code to detect missing sectors
+ * without relying on the 32-bit crc and to definitively identify the current
+ * undo sequence space without having to rely on information from the volume
+ * header.  In addition, new REDO entries in the undo space are used to
+ * record write, write/extend, and transaction id updates.
+ *
+ * The grand result is:
+ *
+ * (1) The volume header no longer needs to be synchronized for most
+ *     flush and fsync operations.
+ *
+ * (2) Most fsync operations need only lay down REDO records
+ *
+ * (3) Data overwrite for nohistory operations covered by REDO records
+ *     can be supported (instead of rolling a new block allocation),
+ *     by rolling UNDO for the prior contents of the data.
  */
 #define HAMMER_HEAD_ONDISK_SIZE                32
 #define HAMMER_HEAD_ALIGN              8
@@ -373,11 +394,16 @@ typedef struct hammer_blockmap_layer2 *hammer_blockmap_layer2_t;
 #define HAMMER_HEAD_DOALIGN(bytes)     \
        (((bytes) + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK)
 
+#define HAMMER_UNDO_ALIGN              512
+#define HAMMER_UNDO_ALIGN64            ((u_int64_t)512)
+#define HAMMER_UNDO_MASK               (HAMMER_UNDO_ALIGN - 1)
+#define HAMMER_UNDO_MASK64             (HAMMER_UNDO_ALIGN64 - 1)
+
 struct hammer_fifo_head {
        u_int16_t hdr_signature;
        u_int16_t hdr_type;
-       u_int32_t hdr_size;     /* aligned size of the whole mess */
-       u_int32_t reserved01;   /* (0) reserved for future use */
+       u_int32_t hdr_size;     /* Aligned size of the whole mess */
+       u_int32_t hdr_seq;      /* Sequence number */
        hammer_crc_t hdr_crc;   /* XOR crc up to field w/ crc after field */
 };
 
@@ -396,11 +422,11 @@ typedef struct hammer_fifo_tail *hammer_fifo_tail_t;
  * Fifo header types.
  */
 #define HAMMER_HEAD_TYPE_PAD   (0x0040U|HAMMER_HEAD_FLAG_FREE)
-#define HAMMER_HEAD_TYPE_VOL   0x0041U         /* Volume (dummy header) */
-#define HAMMER_HEAD_TYPE_BTREE 0x0042U         /* B-Tree node */
+#define HAMMER_HEAD_TYPE_DUMMY 0x0041U         /* dummy entry w/seqno */
+#define HAMMER_HEAD_TYPE_42    0x0042U
 #define HAMMER_HEAD_TYPE_UNDO  0x0043U         /* random UNDO information */
-#define HAMMER_HEAD_TYPE_DELETE        0x0044U         /* record deletion */
-#define HAMMER_HEAD_TYPE_RECORD        0x0045U         /* Filesystem record */
+#define HAMMER_HEAD_TYPE_REDO  0x0044U         /* data REDO / fast fsync */
+#define HAMMER_HEAD_TYPE_45    0x0045U
 
 #define HAMMER_HEAD_FLAG_FREE  0x8000U         /* Indicates object freed */
 
@@ -413,6 +439,8 @@ typedef struct hammer_fifo_tail *hammer_fifo_tail_t;
 
 /*
  * Misc FIFO structures.
+ *
+ * NOTE: redo records are for version 4+ filesystems.
  */
 struct hammer_fifo_undo {
        struct hammer_fifo_head head;
@@ -422,12 +450,24 @@ struct hammer_fifo_undo {
        /* followed by data */
 };
 
-typedef struct hammer_fifo_undo *hammer_fifo_undo_t;
+struct hammer_fifo_redo {
+       struct hammer_fifo_head head;
+       int64_t                 redo_objid;     /* file being written */
+       hammer_off_t            redo_offset;    /* logical offset in file */
+       int32_t                 redo_data_bytes;
+       int32_t                 redo_reserved01;
+};
 
-struct hammer_fifo_buf_commit {
-       hammer_off_t            undo_offset;
+union hammer_fifo_any {
+       struct hammer_fifo_head head;
+       struct hammer_fifo_undo undo;
+       struct hammer_fifo_redo redo;
 };
 
+typedef struct hammer_fifo_redo *hammer_fifo_redo_t;
+typedef struct hammer_fifo_undo *hammer_fifo_undo_t;
+typedef union hammer_fifo_any *hammer_fifo_any_t;
+
 /*
  * Volume header types
  */
@@ -543,13 +583,14 @@ typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t;
         sizeof(hammer_crc_t))
 
 #define HAMMER_VOL_VERSION_MIN         1       /* minimum supported version */
-#define HAMMER_VOL_VERSION_DEFAULT     1       /* newfs default version */
-#define HAMMER_VOL_VERSION_WIP         3       /* version >= this is WIP */
-#define HAMMER_VOL_VERSION_MAX         3       /* maximum supported version */
+#define HAMMER_VOL_VERSION_DEFAULT     3       /* newfs default version */
+#define HAMMER_VOL_VERSION_WIP         4       /* version >= this is WIP */
+#define HAMMER_VOL_VERSION_MAX         4       /* maximum supported version */
 
 #define HAMMER_VOL_VERSION_ONE         1
 #define HAMMER_VOL_VERSION_TWO         2       /* new dirent layout (2.3+) */
 #define HAMMER_VOL_VERSION_THREE       3       /* new snapshot layout (2.5+) */
+#define HAMMER_VOL_VERSION_FOUR                4       /* new undo/redo/flush (2.5+) */
 
 /*
  * Record types are fairly straightforward.  The B-Tree includes the record
index bdc9a6e..a728015 100644 (file)
@@ -648,7 +648,7 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
        }
 
        /*
-        * Wait for I/Os to complete
+        * Wait for I/Os to complete and flush the cache on the target disk.
         */
        hammer_flusher_clean_loose_ios(hmp);
        hammer_io_wait_all(hmp, "hmrfl1");
@@ -657,9 +657,16 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
                goto failed;
 
        /*
-        * Update the on-disk volume header with new UNDO FIFO end position
-        * (do not generate new UNDO records for this change).  We have to
-        * do this for the UNDO FIFO whether (final) is set or not.
+        * HAMMER VERSION < 4:
+        *      Update the on-disk volume header with new UNDO FIFO end
+        *      position (do not generate new UNDO records for this change).
+        *      We have to do this for the UNDO FIFO whether (final) is
+        *      set or not in order for the UNDOs to be recognized on
+        *      recovery.
+        *
+        * HAMMER VERSION >= 4:
+        *      The UNDO FIFO data written above will be recognized on
+        *      recovery without us having to sync the volume header.
         *
         * Also update the on-disk next_tid field.  This does not require
         * an UNDO.  However, because our TID is generated before we get
@@ -701,21 +708,30 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
        }
 
        /*
-        * Wait for I/Os to complete
+        * Wait for I/Os to complete.
+        *
+        * For HAMMER VERSION 4+ filesystems we do not have to wait for
+        * the I/O to complete as the new UNDO FIFO entries are recognized
+        * even without the volume header update.  This allows the volume
+        * header to be flushed along with meta-data, significantly reducing
+        * flush overheads.
         */
        hammer_flusher_clean_loose_ios(hmp);
-       hammer_io_wait_all(hmp, "hmrfl2");
+       if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+               hammer_io_wait_all(hmp, "hmrfl2");
 
        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                goto failed;
 
        /*
         * Flush meta-data.  The meta-data will be undone if we crash
-        * so we can safely flush it asynchronously.
+        * so we can safely flush it asynchronously.  There is no need
+        * to wait for I/O to complete (or issue a synchronous disk flush).
         *
-        * Repeated catchups will wind up flushing this update's meta-data
-        * and the UNDO buffers for the next update simultaniously.  This
-        * is ok.
+        * In fact, even if we did wait the meta-data will still be undone
+        * by a crash up until the next flush cycle due to the first_offset
+        * in the volume header for the UNDO FIFO not being adjusted until
+        * the following flush cycle.
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->meta_list)) != NULL) {
index 537b9ec..cc39000 100644 (file)
@@ -647,7 +647,7 @@ hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
        if (len) {
                intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
                KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
-               hammer_generate_undo(trans, &volume->io,
+               hammer_generate_undo(trans,
                         HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
                         base, len);
        }
@@ -669,7 +669,7 @@ hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
        if (len) {
                intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
                KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
-               hammer_generate_undo(trans, &buffer->io,
+               hammer_generate_undo(trans,
                                     buffer->zone2_offset + rel_offset,
                                     base, len);
        }
index 5ed70be..9f6025e 100644 (file)
@@ -501,6 +501,10 @@ hammer_ioc_get_version(hammer_transaction_t trans, hammer_inode_t ip,
                ksnprintf(ver->description, sizeof(ver->description),
                         "New snapshot management (DragonFly 2.5+)");
                break;
+       case 4:
+               ksnprintf(ver->description, sizeof(ver->description),
+                        "New REDO, faster flush/sync (DragonFly 2.5+)");
+               break;
        default:
                ksnprintf(ver->description, sizeof(ver->description),
                         "Unknown");
@@ -518,17 +522,25 @@ int
 hammer_ioc_set_version(hammer_transaction_t trans, hammer_inode_t ip,
                   struct hammer_ioc_version *ver)
 {
+       hammer_mount_t hmp = trans->hmp;
        struct hammer_cursor cursor;
        hammer_volume_t volume;
        int error;
+       int over = hmp->version;
 
-       if (ver->cur_version < trans->hmp->version)
-               return(EINVAL);
-       if (ver->cur_version == trans->hmp->version)
+       /*
+        * Generally do not allow downgrades.  However, version 4 can
+        * be downgraded to version 3.
+        */
+       if (ver->cur_version < hmp->version) {
+               if (!(ver->cur_version == 3 && hmp->version == 4))
+                       return(EINVAL);
+       }
+       if (ver->cur_version == hmp->version)
                return(0);
        if (ver->cur_version > HAMMER_VOL_VERSION_MAX)
                return(EINVAL);
-       if (trans->hmp->ronly)
+       if (hmp->ronly)
                return(EROFS);
 
        /*
@@ -538,17 +550,34 @@ hammer_ioc_set_version(hammer_transaction_t trans, hammer_inode_t ip,
        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
        if (error)
                goto failed;
-       hammer_sync_lock_sh(trans);
+       hammer_lock_ex(&hmp->flusher.finalize_lock);
+       hammer_sync_lock_ex(trans);
+       hmp->version = ver->cur_version;
 
-       volume = hammer_get_root_volume(cursor.trans->hmp, &error);
+       /*
+        * If upgrading from version < 4 to version >= 4 the UNDO FIFO
+        * must be reinitialized.
+        */
+       if (over < HAMMER_VOL_VERSION_FOUR &&
+           ver->cur_version >= HAMMER_VOL_VERSION_FOUR) {
+               kprintf("upgrade undo to version 4\n");
+               error = hammer_upgrade_undo_4(trans);
+               if (error)
+                       goto failed;
+       }
+
+       /*
+        * Adjust the version in the volume header
+        */
+       volume = hammer_get_root_volume(hmp, &error);
        KKASSERT(error == 0);
        hammer_modify_volume_field(cursor.trans, volume, vol_version);
        volume->ondisk->vol_version = ver->cur_version;
-       cursor.trans->hmp->version = ver->cur_version;
        hammer_modify_volume_done(volume);
        hammer_rel_volume(volume, 0);
 
        hammer_sync_unlock(trans);
+       hammer_unlock(&hmp->flusher.finalize_lock);
 failed:
        ver->head.error = error;
        hammer_done_cursor(&cursor);
index 1d8ca8d..db8e3da 100644 (file)
@@ -59,6 +59,8 @@ struct hammer_mount_info {
 #define HMNT_MASTERID  0x00000002      /* master_id field set */
 #define HMNT_EXPORTREQ 0x00000004
 #define HMNT_UNDO_DIRTY        0x00000008
+#define HMNT_STAGE2    0x00000010      /* ran stage-2 recovery */
+#define HMNT_HASREDO   0x00000020      /* stage-2 must scan for REDO */
 
 #define HMNT_USERFLAGS (HMNT_NOHISTORY | HMNT_MASTERID)
 
index 8cbe323..cebc6e0 100644 (file)
 
 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
                        hammer_off_t end_off);
+static int hammer_check_head_signature(hammer_fifo_head_t head,
+                       hammer_off_t beg_off);
 static void hammer_recover_copy_undo(hammer_off_t undo_offset,
                        char *src, char *dst, int bytes);
+static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
+                       hammer_volume_t root_volume,
+                       hammer_off_t *scan_offsetp,
+                       int *errorp, struct hammer_buffer **bufferp);
+static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
+                       hammer_volume_t root_volume,
+                       hammer_off_t *scan_offsetp,
+                       int *errorp, struct hammer_buffer **bufferp);
 #if 0
 static void hammer_recover_debug_dump(int w, char *buf, int bytes);
 #endif
 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
-                       hammer_fifo_undo_t undo, int bytes);
+                       hammer_fifo_undo_t undo);
 
 /*
- * Recover a filesystem on mount
+ * Recover filesystem meta-data on mount.  This procedure figures out the
+ * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
+ * resynchronized by this procedure.
+ *
+ * This procedure is run near the beginning of the mount sequence, before
+ * any B-Tree or high-level accesses are enabled, and is responsible for
+ * restoring the meta-data to a consistent state.  High level HAMMER data
+ * structures (such as the B-Tree) cannot be accessed here.
  *
  * NOTE: No information from the root volume has been cached in the
- * hammer_mount structure yet, so we need to access the root volume's
- * buffer directly.
+ *      hammer_mount structure yet, so we need to access the root volume's
+ *      buffer directly.
+ *
+ * NOTE:
  */
 int
-hammer_recover(hammer_mount_t hmp, hammer_volume_t root_volume)
+hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
 {
        hammer_blockmap_t rootmap;
        hammer_buffer_t buffer;
        hammer_off_t scan_offset;
+       hammer_off_t scan_offset_save;
        hammer_off_t bytes;
-       hammer_fifo_tail_t tail;
-       hammer_fifo_undo_t undo;
+       hammer_fifo_any_t head;
        hammer_off_t first_offset;
        hammer_off_t last_offset;
+       u_int32_t seqno;
        int error;
 
        /*
-        * Examine the UNDO FIFO.  If it is empty the filesystem is clean
-        * and no action need be taken.
+        * Examine the UNDO FIFO indices in the volume header.
         */
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
-
-       if (rootmap->first_offset == rootmap->next_offset)
-               return(0);
-
        first_offset = rootmap->first_offset;
        last_offset  = rootmap->next_offset;
+       buffer = NULL;
+       error = 0;
+
+       if (first_offset > rootmap->alloc_offset ||
+           last_offset > rootmap->alloc_offset) {
+               kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
+                       "%016jx, %016jx limit %016jx\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)first_offset,
+                       (intmax_t)last_offset,
+                       (intmax_t)rootmap->alloc_offset);
+               error = EIO;
+               goto done;
+       }
+
+       /*
+        * In HAMMER version 4+ filesystems the volume header does NOT
+        * contain definitive UNDO FIFO state.  In particular, the
+        * rootmap->next_offset may not be indexed completely to the
+        * end of the active UNDO FIFO.
+        */
+       if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
+               /*
+                * To find the definitive range we must first scan backwards
+                * from first_offset to locate the first real record and
+                * extract the sequence number from it.  This record is not
+                * part of the active undo space.
+                */
+               scan_offset = first_offset;
+               seqno = 0;
+
+               for (;;) {
+                       head = hammer_recover_scan_rev(hmp, root_volume,
+                                                      &scan_offset,
+                                                      &error, &buffer);
+                       if (error)
+                               break;
+                       if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
+                               seqno = head->head.hdr_seq;
+                               break;
+                       }
+               }
+               if (error) {
+                       kprintf("HAMMER(%s) meta-data recovery failure "
+                               "during seqno backscan\n",
+                               root_volume->ondisk->vol_name);
+                       goto done;
+               }
+
+               /*
+                * Scan forwards from first_offset and (seqno+1) looking
+                * for a sequence space discontinuity.  This denotes the
+                * end of the active FIFO area.
+                *
+                * NOTE: For the case where the FIFO is empty the very first
+                *       record we find will be discontinuous.
+                *
+                * NOTE: Do not include trailing PADs in the scan range,
+                *       and remember the returned scan_offset after a
+                *       fwd iteration points to the end of the returned
+                *       record.
+                */
+               kprintf("HAMMER(%s) meta-data recovery check seqno=%08x\n",
+                       root_volume->ondisk->vol_name,
+                       seqno);
+
+               scan_offset = first_offset;
+               scan_offset_save = scan_offset;
+               ++seqno;
+               for (;;) {
+                       head = hammer_recover_scan_fwd(hmp, root_volume,
+                                                      &scan_offset,
+                                                      &error, &buffer);
+                       if (error)
+                               break;
+                       if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
+                               if (seqno != head->head.hdr_seq) {
+                                       scan_offset = scan_offset_save;
+                                       break;
+                               }
+                               scan_offset_save = scan_offset;
+                               ++seqno;
+                       }
+
+#if 0
+                       /*
+                        * If the forward scan is grossly ahead of last_offset
+                        * then something is wrong.  last_offset is supposed
+                        * to be flushed out
+                        */
+                       if (last_offset >= scan_offset) {
+                               bytes = last_offset - scan_offset;
+                       } else {
+                               bytes = rootmap->alloc_offset - scan_offset +
+                                       (last_offset & HAMMER_OFF_LONG_MASK);
+                       }
+                       if (bytes >
+                           (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
+                           4 / 5) {
+                               kprintf("HAMMER(%s) meta-data forward scan is "
+                                       "grossly beyond the last_offset in "
+                                       "the volume header, this can't be "
+                                       "right.\n",
+                                       root_volume->ondisk->vol_name);
+                               error = EIO;
+                               break;
+                       }
+#endif
+               }
 
+               /*
+                * Store the seqno.  This will be the next seqno we lay down
+                * when generating new UNDOs.
+                */
+               hmp->undo_seqno = seqno;
+               if (error) {
+                       kprintf("HAMMER(%s) meta-data recovery failure "
+                               "during seqno fwdscan\n",
+                               root_volume->ondisk->vol_name);
+                       goto done;
+               }
+               last_offset = scan_offset;
+               kprintf("HAMMER(%s) meta-data recovery range %016jx-%016jx "
+                       "(invol %016jx) endseqno=%08x\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)first_offset,
+                       (intmax_t)last_offset,
+                       (intmax_t)rootmap->next_offset,
+                       seqno);
+       }
+
+       /*
+        * Calculate the size of the active portion of the FIFO.  If the
+        * FIFO is empty the filesystem is clean and no further action is
+        * needed.
+        */
        if (last_offset >= first_offset) {
                bytes = last_offset - first_offset;
        } else {
                bytes = rootmap->alloc_offset - first_offset +
                        (last_offset & HAMMER_OFF_LONG_MASK);
        }
-       kprintf("HAMMER(%s) Start Recovery %016llx - %016llx "
-               "(%lld bytes of UNDO)%s\n",
+       if (bytes == 0) {
+               error = 0;
+               goto done;
+       }
+
+       kprintf("HAMMER(%s) Start meta-data recovery %016jx - %016jx "
+               "(%jd bytes of UNDO)%s\n",
                root_volume->ondisk->vol_name,
-               (long long)first_offset,
-               (long long)last_offset,
-               (long long)bytes,
+               (intmax_t)first_offset,
+               (intmax_t)last_offset,
+               (intmax_t)bytes,
                (hmp->ronly ? " (RO)" : "(RW)"));
        if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
                kprintf("Undo size is absurd, unable to mount\n");
-               return(EIO);
+               error = EIO;
+               goto done;
        }
 
        /*
         * Scan the UNDOs backwards.
         */
        scan_offset = last_offset;
-       buffer = NULL;
-       if (scan_offset > rootmap->alloc_offset) {
-               kprintf("HAMMER(%s) UNDO record at %016llx FIFO overflow\n",
-                       root_volume->ondisk->vol_name,
-                       (long long)scan_offset);
-               error = EIO;
-               goto done;
-       }
 
        while ((int64_t)bytes > 0) {
-               if (hammer_debug_general & 0x0080)
-                       kprintf("scan_offset %016llx\n",
-                               (long long)scan_offset);
-               if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
-                       scan_offset = rootmap->alloc_offset;
-                       continue;
-               }
-               if (scan_offset - sizeof(*tail) <
-                   HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
-                       kprintf("HAMMER(%s) UNDO record at %016llx FIFO "
-                               "underflow\n",
-                               root_volume->ondisk->vol_name,
-                               (long long)scan_offset);
-                       error = EIO;
-                       break;
-               }
-               tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
-                                   &error, &buffer);
-               if (error) {
-                       kprintf("HAMMER(%s) Unable to read UNDO TAIL "
-                               "at %016llx\n",
-                               root_volume->ondisk->vol_name,
-                               (long long)scan_offset - sizeof(*tail));
-                       break;
-               }
-
-               if (hammer_check_tail_signature(tail, scan_offset) != 0) {
-                       kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
-                               "at %016llx\n",
-                               root_volume->ondisk->vol_name,
-                               (long long)scan_offset - sizeof(*tail));
-                       error = EIO;
+               KKASSERT(scan_offset != first_offset);
+               head = hammer_recover_scan_rev(hmp, root_volume,
+                                              &scan_offset, &error, &buffer);
+               if (error)
                        break;
-               }
-               undo = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
-
-               error = hammer_recover_undo(hmp, root_volume, undo,
-                               HAMMER_BUFSIZE -
-                               (int)((char *)undo - (char *)buffer->ondisk));
+               error = hammer_recover_undo(hmp, root_volume, &head->undo);
                if (error) {
-                       kprintf("HAMMER(%s) UNDO record at %016llx failed\n",
+                       kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
                                root_volume->ondisk->vol_name,
-                               (long long)scan_offset - tail->tail_size);
+                               (intmax_t)scan_offset - head->head.hdr_size);
                        break;
                }
-               scan_offset -= tail->tail_size;
-               bytes -= tail->tail_size;
+               bytes -= head->head.hdr_size;
 
                /*
                 * If too many dirty buffers have built up we have to flush'm
@@ -185,12 +299,14 @@ hammer_recover(hammer_mount_t hmp, hammer_volume_t root_volume)
                }
        }
 done:
-       if (buffer)
+       if (buffer) {
                hammer_rel_buffer(buffer, 0);
+               buffer = NULL;
+       }
 
        /*
         * After completely flushing all the recovered buffers the volume
-        * header will also be flushed.  Force the UNDO FIFO to 0-length.
+        * header will also be flushed.
         */
        if (root_volume->io.recovered == 0) {
                hammer_ref_volume(root_volume);
@@ -198,12 +314,14 @@ done:
        }
 
        /*
-        * Finish up flushing (or discarding) recovered buffers
+        * Finish up flushing (or discarding) recovered buffers.  FIFO
+        * indices in the volume header are updated to the actual undo
+        * range but will not be collapsed until stage 2.
         */
        if (error == 0) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
-               rootmap->first_offset = last_offset;
+               rootmap->first_offset = first_offset;
                rootmap->next_offset = last_offset;
                hammer_modify_volume_done(root_volume);
                if (hmp->ronly == 0)
@@ -211,131 +329,430 @@ done:
        } else {
                hammer_recover_flush_buffers(hmp, root_volume, -1);
        }
-       kprintf("HAMMER(%s) End Recovery\n", root_volume->ondisk->vol_name);
+       kprintf("HAMMER(%s) End meta-data recovery\n",
+               root_volume->ondisk->vol_name);
        return (error);
 }
 
-static int
-hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
+/*
+ * Execute redo operations (stage 2 of crash recovery).
+ *
+ * This procedure is run at the end of the mount sequence, after the hammer
+ * mount structure has been completely initialized but before the filesystem
+ * goes live.  It can access standard cursors, the B-Tree, flush the
+ * filesystem, and so forth.
+ *
+ * It may only be called for read-write mounts or when a mount switches
+ * from read-only to read-write, and it runs at most once per hmp.
+ *
+ * The stage1 code will have already stored the correct FIFO range in the
+ * rootmap.  Returns 0 on success or an errno (EIO for an absurd range).
+ */
+int
+hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
 {
-       int max_bytes;
+       hammer_blockmap_t rootmap;
+       hammer_buffer_t buffer;
+       hammer_off_t scan_offset;
+       hammer_off_t bytes;
+       hammer_fifo_any_t head;
+       hammer_off_t first_offset;
+       hammer_off_t last_offset;
+       int error;
+
+       /*
+        * Stage 2 can only be run on a RW mount, or when the mount is
+        * switched from RO to RW.  It must be run only once.
+        */
+       KKASSERT(hmp->ronly == 0);
 
-       max_bytes = ((end_off - sizeof(*tail)) & HAMMER_BUFMASK);
-       max_bytes += sizeof(*tail);
+       if (hmp->hflags & HMNT_STAGE2)
+               return(0);
+       hmp->hflags |= HMNT_STAGE2;
 
        /*
-        * tail overlaps buffer boundary
+        * Examine the UNDO FIFO.  If it is empty the filesystem is clean
+        * and no action need be taken.
         */
-       if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64) {
-               return(1);
+       rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
+       first_offset = rootmap->first_offset;
+       last_offset  = rootmap->next_offset;
+       if (first_offset == last_offset)
+               return(0);
+
+       if (last_offset >= first_offset) {
+               bytes = last_offset - first_offset;
+       } else {
+               bytes = rootmap->alloc_offset - first_offset +
+                       (last_offset & HAMMER_OFF_LONG_MASK);
+       }
+       kprintf("HAMMER(%s) Start redo recovery %016jx - %016jx "
+               "(%jd bytes of UNDO)%s\n",
+               root_volume->ondisk->vol_name,
+               (intmax_t)first_offset,
+               (intmax_t)last_offset,
+               (intmax_t)bytes,
+               (hmp->ronly ? " (RO)" : " (RW)"));
+       if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
+               kprintf("Undo size is absurd, unable to mount\n");
+               return(EIO);
        }
 
        /*
-        * signature check, the tail signature is allowed to be the head
-        * signature only for 8-byte PADs.
+        * Scan the REDOs forwards.
         */
-       switch(tail->tail_signature) {
-       case HAMMER_TAIL_SIGNATURE:
-               break;
-       case HAMMER_HEAD_SIGNATURE:
-               if (tail->tail_type != HAMMER_HEAD_TYPE_PAD ||
-                   tail->tail_size != sizeof(*tail)) {
-                       return(2);
+       scan_offset = first_offset;
+       buffer = NULL;
+
+       while (bytes) {
+               KKASSERT(scan_offset != last_offset);
+
+               head = hammer_recover_scan_fwd(hmp, root_volume,
+                                              &scan_offset, &error, &buffer);
+               if (error)
+                       break;
+
+#if 0
+               error = hammer_recover_redo(hmp, root_volume, &head->redo);
+#endif
+               if (error) {
+                       kprintf("HAMMER(%s) REDO record at %016jx failed\n",
+                               root_volume->ondisk->vol_name,
+                               (intmax_t)scan_offset - head->head.hdr_size);
+                       break;
                }
-               break;
+               bytes -= head->head.hdr_size;
+       }
+       if (buffer) {
+               hammer_rel_buffer(buffer, 0);
+               buffer = NULL;
        }
 
        /*
-        * The undo structure must not overlap a buffer boundary.
+        * Finish up flushing (or discarding) recovered buffers by executing
+        * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
+        * case tests and forces the flush in order to update the FIFO indices.
+        *
+        * If a crash occurs during the flush the entire undo/redo will be
+        * re-run during recovery on the next mount.
         */
-       if (tail->tail_size < sizeof(*tail) || tail->tail_size > max_bytes) {
-               return(3);
+       if (error == 0) {
+               if (rootmap->first_offset != rootmap->next_offset)
+                       hmp->hflags |= HMNT_UNDO_DIRTY;
+               hammer_flusher_sync(hmp);
        }
-       return(0);
+       kprintf("HAMMER(%s) End redo recovery\n",
+               root_volume->ondisk->vol_name);
+       return (error);
 }
 
-static int
-hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
-                   hammer_fifo_undo_t undo, int bytes)
+/*
+ * Scan backwards from *scan_offsetp, return the FIFO record prior to the
+ * record at *scan_offsetp, or NULL with *errorp set if an error occurred.
+ *
+ * On success *scan_offsetp will be the offset of the returned record.
+ */
+hammer_fifo_any_t
+hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
+                       hammer_off_t *scan_offsetp,
+                       int *errorp, struct hammer_buffer **bufferp)
 {
+       hammer_off_t scan_offset;
+       hammer_blockmap_t rootmap;
+       hammer_fifo_any_t head;
        hammer_fifo_tail_t tail;
-       hammer_volume_t volume;
-       hammer_buffer_t buffer;
-       hammer_off_t buf_offset;
-       int zone;
-       int error;
-       int vol_no;
-       int max_bytes;
-       u_int32_t offset;
-       u_int32_t crc;
 
-       /*
-        * Basic sanity checks
-        */
-       if (bytes < HAMMER_HEAD_ALIGN) {
-               kprintf("HAMMER: Undo alignment error (%d)\n", bytes);
-               return(EIO);
+       rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
+       scan_offset = *scan_offsetp;
+
+       if (hammer_debug_general & 0x0080)
+               kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
+       if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
+               scan_offset = rootmap->alloc_offset;
+       if (scan_offset - sizeof(*tail) <
+           HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
+               kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)scan_offset);
+               *errorp = EIO;
+               return (NULL);
        }
-       if (undo->head.hdr_signature != HAMMER_HEAD_SIGNATURE) {
-               kprintf("HAMMER: Bad head signature %04x\n", 
-                       undo->head.hdr_signature);
-               return(EIO);
+       tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
+                           errorp, bufferp);
+       if (*errorp) {
+               kprintf("HAMMER(%s) Unable to read UNDO TAIL "
+                       "at %016jx\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)scan_offset - sizeof(*tail));
+               return (NULL);
        }
-       if (undo->head.hdr_size < HAMMER_HEAD_ALIGN ||
-           undo->head.hdr_size > bytes) {
-               kprintf("HAMMER: Bad size %d\n", bytes);
-               return(EIO);
+
+       if (hammer_check_tail_signature(tail, scan_offset) != 0) {
+               kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
+                       "at %016jx\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)scan_offset - sizeof(*tail));
+               *errorp = EIO;
+               return (NULL);
        }
+       head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
+       *scan_offsetp = scan_offset - head->head.hdr_size;
+
+       return (head);
+}
+
+/*
+ * Scan forwards from *scan_offsetp, return the FIFO record at that offset
+ * or NULL with *errorp set if an error occurred.
+ *
+ * On success *scan_offsetp will be the offset of the record following
+ * the returned record, wrapped to the start of the UNDO zone as needed.
+ */
+hammer_fifo_any_t
+hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
+                       hammer_off_t *scan_offsetp,
+                       int *errorp, struct hammer_buffer **bufferp)
+{
+       hammer_off_t scan_offset;
+       hammer_blockmap_t rootmap;
+       hammer_fifo_any_t head;
+
+       rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
+       scan_offset = *scan_offsetp;
+
+       if (hammer_debug_general & 0x0080)
+               kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
+       if (scan_offset == rootmap->alloc_offset)
+               scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
+
+       head = hammer_bread(hmp, scan_offset, errorp, bufferp);
+       if (*errorp) {
+               kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)scan_offset);
+               return (NULL);
+       }
+
+       if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
+               kprintf("HAMMER(%s) Illegal UNDO HEAD signature "
+                       "at %016jx\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)scan_offset);
+               *errorp = EIO;
+               return (NULL);
+       }
+       scan_offset += head->head.hdr_size;
+       if (scan_offset == rootmap->alloc_offset)
+               scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
+       *scan_offsetp = scan_offset;
+
+       return (head);
+}
+
+/*
+ * Helper function for hammer_check_{head,tail}_signature().  Validates the
+ * entire FIFO record wrapper once the head and tail have been located:
+ * signatures, size/alignment, head/tail agreement and (non-PAD) the CRC.
+ *
+ * Returns 0 if the record is valid, non-zero otherwise.
+ */
+static __inline
+int
+_hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
+                       hammer_off_t beg_off)
+{
+       hammer_off_t end_off;
+       u_int32_t crc;
+       int bytes;
 
        /*
-        * Skip PAD records.  Note that PAD records also do not require
-        * a tail and may have a truncated structure.
+        * Check signatures.  The tail signature is allowed to be the
+        * head signature only for 8-byte PADs.
         */
-       if (undo->head.hdr_type == HAMMER_HEAD_TYPE_PAD)
-               return(0);
+       if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
+               kprintf("HAMMER: FIFO record bad head signature "
+                       "%04x at %016jx\n",
+                       head->hdr_signature,
+                       (intmax_t)beg_off);
+               return(2);
+       }
+       if (head->hdr_size < HAMMER_HEAD_ALIGN ||
+           (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
+               kprintf("HAMMER: FIFO record unaligned or bad size "
+                       "%04x at %016jx\n",
+                       head->hdr_size,
+                       (intmax_t)beg_off);
+               return(2);
+       }
+       end_off = beg_off + head->hdr_size;
+
+       if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
+           (size_t)(end_off - beg_off) != sizeof(*tail)) {
+               if (head->hdr_type != tail->tail_type) {
+                       kprintf("HAMMER: FIFO record head/tail type mismatch "
+                               "%04x %04x at %016jx\n",
+                               head->hdr_type, tail->tail_type,
+                               (intmax_t)beg_off);
+                       return(2);
+               }
+               if (head->hdr_size != tail->tail_size) {
+                       kprintf("HAMMER: FIFO record head/tail size mismatch "
+                               "%04x %04x at %016jx\n",
+                               head->hdr_size, tail->tail_size,
+                               (intmax_t)beg_off);
+                       return(2);
+               }
+               if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
+                       kprintf("HAMMER: FIFO record bad tail signature "
+                               "%04x at %016jx\n",
+                               tail->tail_signature,
+                               (intmax_t)beg_off);
+                       return(3);
+               }
+       }
 
        /*
-        * Check the CRC
+        * Non-PAD records must fit a head and tail and carry a valid CRC.
+        * The size is checked first so the CRC length cannot underflow.
         */
-       crc = crc32(undo, HAMMER_FIFO_HEAD_CRCOFF) ^
-             crc32(&undo->head + 1, undo->head.hdr_size - sizeof(undo->head));
-       if (undo->head.hdr_crc != crc) {
-               kprintf("HAMMER: Undo record CRC failed %08x %08x\n",
-                       undo->head.hdr_crc, crc);
-               return(EIO);
+       if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
+               if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
+                       kprintf("HAMMER: FIFO record too small "
+                               "%04x at %016jx\n",
+                               head->hdr_size,
+                               (intmax_t)beg_off);
+                       return(EIO);
+               }
+               crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
+                     crc32(head + 1, head->hdr_size - sizeof(*head));
+               if (head->hdr_crc != crc) {
+                       kprintf("HAMMER: FIFO record CRC failed %08x %08x "
+                               "at %016jx\n",
+                               head->hdr_crc, crc,
+                               (intmax_t)beg_off);
+                       return(EIO);
+               }
        }
 
-
        /*
         * Check the tail
         */
-       bytes = undo->head.hdr_size;
-       tail = (void *)((char *)undo + bytes - sizeof(*tail));
-       if (tail->tail_size != undo->head.hdr_size) {
-               kprintf("HAMMER: Bad tail size %d\n", tail->tail_size);
+       bytes = head->hdr_size;
+       tail = (void *)((char *)head + bytes - sizeof(*tail));
+       if (tail->tail_size != head->hdr_size) {
+               kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
+                       tail->tail_size, head->hdr_size,
+                       (intmax_t)beg_off);
                return(EIO);
        }
-       if (tail->tail_type != undo->head.hdr_type) {
-               kprintf("HAMMER: Bad tail type %d\n", tail->tail_type);
+       if (tail->tail_type != head->hdr_type) {
+               kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
+                       tail->tail_type, head->hdr_type,
+                       (intmax_t)beg_off);
                return(EIO);
        }
 
+       return(0);
+}
+
+/*
+ * Check that the FIFO record is in-bounds given the head and the
+ * hammer offset of the start of the record.
+ *
+ * Returns 1 if the record crosses a buffer boundary, otherwise the result
+ * of _hammer_check_signature() on the located head/tail (0 if valid).
+ */
+static int
+hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
+{
+       hammer_fifo_tail_t tail;
+       hammer_off_t end_off;
+
+       /*
+        * head overlaps buffer boundary.  This could be a PAD so only
+        * check the minimum PAD size here.
+        */
+       if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
+               return(1);
+
+       /*
+        * Calculate the ending offset and make sure the record does
+        * not cross a buffer boundary.
+        */
+       end_off = beg_off + head->hdr_size;
+       if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
+               return(1);
+       tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
+       return (_hammer_check_signature(head, tail, beg_off));
+}
+
+/*
+ * Check that the FIFO record is in-bounds given the tail and the
+ * hammer offset.  The offset is pointing at the ending boundary of the
+ * record.
+ *
+ * Returns 1 if the record crosses a buffer boundary, otherwise the result
+ * of _hammer_check_signature() on the located head/tail (0 if valid).
+ */
+static int
+hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
+{
+       hammer_fifo_head_t head;
+       hammer_off_t beg_off;
+
        /*
-        * Only process UNDO records
+        * tail overlaps buffer boundary
+        */
+       if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
+               return(1);
+
+       /*
+        * Calculate the beginning offset and make sure the record does
+        * not cross a buffer boundary.
         */
-       if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
+       beg_off = end_off - tail->tail_size;
+       if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
+               return(1);
+       head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
+       return (_hammer_check_signature(head, tail, beg_off));
+}
+
+static int
+hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
+                   hammer_fifo_undo_t undo)
+{
+       hammer_volume_t volume;
+       hammer_buffer_t buffer;
+       hammer_off_t buf_offset;
+       int zone;
+       int error;
+       int vol_no;
+       int bytes;
+       u_int32_t offset;
+
+       /*
+        * Only process UNDO records.  Flag if we find other records to
+        * optimize stage2 recovery.
+        */
+       if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO) {
+               if (undo->head.hdr_type == HAMMER_HEAD_TYPE_REDO)
+                       hmp->hflags |= HMNT_HASREDO;
                return(0);
+       }
 
        /*
         * Validate the UNDO record.
         */
-       max_bytes = undo->head.hdr_size - sizeof(*undo) - sizeof(*tail);
-       if (undo->undo_data_bytes < 0 || undo->undo_data_bytes > max_bytes) {
+       bytes = undo->head.hdr_size - sizeof(*undo) -
+               sizeof(struct hammer_fifo_tail);
+       if (bytes < 0 || undo->undo_data_bytes < 0 ||
+           undo->undo_data_bytes > bytes) {
                kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
-                       undo->undo_data_bytes, max_bytes);
+                       undo->undo_data_bytes, bytes);
                return(EIO);
        }
 
+       bytes = undo->undo_data_bytes;
+
        /*
         * The undo offset may only be a zone-1 or zone-2 offset.
         *
@@ -345,7 +762,7 @@ hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
        zone = HAMMER_ZONE_DECODE(undo->undo_offset);
        offset = undo->undo_offset & HAMMER_BUFMASK;
 
-       if (offset + undo->undo_data_bytes > HAMMER_BUFSIZE) {
+       if (offset + bytes > HAMMER_BUFSIZE) {
                kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
                return (EIO);
        }
@@ -363,7 +780,7 @@ hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)volume->ondisk + offset,
-                                        undo->undo_data_bytes);
+                                        bytes);
                hammer_modify_volume_done(volume);
 
                /*
@@ -384,15 +801,15 @@ hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                                           0, &error);
                if (buffer == NULL) {
                        kprintf("HAMMER: UNDO record, "
-                               "cannot access buffer %016llx\n",
-                               (long long)undo->undo_offset);
+                               "cannot access buffer %016jx\n",
+                               (intmax_t)undo->undo_offset);
                        break;
                }
                hammer_modify_buffer(NULL, buffer, NULL, 0);
                hammer_recover_copy_undo(undo->undo_offset,
                                         (char *)(undo + 1),
                                         (char *)buffer->ondisk + offset,
-                                        undo->undo_data_bytes);
+                                        bytes);
                hammer_modify_buffer_done(buffer);
 
                /*
@@ -418,11 +835,11 @@ hammer_recover_copy_undo(hammer_off_t undo_offset,
                         char *src, char *dst, int bytes)
 {
        if (hammer_debug_general & 0x0080) {
-               kprintf("UNDO %016llx: %d\n",
-                       (long long)undo_offset, bytes);
+               kprintf("UNDO %016jx: %d\n",
+                       (intmax_t)undo_offset, bytes);
        }
 #if 0
-       kprintf("UNDO %016llx:", (long long)undo_offset);
+       kprintf("UNDO %016jx:", (intmax_t)undo_offset);
        hammer_recover_debug_dump(22, dst, bytes);
        kprintf("%22s", "to:");
        hammer_recover_debug_dump(22, src, bytes);
index c5fb784..c2f4d07 100644 (file)
@@ -41,6 +41,7 @@
 #include "hammer.h"
 
 static int hammer_und_rb_compare(hammer_undo_t node1, hammer_undo_t node2);
+static void hammer_format_undo(void *base, u_int32_t seqno);
 
 RB_GENERATE2(hammer_und_rb_tree, hammer_undo, rb_node,
              hammer_und_rb_compare, hammer_off_t, offset);
@@ -73,15 +74,20 @@ hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t zone3_off, int *errorp)
 }
 
 /*
- * Generate an UNDO record for the block of data at the specified zone1
+ * Generate UNDO record(s) for the block of data at the specified zone1
  * or zone2 offset.
  *
  * The recovery code will execute UNDOs in reverse order, allowing overlaps.
  * All the UNDOs are executed together so if we already laid one down we
  * do not have to lay another one down for the same range.
+ *
+ * For HAMMER version 4+ UNDO a 512 byte boundary is enforced and a PAD
+ * will be laid down for any unused space.  UNDO FIFO media structures
+ * will implement the hdr_seq field (it used to be reserved01), and
+ * both flush and recovery mechanics will be very different.
  */
 int
-hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
+hammer_generate_undo(hammer_transaction_t trans,
                     hammer_off_t zone_off, void *base, int len)
 {
        hammer_mount_t hmp;
@@ -93,6 +99,7 @@ hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
        hammer_off_t next_offset;
        int error;
        int bytes;
+       int n;
 
        hmp = trans->hmp;
 
@@ -108,103 +115,320 @@ hammer_generate_undo(hammer_transaction_t trans, hammer_io_t io,
 
        /* no undo recursion */
        hammer_modify_volume(NULL, root_volume, NULL, 0);
-
        hammer_lock_ex(&hmp->undo_lock);
-again:
-       /*
-        * Allocate space in the FIFO
-        */
-       bytes = ((len + HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK) +
-               sizeof(struct hammer_fifo_undo) +
-               sizeof(struct hammer_fifo_tail);
-       if (hammer_undo_space(trans) < bytes + HAMMER_BUFSIZE*2)
-               panic("hammer: insufficient undo FIFO space!");
 
-       next_offset = undomap->next_offset;
+       /* undo had better not roll over (loose test) */
+       if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
+               panic("hammer: insufficient undo FIFO space!");
 
        /*
-        * Wrap next_offset
+        * Loop until the undo for the entire range has been laid down.
         */
-       if (undomap->next_offset == undomap->alloc_offset) {
-               next_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
-               undomap->next_offset = next_offset;
+       while (len) {
+               /*
+                * Fetch the layout offset in the UNDO FIFO, wrap it as
+                * necessary.
+                */
+               if (undomap->next_offset == undomap->alloc_offset) {
+                       undomap->next_offset =
+                               HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
+               }
+               next_offset = undomap->next_offset;
+
+               /*
+                * This is a tail-chasing FIFO, when we hit the start of a new
+                * buffer we don't have to read it in.
+                */
+               if ((next_offset & HAMMER_BUFMASK) == 0) {
+                       undo = hammer_bnew(hmp, next_offset, &error, &buffer);
+                       hammer_format_undo(undo, hmp->undo_seqno ^ 0x40000000);
+               } else {
+                       undo = hammer_bread(hmp, next_offset, &error, &buffer);
+               }
+               if (error)
+                       break;
+               hammer_modify_buffer(NULL, buffer, NULL, 0);
+
+               /*
+                * Calculate how big a media structure fits up to the next
+                * alignment point and how large a data payload we can
+                * accomodate.
+                *
+                * If n calculates to 0 or negative there is no room for
+                * anything but a PAD.
+                */
+               bytes = HAMMER_UNDO_ALIGN -
+                       ((int)next_offset & HAMMER_UNDO_MASK);
+               n = bytes -
+                   (int)sizeof(struct hammer_fifo_undo) -
+                   (int)sizeof(struct hammer_fifo_tail);
+
+               /*
+                * If available space is insufficient for any payload
+                * we have to lay down a PAD.
+                *
+                * The minimum PAD is 8 bytes and the head and tail will
+                * overlap each other in that case.  PADs do not have
+                * sequence numbers or CRCs.
+                *
+                * A PAD may not start on a boundary.  That is, every
+                * 512-byte block in the UNDO/REDO FIFO must begin with
+                * a record containing a sequence number.
+                */
+               if (n <= 0) {
+                       KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
+                       KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
+                       tail = (void *)((char *)undo + bytes - sizeof(*tail));
+                       if ((void *)undo != (void *)tail) {
+                               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+                               tail->tail_type = HAMMER_HEAD_TYPE_PAD;
+                               tail->tail_size = bytes;
+                       }
+                       undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
+                       undo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
+                       undo->head.hdr_size = bytes;
+                       /* NO CRC OR SEQ NO */
+                       undomap->next_offset += bytes;
+                       hammer_modify_buffer_done(buffer);
+                       hammer_stats_undo += bytes;
+                       continue;
+               }
+
+               /*
+                * Calculate the actual payload and recalculate the size
+                * of the media structure as necessary.
+                */
+               if (n > len) {
+                       n = len;
+                       bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
+                                ~HAMMER_HEAD_ALIGN_MASK) +
+                               (int)sizeof(struct hammer_fifo_undo) +
+                               (int)sizeof(struct hammer_fifo_tail);
+               }
+               if (hammer_debug_general & 0x0080) {
+                       kprintf("undo %016llx %d %d\n",
+                               (long long)next_offset, bytes, n);
+               }
+
+               undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
+               undo->head.hdr_type = HAMMER_HEAD_TYPE_UNDO;
+               undo->head.hdr_size = bytes;
+               undo->head.hdr_seq = hmp->undo_seqno++;
+               undo->head.hdr_crc = 0;
+               undo->undo_offset = zone_off;
+               undo->undo_data_bytes = n;
+               bcopy(base, undo + 1, n);
+
+               tail = (void *)((char *)undo + bytes - sizeof(*tail));
+               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+               tail->tail_type = HAMMER_HEAD_TYPE_UNDO;
+               tail->tail_size = bytes;
+
+               KKASSERT(bytes >= sizeof(undo->head));
+               undo->head.hdr_crc = crc32(undo, HAMMER_FIFO_HEAD_CRCOFF) ^
+                            crc32(&undo->head + 1, bytes - sizeof(undo->head));
+               undomap->next_offset += bytes;
+               hammer_stats_undo += bytes;
+
+               /*
+                * Before we finish off the buffer we have to deal with any
+                * junk between the end of the media structure we just laid
+                * down and the UNDO alignment boundary.  We do this by laying
+                * down a dummy PAD.  Even though we will probably overwrite
+                * it almost immediately we have to do this so recovery runs
+                * can iterate the UNDO space without having to depend on
+                * the indices in the volume header.
+                *
+                * This dummy PAD will be overwritten on the next undo so
+                * we do not adjust undomap->next_offset.
+                */
+               bytes = HAMMER_UNDO_ALIGN -
+                       ((int)undomap->next_offset & HAMMER_UNDO_MASK);
+               if (bytes != HAMMER_UNDO_ALIGN) {
+                       KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
+                       undo = (void *)(tail + 1);
+                       tail = (void *)((char *)undo + bytes - sizeof(*tail));
+                       if ((void *)undo != (void *)tail) {
+                               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+                               tail->tail_type = HAMMER_HEAD_TYPE_PAD;
+                               tail->tail_size = bytes;
+                       }
+                       undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
+                       undo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
+                       undo->head.hdr_size = bytes;
+                       /* NO CRC OR SEQ NO */
+               }
+               hammer_modify_buffer_done(buffer);
+
+               /*
+                * Adjust for loop
+                */
+               len -= n;
+               base = (char *)base + n;
+               zone_off += n;
+       }
+       hammer_modify_volume_done(root_volume);
+       hammer_unlock(&hmp->undo_lock);
+       /* XXX flush volume header */
+
+       if (buffer)
+               hammer_rel_buffer(buffer, 0);
+       return(error);
+}
+
+#if 0
+/*
+ * HAMMER version 4+ REDO support (placeholder, compiled out by #if 0).
+ *
+ * Generate REDO record(s) for logical data writes to a file.  REDO records
+ * are only created if the created inode was previously synced (such that
+ * it will still exist after any recovery), and also only for a limited
+ * amount of write data between fsyncs.
+ *
+ * REDO records are used to improve fsync() performance.  Instead of having
+ * to go through a complete flush cycle involving at least two disk
+ * synchronizations the fsync need only flush UNDO FIFO buffers through
+ * the related REDO records, which is a single synchronization requiring
+ * no track seeking.  If a recovery becomes necessary the recovery code
+ * will generate logical data writes based on the REDO records encountered.
+ * That is, the recovery code will UNDO any partial meta-data/data writes
+ * at the raw disk block level and then REDO the data writes at the logical
+ * level.
+ */
+int
+hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
+                    hammer_off_t file_off, hammer_off_t zone_off,
+                    void *base, int len)
+{
+}
+#endif
+
+/*
+ * Preformat a new UNDO block.  We could read the old one in but we get
+ * better performance if we just pre-format a new one.
+ *
+ * The recovery code always works forwards so the caller just makes sure the
+ * seqno is not contiguous with prior UNDOs or ancient UNDOs now being
+ * overwritten.
+ *
+ * base  - start of the buffer to format; HAMMER_BUFSIZE bytes are zeroed
+ *         and then filled in.
+ * seqno - sequence number stored in the first record; every following
+ *         record in the buffer gets the next consecutive value.
+ *
+ * The buffer ends up holding back-to-back DUMMY records, one for each
+ * HAMMER_UNDO_ALIGN bytes, each with its own head, tail and CRC.
+ */
+static
+void
+hammer_format_undo(void *base, u_int32_t seqno)
+{
+       hammer_fifo_head_t head;
+       hammer_fifo_tail_t tail;
+       int i;
+       int bytes = HAMMER_UNDO_ALIGN;
+
+       bzero(base, HAMMER_BUFSIZE);
+
+       for (i = 0; i < HAMMER_BUFSIZE; i += bytes) {
+               /* The tail occupies the last bytes of this record. */
+               head = (void *)((char *)base + i);
+               tail = (void *)((char *)head + bytes - sizeof(*tail));
+
+               head->hdr_signature = HAMMER_HEAD_SIGNATURE;
+               head->hdr_type = HAMMER_HEAD_TYPE_DUMMY;
+               head->hdr_size = bytes;
+               head->hdr_seq = seqno++;
+               head->hdr_crc = 0;
+
+               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+               tail->tail_type = HAMMER_HEAD_TYPE_DUMMY;
+               tail->tail_size = bytes;
+
+               /*
+                * The second crc32() pass runs from the end of the head to
+                * the end of the record and therefore includes the tail,
+                * so the CRC has to be computed after the tail is filled in.
+                */
+               head->hdr_crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
+                            crc32(head + 1, bytes - sizeof(*head));
        }
+}
+
+/*
+ * HAMMER version 4+ conversion support.
+ *
+ * Convert a HAMMER version < 4 UNDO FIFO area to a 4+ UNDO FIFO area.
+ * The 4+ UNDO FIFO area is backwards compatible.  The conversion is
+ * needed to initialize the sequence space and place headers on the
+ * new 512-byte undo boundary.
+ *
+ * Both the in-core and the on-disk undomap are reset so that first_offset
+ * and next_offset point at the start of the UNDO zone.  Every
+ * HAMMER_UNDO_ALIGN-sized slot from there up to the on-disk undomap's
+ * alloc_offset is then rewritten as a DUMMY record with consecutive
+ * sequence numbers starting at 0, and hmp->undo_seqno is set to the
+ * first sequence number that was not used.
+ *
+ * Returns the error stored by the last hammer_bnew() call, or 0 if the
+ * loop has nothing to convert.
+ */
+int
+hammer_upgrade_undo_4(hammer_transaction_t trans)
+{
+       hammer_mount_t hmp;
+       hammer_volume_t root_volume;
+       hammer_blockmap_t undomap;
+       hammer_buffer_t buffer = NULL;
+       hammer_fifo_head_t head;
+       hammer_fifo_tail_t tail;
+       hammer_off_t next_offset;
+       u_int32_t seqno;
+       /* Must start at 0: the loop below may execute zero times. */
+       int error = 0;
+       int bytes;
+
+       hmp = trans->hmp;
+
+       root_volume = trans->rootvol;
+
+       /* no undo recursion */
+       hammer_lock_ex(&hmp->undo_lock);
+       hammer_modify_volume(NULL, root_volume, NULL, 0);
 
        /*
-        * This is a tail-chasing FIFO, when we hit the start of a new
-        * buffer we don't have to read it in.
+        * Adjust the in-core undomap and the on-disk undomap.
         */
-       if ((next_offset & HAMMER_BUFMASK) == 0)
-               undo = hammer_bnew(hmp, next_offset, &error, &buffer);
-       else
-               undo = hammer_bread(hmp, next_offset, &error, &buffer);
-       if (error)
-               goto done;
-
-       hammer_modify_buffer(NULL, buffer, NULL, 0);
+       next_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
+       undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
+       undomap->next_offset = next_offset;
+       undomap->first_offset = next_offset;
 
-       KKASSERT(undomap->next_offset == next_offset);
+       undomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
+       undomap->next_offset = next_offset;
+       undomap->first_offset = next_offset;
 
        /*
-        * The FIFO entry would cross a buffer boundary, PAD to the end
-        * of the buffer and try again.  Due to our data alignment, the
-        * worst case (smallest) PAD record is 8 bytes.  PAD records only
-        * populate the first 8 bytes of hammer_fifo_head and the tail may
-        * be at the same offset as the head.
+        * Loop over the entire UNDO space creating DUMMY entries.  Sequence
+        * numbers are assigned.
+        *
+        * undomap refers to the on-disk blockmap entry from here on, so
+        * the loop bound is the on-disk alloc_offset.
         */
-       if ((next_offset ^ (next_offset + bytes)) & ~HAMMER_BUFMASK64) {
-               bytes = HAMMER_BUFSIZE - ((int)next_offset & HAMMER_BUFMASK);
-               tail = (void *)((char *)undo + bytes - sizeof(*tail));
-               if ((void *)undo != (void *)tail) {
-                       tail->tail_signature = HAMMER_TAIL_SIGNATURE;
-                       tail->tail_type = HAMMER_HEAD_TYPE_PAD;
-                       tail->tail_size = bytes;
-               }
-               undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
-               undo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
-               undo->head.hdr_size = bytes;
-               /* NO CRC */
-               undomap->next_offset += bytes;
+       seqno = 0;
+       bytes = HAMMER_UNDO_ALIGN;
+
+       while (next_offset != undomap->alloc_offset) {
+               head = hammer_bnew(hmp, next_offset, &error, &buffer);
+               if (error)
+                       break;
+               hammer_modify_buffer(NULL, buffer, NULL, 0);
+
+               head->hdr_signature = HAMMER_HEAD_SIGNATURE;
+               head->hdr_type = HAMMER_HEAD_TYPE_DUMMY;
+               head->hdr_size = bytes;
+               head->hdr_seq = seqno;
+               head->hdr_crc = 0;
+
+               /* The tail occupies the last bytes of this record. */
+               tail = (void *)((char *)head + bytes - sizeof(*tail));
+               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+               tail->tail_type = HAMMER_HEAD_TYPE_DUMMY;
+               tail->tail_size = bytes;
+
+               /*
+                * The second crc32() pass includes the tail, so the CRC
+                * is computed only after the tail has been filled in.
+                */
+               head->hdr_crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
+                            crc32(head + 1, bytes - sizeof(*head));
                hammer_modify_buffer_done(buffer);
+
                hammer_stats_undo += bytes;
-               goto again;
-       }
-       if (hammer_debug_general & 0x0080) {
-               kprintf("undo %016llx %d %d\n",
-                       (long long)next_offset, bytes, len);
+               next_offset += HAMMER_UNDO_ALIGN;
+               ++seqno;
        }
 
        /*
-        * We're good, create the entry.
+        * The sequence number will be the next sequence number to lay down.
         */
-       undo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
-       undo->head.hdr_type = HAMMER_HEAD_TYPE_UNDO;
-       undo->head.hdr_size = bytes;
-       undo->head.reserved01 = 0;
-       undo->head.hdr_crc = 0;
-       undo->undo_offset = zone_off;
-       undo->undo_data_bytes = len;
-       bcopy(base, undo + 1, len);
-
-       tail = (void *)((char *)undo + bytes - sizeof(*tail));
-       tail->tail_signature = HAMMER_TAIL_SIGNATURE;
-       tail->tail_type = HAMMER_HEAD_TYPE_UNDO;
-       tail->tail_size = bytes;
-
-       KKASSERT(bytes >= sizeof(undo->head));
-       undo->head.hdr_crc = crc32(undo, HAMMER_FIFO_HEAD_CRCOFF) ^
-                            crc32(&undo->head + 1, bytes - sizeof(undo->head));
-       undomap->next_offset += bytes;
-       hammer_stats_undo += bytes;
+       hmp->undo_seqno = seqno;
+       kprintf("version upgrade seqno start %08x\n", seqno);
 
-       hammer_modify_buffer_done(buffer);
-done:
        hammer_modify_volume_done(root_volume);
        hammer_unlock(&hmp->undo_lock);
 
        if (buffer)
                hammer_rel_buffer(buffer, 0);
-       return(error);
+       return (error);
 }
 
 /*
index 55d16b9..89a3832 100644 (file)
@@ -47,7 +47,7 @@
 #include <sys/buf2.h>
 #include "hammer.h"
 
-int hammer_supported_version = HAMMER_VOL_VERSION_TWO;
+int hammer_supported_version = HAMMER_VOL_VERSION_DEFAULT;
 int hammer_debug_io;
 int hammer_debug_general;
 int hammer_debug_debug = 1;            /* medium-error panics */ 
@@ -426,6 +426,9 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
 
        /*
         * Re-open read-write if originally read-only, or vise-versa.
+        *
+        * When going from read-only to read-write execute the stage2
+        * recovery if it has not already been run.
         */
        if (mp->mnt_flag & MNT_UPDATE) {
                error = 0;
@@ -437,6 +440,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                        rootvol = hammer_get_root_volume(hmp, &error);
                        if (rootvol) {
                                hammer_recover_flush_buffers(hmp, rootvol, 1);
+                               error = hammer_recover_stage2(hmp, rootvol);
                                bcopy(rootvol->ondisk->vol0_blockmap,
                                      hmp->blockmap,
                                      sizeof(hmp->blockmap));
@@ -593,7 +597,11 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
        if (hammer_debug_general & 0x0001)
                kprintf("HAMMER: undo_rec_limit %d\n", hmp->undo_rec_limit);
 
-       error = hammer_recover(hmp, rootvol);
+       /*
+        * NOTE: Recover stage1 not only handles meta-data recovery, it
+        *       also sets hmp->undo_seqno for HAMMER VERSION 4+ filesystems.
+        */
+       error = hammer_recover_stage1(hmp, rootvol);
        if (error) {
                kprintf("Failed to recover HAMMER filesystem on mount\n");
                goto done;
@@ -648,6 +656,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                goto done;
        vput(rootvp);
        /*vn_unlock(hmp->rootvp);*/
+       error = hammer_recover_stage2(hmp, rootvol);
 
 done:
        hammer_rel_volume(rootvol, 0);