HAMMER VFS - Implement REDO recovery code
authorMatthew Dillon <dillon@apollo.backplane.com>
Fri, 26 Mar 2010 05:42:47 +0000 (22:42 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Fri, 26 Mar 2010 05:42:47 +0000 (22:42 -0700)
* Implement the primary REDO recovery mechanics and document the whole mess.
  REDO recovery essentially works using an expanded UNDO/REDO FIFO range.

  The nominal UNDO range, required for running UNDOs, is calculated first.
  If a REDO_SYNC record is found within this range the record specifies
  the expanded FIFO start offset required to run REDOs.  This is necessary
  because the inodes related to REDOs laid down in the FIFO are not
  necessarily flushed in the next flush sequence, so the recovery code may
  have to scan the UNDO/REDO FIFO backwards considerably beyond the nominal
  recovery range required to run UNDOs in order to find active REDOs.

  When a REDO_SYNC record is found the recovery code expands the range
  by scanning backwards and validating the UNDO/REDO FIFO as it goes.
  It must make sure that the sequence space remains contiguous all the
  way back to the REDO_SYNC point.

  While doing the reverse scan the recovery code collects REDO_TERM_*
  records which are used to mask earlier REDO_* records once their
  meta-data has been flushed.  Only TERM records in the expanded range
  that are outside the nominal UNDO range matter.  Any TERM records in
  the nominal UNDO range refer to meta-data which was undone by the
  stage1 UNDO recovery and so must be ignored (we want to run the
  related REDOs).

  The recovery code then does a forward scan through the entire expanded
  range of the UNDO/REDO FIFO executing any REDO_* records it finds
  which have not been masked by later REDO_TERM_* records.  It executes
  the REDOs using the live filesystem.

* Expand the REDO FIFO structure, I had forgotten to add a localization
  field, otherwise HAMMER doesn't know which PFS the REDO is referring to.

* Umount was improperly flushing the FIFO to the disk for read-only mounts.
  Fix it.

* The recovery code now detects whether any REDOs are present by the
  observation of a REDO_SYNC record in the nominal UNDO recovery range.
  It will not run stage2 (the REDO pass) if it does not see this record.

* Properly generate a REDO_SYNC record in the UNDO space when generating
  only REDOs, as well as UNDOs.  HAMMER was previously only generating
  the REDO_SYNC record when generating UNDOs.

* Generate a REDO_TRUNC record during a file flush if any records were
  previously queued with REDO, even if those records no longer exist
  (e.g. due to a truncation) and even if REDO is now turned off due to
  redo heuristic limits being exceeded.

  This is necessary in order for the recovery code to properly sequence
  REDOs and TRUNCations during recovery.

* For now be very verbose during redo recovery.

* Make sure that mount -o ro and mount -u -o rw work properly.  The
  stage2 REDO cannot be run on a read-only mount because it requires a
  live filesystem.  The operations are deferred until the mount is
  upgraded to rw.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_disk.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_mount.h
sys/vfs/hammer/hammer_recover.c
sys/vfs/hammer/hammer_redo.c
sys/vfs/hammer/hammer_vnops.c

index 3040abc..06c792b 100644 (file)
@@ -869,6 +869,8 @@ struct hammer_mount {
        hammer_tid_t    flush_tid2;             /* flusher tid sequencing */
        int64_t copy_stat_freebigblocks;        /* number of free bigblocks */
        u_int32_t       undo_seqno;             /* UNDO/REDO FIFO seqno */
+       u_int32_t       recover_stage2_seqno;   /* REDO recovery seqno */
+       hammer_off_t    recover_stage2_offset;  /* REDO recovery offset */
 
        struct netexport export;
        struct hammer_lock sync_lock;
@@ -896,6 +898,8 @@ typedef struct hammer_mount *hammer_mount_t;
 #define HAMMER_MOUNT_CRITICAL_ERROR    0x0001
 #define HAMMER_MOUNT_FLUSH_RECOVERY    0x0002
 #define HAMMER_MOUNT_REDO_SYNC         0x0004
+#define HAMMER_MOUNT_REDO_RECOVERY_REQ 0x0008
+#define HAMMER_MOUNT_REDO_RECOVERY_RUN 0x0010
 
 struct hammer_sync_info {
        int error;
index 6871246..7d6ddfa 100644 (file)
@@ -494,6 +494,8 @@ struct hammer_fifo_redo {
        hammer_off_t            redo_offset;    /* logical offset in file */
        int32_t                 redo_data_bytes;
        u_int32_t               redo_flags;
+       u_int32_t               redo_localization;
+       u_int32_t               redo_reserved;
        u_int64_t               redo_mtime;     /* set mtime */
 };
 
index abf1ceb..4dd72b9 100644 (file)
@@ -894,6 +894,8 @@ hammer_flusher_meta_halflimit(hammer_mount_t hmp)
 int
 hammer_flusher_haswork(hammer_mount_t hmp)
 {
+       if (hmp->ronly)
+               return(0);
        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                return(0);
        if (TAILQ_FIRST(&hmp->flush_group_list) ||      /* dirty inodes */
index db8e3da..1d8ca8d 100644 (file)
@@ -59,8 +59,6 @@ struct hammer_mount_info {
 #define HMNT_MASTERID  0x00000002      /* master_id field set */
 #define HMNT_EXPORTREQ 0x00000004
 #define HMNT_UNDO_DIRTY        0x00000008
-#define HMNT_STAGE2    0x00000010      /* ran stage-2 recovery */
-#define HMNT_HASREDO   0x00000020      /* stage-2 must scan for REDO */
 
 #define HMNT_USERFLAGS (HMNT_NOHISTORY | HMNT_MASTERID)
 
index 03f1f07..34643b8 100644 (file)
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- * 
- * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.29 2008/07/26 05:36:21 dillon Exp $
+ */
+
+/*
+ * UNDO ALGORITHM:
+ *
+ *     The UNDO algorithm is trivial.  The nominal UNDO range in the
+ *     FIFO is determined by taking the first/next offset stored in
+ *     the volume header.  The next offset may not be correct since
+ *     UNDO flushes are not required to flush the volume header, so
+ *     the code also scans forward until it finds a discontinuous
+ *     sequence number.
+ *
+ *     The UNDOs are then scanned and executed in reverse order.  These
+ *     UNDOs are effectively just data restorations based on HAMMER offsets.
+ *
+ * REDO ALGORITHM:
+ *
+ *     REDO records are laid down in the UNDO/REDO FIFO for nominal
+ *     writes, truncations, and file extension ops.  On a per-inode
+ *     basis two types of REDO records are generated, REDO_WRITE
+ *     and REDO_TRUNC.
+ *
+ *     Essentially the recovery block will contain UNDO records backing
+ *     out partial operations and REDO records to regenerate those partial
+ *     operations guaranteed by the filesystem during recovery.
+ *
+ *     REDO generation is optional, and can also be started and then
+ *     later stopped due to excessive write()s inbetween fsyncs, or not
+ *     started at all.  Because of this the recovery code must determine
+ *     when REDOs are valid and when they are not.  Additional records are
+ *     generated to help figure it out.
+ *
+ *     The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
+ *     during a flush cycle indicating which records the flush cycle
+ *     has synched meta-data for, and HAMMER_REDO_SYNC is generated in
+ *     each flush cycle to indicate how far back in the UNDO/REDO FIFO
+ *     the recovery code must go to find the earliest applicable REDO
+ *     record.  Applicable REDO records can be far outside the nominal
+ *     UNDO recovery range, for example if a write() lays down a REDO but
+ *     the related file is not flushed for several cycles.
+ *
+ *     The SYNC reference is to a point prior to the nominal UNDO FIFO
+ *     range, creating an extended REDO range which must be scanned.
+ *
+ *     Any REDO_WRITE/REDO_TRUNC encountered within the extended range
+ *     which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
+ *     prior to the start of the nominal UNDO range are applicable.
+ *     That is, any REDO_TERM_* records in the extended range but not in
+ *     the nominal undo range will mask any redo operations for prior REDO
+ *     records.  This is necessary because once the TERM is laid down
+ *     followup operations may make additional changes to the related
+ *     records but not necessarily record them as REDOs (because REDOs are
+ *     optional).
+ *
+ *     REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
+ *     must be ignored since they represent meta-data flushes which are
+ *     undone by the UNDOs in that nominal UNDO range by the recovery
+ *     code.  Only REDO_TERM_* records in the extended range but not
+ *     in the nominal undo range are applicable.
+ *
+ *     The REDO_SYNC record itself always exists in the nominal UNDO range
+ *     (this is how the extended range is determined).  For recovery
+ *     purposes the most recent REDO_SYNC record is always used if several
+ *     are found.
+ *
+ * CRASHES DURING UNDO/REDO
+ *
+ *     A crash during the UNDO phase requires no additional effort.  The
+ *     UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
+ *     remains unchanged and has no re-crash issues.
+ *
+ *     A crash during the REDO phase is more complex because the REDOs
+ *     run normal filesystem ops and generate additional UNDO/REDO records.
+ *     REDO is disabled during REDO recovery and any SYNC records generated
+ *     by flushes during REDO recovery must continue to reference the
+ *     original extended range.
+ *
+ *     If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
+ *     may become impossible.  This is detected when the start of the
+ *     extended range fails to have monotonically increasing sequence
+ *     numbers leading into the nominal undo range.
  */
 
 #include "hammer.h"
 
+/*
+ * Each rterm entry has a list of fifo offsets indicating termination
+ * points.  These are stripped as the scan progresses.
+ */
+typedef struct hammer_rterm_entry {
+       struct hammer_rterm_entry *next;
+       hammer_off_t            fifo_offset;
+} *hammer_rterm_entry_t;
+
+/*
+ * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
+ * TRUNC entries ignore the offset.
+ */
+typedef struct hammer_rterm {
+       RB_ENTRY(hammer_rterm)  rb_node;
+       int64_t                 redo_objid;
+       u_int32_t               redo_localization;
+       u_int32_t               redo_flags;
+       hammer_off_t            redo_offset;
+       hammer_rterm_entry_t    term_list;
+} *hammer_rterm_t;
+
+static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
+struct hammer_rterm_rb_tree;
+RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
+RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
+
 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
                        hammer_off_t end_off);
 static int hammer_check_head_signature(hammer_fifo_head_t head,
@@ -55,6 +161,16 @@ static void hammer_recover_debug_dump(int w, char *buf, int bytes);
 #endif
 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
                        hammer_fifo_undo_t undo);
+static int hammer_recover_redo_rec(hammer_mount_t hmp,
+                       struct hammer_rterm_rb_tree *root,
+                       hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
+static int hammer_recover_redo_run(hammer_mount_t hmp,
+                       struct hammer_rterm_rb_tree *root,
+                       hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
+static void hammer_recover_redo_exec(hammer_mount_t hmp,
+                       hammer_fifo_redo_t redo);
+
+RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
 
 /*
  * Recover filesystem meta-data on mount.  This procedure figures out the
@@ -96,6 +212,8 @@ hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
        buffer = NULL;
        error = 0;
 
+       hmp->recover_stage2_offset = 0;
+
        if (first_offset > rootmap->alloc_offset ||
            last_offset > rootmap->alloc_offset) {
                kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
@@ -162,6 +280,8 @@ hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
                scan_offset = first_offset;
                scan_offset_save = scan_offset;
                ++seqno;
+               hmp->recover_stage2_seqno = seqno;
+
                for (;;) {
                        head = hammer_recover_scan_fwd(hmp, root_volume,
                                                       &scan_offset,
@@ -242,8 +362,7 @@ hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
                goto done;
        }
 
-       kprintf("HAMMER(%s) Start recovery undo %016jx - %016jx "
-               "(%jd bytes of UNDO)%s\n",
+       kprintf("HAMMER(%s) recovery undo  %016jx-%016jx (%jd bytes)%s\n",
                root_volume->ondisk->vol_name,
                (intmax_t)first_offset,
                (intmax_t)last_offset,
@@ -266,6 +385,10 @@ hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;
+
+               /*
+                * Normal UNDO
+                */
                error = hammer_recover_undo(hmp, root_volume, &head->undo);
                if (error) {
                        kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
@@ -273,6 +396,28 @@ hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
                                (intmax_t)scan_offset - head->head.hdr_size);
                        break;
                }
+
+               /*
+                * The first REDO_SYNC record encountered (scanning backwards)
+                * enables REDO processing.
+                */
+               if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
+                   head->redo.redo_flags == HAMMER_REDO_SYNC) {
+                       if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
+                               kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
+                                       "records in UNDO/REDO FIFO.\n",
+                                       root_volume->ondisk->vol_name
+                               );
+                       } else {
+                               hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
+                               hmp->recover_stage2_offset =
+                                       head->redo.redo_offset;
+                               kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
+                                       root_volume->ondisk->vol_name,
+                                       (intmax_t)head->redo.redo_offset);
+                       }
+               }
+
                bytes -= head->head.hdr_size;
 
                /*
@@ -301,6 +446,7 @@ hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
                        }
                }
        }
+       KKASSERT(error || bytes == 0);
 done:
        if (buffer) {
                hammer_rel_buffer(buffer, 0);
@@ -354,7 +500,8 @@ done:
  * switches from read-only to read-write.  vnodes may or may not be present.
  *
  * The stage1 code will have already calculated the correct FIFO range
- * and stored it in the rootmap.
+ * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
+ * range for REDO is stored in hmp->recover_stage2_offset.
  */
 int
 hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
@@ -362,21 +509,25 @@ hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
        hammer_blockmap_t rootmap;
        hammer_buffer_t buffer;
        hammer_off_t scan_offset;
+       hammer_off_t oscan_offset;
        hammer_off_t bytes;
+       hammer_off_t ext_bytes;
        hammer_fifo_any_t head;
        hammer_off_t first_offset;
        hammer_off_t last_offset;
+       hammer_off_t ext_offset;
+       struct hammer_rterm_rb_tree rterm_root;
+       u_int32_t seqno;
        int error;
+       int verbose = 0;
+       int dorscan;
 
        /*
         * Stage 2 can only be run on a RW mount, or when the mount is
-        * switched from RO to RW.  It must be run only once.
+        * switched from RO to RW.
         */
        KKASSERT(hmp->ronly == 0);
-
-       if (hmp->hflags & HMNT_STAGE2)
-               return(0);
-       hmp->hflags |= HMNT_STAGE2;
+       RB_INIT(&rterm_root);
 
        /*
         * Examine the UNDO FIFO.  If it is empty the filesystem is clean
@@ -385,44 +536,159 @@ hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
        rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        first_offset = rootmap->first_offset;
        last_offset  = rootmap->next_offset;
-       if (first_offset == last_offset)
+       if (first_offset == last_offset) {
+               KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
                return(0);
+       }
+
+       /*
+        * Stage2 must only be run once, and will not be run at all
+        * if Stage1 did not find a REDO_SYNC record.
+        */
+       error = 0;
+       buffer = NULL;
+
+       if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
+               goto done;
+       hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
+       hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
+       ext_offset = hmp->recover_stage2_offset;
+       if (ext_offset == 0) {
+               kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
+                       "offset, ignoring\n",
+                       root_volume->ondisk->vol_name);
+               goto done;
+       }
 
+       /*
+        * Calculate nominal UNDO range (this is not yet the extended
+        * range).
+        */
        if (last_offset >= first_offset) {
                bytes = last_offset - first_offset;
        } else {
                bytes = rootmap->alloc_offset - first_offset +
                        (last_offset & HAMMER_OFF_LONG_MASK);
        }
-       kprintf("HAMMER(%s) Start recovery redo %016jx - %016jx "
-               "(%jd bytes of REDO)%s\n",
+       kprintf("HAMMER(%s) recovery redo  %016jx-%016jx (%jd bytes)%s\n",
                root_volume->ondisk->vol_name,
                (intmax_t)first_offset,
                (intmax_t)last_offset,
                (intmax_t)bytes,
                (hmp->ronly ? " (RO)" : "(RW)"));
+       verbose = 1;
        if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
                kprintf("Undo size is absurd, unable to mount\n");
-               return(EIO);
+               error = EIO;
+               goto fatal;
        }
 
        /*
-        * Scan the REDOs forwards.
+        * Scan the REDOs backwards collecting REDO_TERM_* information.
+        * This information is only collected for the extended range,
+        * non-inclusive of any TERMs in the nominal UNDO range.
+        *
+        * If the stage2 extended range is inside the nominal undo range
+        * we have nothing to scan.
+        *
+        * This must fit in memory!
         */
-       scan_offset = first_offset;
-       buffer = NULL;
+       if (first_offset < last_offset) {
+               /*
+                * [      first_offset........last_offset      ]
+                */
+               if (ext_offset < first_offset) {
+                       dorscan = 1;
+                       ext_bytes = first_offset - ext_offset;
+               } else if (ext_offset > last_offset) {
+                       dorscan = 1;
+                       ext_bytes = (rootmap->alloc_offset - ext_offset) +
+                                   (first_offset & HAMMER_OFF_LONG_MASK);
+               } else {
+                       ext_bytes = -(ext_offset - first_offset);
+                       dorscan = 0;
+               }
+       } else {
+               /*
+                * [......last_offset         first_offset.....]
+                */
+               if (ext_offset < last_offset) {
+                       ext_bytes = -((rootmap->alloc_offset - first_offset) +
+                                   (ext_offset & HAMMER_OFF_LONG_MASK));
+                       dorscan = 0;
+               } else if (ext_offset > first_offset) {
+                       ext_bytes = -(ext_offset - first_offset);
+                       dorscan = 0;
+               } else {
+                       ext_bytes = first_offset - ext_offset;
+                       dorscan = 1;
+               }
+       }
 
-       while (bytes) {
+       if (dorscan) {
+               scan_offset = first_offset;
+               kprintf("HAMMER(%s) Find extended redo  %016jx, %jd extbytes\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)ext_offset,
+                       (intmax_t)ext_bytes);
+               seqno = hmp->recover_stage2_seqno - 1;
+               for (;;) {
+                       head = hammer_recover_scan_rev(hmp, root_volume,
+                                                      &scan_offset,
+                                                      &error, &buffer);
+                       if (error)
+                               break;
+                       if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
+                               if (head->head.hdr_seq != seqno) {
+                                       error = ERANGE;
+                                       break;
+                               }
+                               error = hammer_recover_redo_rec(
+                                               hmp, &rterm_root,
+                                               scan_offset, &head->redo);
+                               --seqno;
+                       }
+                       if (scan_offset == ext_offset)
+                               break;
+               }
+               if (error) {
+                       kprintf("HAMMER(%s) Find extended redo failed %d, "
+                               "unable to run REDO\n",
+                               root_volume->ondisk->vol_name,
+                               error);
+                       goto done;
+               }
+       } else {
+               kprintf("HAMMER(%s) Embeded extended redo %016jx, "
+                       "%jd extbytes\n",
+                       root_volume->ondisk->vol_name,
+                       (intmax_t)ext_offset,
+                       (intmax_t)ext_bytes);
+       }
+
+       /*
+        * Scan the REDO forwards through the entire extended range.
+        * Anything with a previously recorded matching TERM is discarded.
+        */
+       scan_offset = ext_offset;
+       bytes += ext_bytes;
+
+       /*
+        * NOTE: when doing a forward scan the returned scan_offset is
+        *       for the record following the returned record, so we
+        *       have to play a bit.
+        */
+       while ((int64_t)bytes > 0) {
                KKASSERT(scan_offset != last_offset);
 
+               oscan_offset = scan_offset;
                head = hammer_recover_scan_fwd(hmp, root_volume,
                                               &scan_offset, &error, &buffer);
                if (error)
                        break;
 
-#if 0
-               error = hammer_recover_redo(hmp, root_volume, &head->redo);
-#endif
+               error = hammer_recover_redo_run(hmp, &rterm_root,
+                                               oscan_offset, &head->redo);
                if (error) {
                        kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
                                root_volume->ondisk->vol_name,
@@ -431,11 +697,30 @@ hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
                }
                bytes -= head->head.hdr_size;
        }
+       KKASSERT(error || bytes == 0);
        if (buffer) {
                hammer_rel_buffer(buffer, 0);
                buffer = NULL;
        }
 
+done:
+       /*
+        * Cleanup rterm tree
+        */
+       {
+               hammer_rterm_t rterm;
+               hammer_rterm_entry_t rte;
+
+               while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
+                       RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
+                       while ((rte = rterm->term_list) != NULL) {
+                               rterm->term_list = rte->next;
+                               kfree(rte, hmp->m_misc);
+                       }
+                       kfree(rterm, hmp->m_misc);
+               }
+       }
+
        /*
         * Finish up flushing (or discarding) recovered buffers by executing
         * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
@@ -449,8 +734,12 @@ hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
                        hmp->hflags |= HMNT_UNDO_DIRTY;
                hammer_flusher_sync(hmp);
        }
-       kprintf("HAMMER(%s) End redo recovery\n",
-               root_volume->ondisk->vol_name);
+fatal:
+       hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
+       if (verbose) {
+               kprintf("HAMMER(%s) End redo recovery\n",
+                       root_volume->ondisk->vol_name);
+       }
        return (error);
 }
 
@@ -741,11 +1030,8 @@ hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
         * Only process UNDO records.  Flag if we find other records to
         * optimize stage2 recovery.
         */
-       if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO) {
-               if (undo->head.hdr_type == HAMMER_HEAD_TYPE_REDO)
-                       hmp->hflags |= HMNT_HASREDO;
+       if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
                return(0);
-       }
 
        /*
         * Validate the UNDO record.
@@ -855,6 +1141,229 @@ hammer_recover_copy_undo(hammer_off_t undo_offset,
        bcopy(src, dst, bytes);
 }
 
+/*
+ * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
+ * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
+ * does not include the nominal UNDO range, just the extended range.
+ */
+int
+hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
+                       hammer_off_t scan_offset, hammer_fifo_redo_t redo)
+{
+       hammer_rterm_t rterm;
+       hammer_rterm_t nrterm;
+       hammer_rterm_entry_t rte;
+
+       if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
+               return(0);
+       if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
+           redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
+               return(0);
+       }
+
+       nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
+       nrterm->redo_objid = redo->redo_objid;
+       nrterm->redo_localization = redo->redo_localization;
+       nrterm->redo_flags = redo->redo_flags;
+       nrterm->redo_offset = redo->redo_offset;
+
+       rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
+       if (rterm)
+               kfree(nrterm, hmp->m_misc);
+       else
+               rterm = nrterm;
+
+       kprintf("record record %016jx objid %016jx offset %016jx flags %08x\n",
+               (intmax_t)scan_offset,
+               (intmax_t)redo->redo_objid,
+               (intmax_t)redo->redo_offset,
+               (int)redo->redo_flags);
+
+       /*
+        * Scan in reverse order, rte prepended, so the rte list will be
+        * in forward order.
+        */
+       rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
+       rte->fifo_offset = scan_offset;
+       rte->next = rterm->term_list;
+       rterm->term_list = rte;
+
+       return(0);
+}
+
+/*
+ * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
+ * the forwards scan of the entire extended UNDO/REDO FIFO range.
+ *
+ * Records matching previously recorded TERMs have already been committed
+ * and are ignored.
+ */
+int
+hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
+                       hammer_off_t scan_offset, hammer_fifo_redo_t redo)
+{
+       struct hammer_rterm rtval;
+       hammer_rterm_t rterm;
+       hammer_rterm_entry_t rte;
+
+       if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
+               return(0);
+
+       switch(redo->redo_flags) {
+       case HAMMER_REDO_WRITE:
+       case HAMMER_REDO_TRUNC:
+               /*
+                * We hit a REDO request.  The REDO request is only executed
+                * if there is no matching TERM.
+                */
+               bzero(&rtval, sizeof(rtval));
+               rtval.redo_objid = redo->redo_objid;
+               rtval.redo_localization = redo->redo_localization;
+               rtval.redo_offset = redo->redo_offset;
+               rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
+                                  HAMMER_REDO_TERM_WRITE :
+                                  HAMMER_REDO_TERM_TRUNC;
+
+               rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
+               if (rterm) {
+                       kprintf("ignore record %016jx objid %016jx "
+                               "offset %016jx flags %08x\n",
+                               (intmax_t)scan_offset,
+                               (intmax_t)redo->redo_objid,
+                               (intmax_t)redo->redo_offset,
+                               (int)redo->redo_flags);
+
+                       break;
+               }
+               kprintf("run    record %016jx objid %016jx "
+                       "offset %016jx flags %08x\n",
+                       (intmax_t)scan_offset,
+                       (intmax_t)redo->redo_objid,
+                       (intmax_t)redo->redo_offset,
+                       (int)redo->redo_flags);
+
+               /*
+                * Redo stage2 can access a live filesystem, acquire the
+                * vnode.
+                */
+               hammer_recover_redo_exec(hmp, redo);
+               break;
+       case HAMMER_REDO_TERM_WRITE:
+       case HAMMER_REDO_TERM_TRUNC:
+               /*
+                * As we encounter TERMs in the forward scan we remove
+                * them.  Once the forward scan hits the nominal undo range
+                * there will be no more recorded TERMs.
+                */
+               bzero(&rtval, sizeof(rtval));
+               rtval.redo_objid = redo->redo_objid;
+               rtval.redo_localization = redo->redo_localization;
+               rtval.redo_flags = redo->redo_flags;
+               rtval.redo_offset = redo->redo_offset;
+
+               rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
+               if (rterm) {
+                       if ((rte = rterm->term_list) != NULL) {
+                               KKASSERT(rte->fifo_offset == scan_offset);
+                               rterm->term_list = rte->next;
+                               kfree(rte, hmp->m_misc);
+                       }
+               }
+               break;
+       }
+       return(0);
+}
+
+static void
+hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
+{
+       struct hammer_transaction trans;
+       struct vattr va;
+       struct hammer_inode *ip;
+       struct vnode *vp = NULL;
+       int error;
+
+       hammer_start_transaction(&trans, hmp);
+
+       ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
+                             HAMMER_MAX_TID, redo->redo_localization,
+                             0, &error);
+       if (ip == NULL) {
+               kprintf("unable to find objid %016jx lo %08x\n",
+                       (intmax_t)redo->redo_objid, redo->redo_localization);
+               goto done2;
+       }
+       error = hammer_get_vnode(ip, &vp);
+       if (error) {
+               kprintf("unable to acquire vnode for %016jx lo %08x\n",
+                       (intmax_t)redo->redo_objid, redo->redo_localization);
+               goto done1;
+       }
+
+       switch(redo->redo_flags) {
+       case HAMMER_REDO_WRITE:
+               error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
+               if (error) {
+                       kprintf("vn_rdwr open returned %d\n", error);
+                       break;
+               }
+               vn_unlock(vp);
+               error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
+                               redo->redo_data_bytes,
+                               redo->redo_offset, UIO_SYSSPACE,
+                               0, proc0.p_ucred, NULL);
+               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+               if (error)
+                       kprintf("vn_rdwr write returned %d\n", error);
+               VOP_CLOSE(vp, FREAD|FWRITE);
+               break;
+       case HAMMER_REDO_TRUNC:
+               kprintf("setattr offset %016jx error %d\n",
+                       (intmax_t)redo->redo_offset, error);
+               VATTR_NULL(&va);
+               va.va_size = redo->redo_offset;
+               error = VOP_SETATTR(vp, &va, proc0.p_ucred);
+               if (error)
+                       kprintf("stattr returned %d\n", error);
+               break;
+       }
+       vput(vp);
+done1:
+       hammer_rel_inode(ip, 0);
+done2:
+       hammer_done_transaction(&trans);
+}
+
+/*
+ * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
+ * the offset.
+ *
+ * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
+ */
+static int
+hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
+{
+       if (rt1->redo_objid < rt2->redo_objid)
+               return(-1);
+       if (rt1->redo_objid > rt2->redo_objid)
+               return(1);
+       if (rt1->redo_localization < rt2->redo_localization)
+               return(-1);
+       if (rt1->redo_localization > rt2->redo_localization)
+               return(1);
+       if (rt1->redo_flags < rt2->redo_flags)
+               return(-1);
+       if (rt1->redo_flags > rt2->redo_flags)
+               return(1);
+       if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
+               if (rt1->redo_offset < rt2->redo_offset)
+                       return(-1);
+               if (rt1->redo_offset > rt2->redo_offset)
+                       return(1);
+       }
+       return(0);
+}
+
 #if 0
 
 static void
index c50792e..b145d44 100644 (file)
@@ -175,6 +175,7 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
                 */
                if (ip) {
                        redo->redo_objid = ip->obj_id;
+                       redo->redo_localization = ip->obj_localization;
                        if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
                                ip->redo_fifo_start = next_offset;
                                if (RB_INSERT(hammer_redo_rb_tree,
@@ -189,6 +190,7 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
                                ip->redo_fifo_next = next_offset;
                } else {
                        redo->redo_objid = 0;
+                       redo->redo_localization = 0;
                }
 
                /*
@@ -283,6 +285,16 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
 
        if (buffer)
                hammer_rel_buffer(buffer, 0);
+
+       /*
+        * Make sure the nominal undo span contains at least one REDO_SYNC,
+        * otherwise the REDO recovery will not be triggered.
+        */
+       if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
+           flags != HAMMER_REDO_SYNC) {
+               hammer_generate_redo_sync(trans);
+       }
+
        return(error);
 }
 
@@ -294,20 +306,34 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
  * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
  * for all inodes with active REDOs.  This changes dynamically as inodes
  * get flushed.
+ *
+ * During recovery stage2 any new flush cycles must specify the original
+ * redo sync offset.  That way a crash will re-run the REDOs, at least
+ * up to the point where the UNDO FIFO does not overwrite the area.
  */
 void
 hammer_generate_redo_sync(hammer_transaction_t trans)
 {
        hammer_mount_t hmp = trans->hmp;
        hammer_inode_t ip;
+       hammer_off_t redo_fifo_start;
 
-       ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
-       if (ip) {
+       if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
+               ip = NULL;
+               redo_fifo_start = hmp->recover_stage2_offset;
+       } else {
+               ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
+               if (ip)
+                       redo_fifo_start = ip->redo_fifo_start;
+               else
+                       redo_fifo_start = 0;
+       }
+       if (redo_fifo_start) {
                if (hammer_debug_io & 0x0004) {
                        kprintf("SYNC IP %p %016jx\n",
-                               ip, (uintmax_t)ip->redo_fifo_start);
+                               ip, (intmax_t)redo_fifo_start);
                }
-               hammer_generate_redo(trans, NULL, ip->redo_fifo_start,
+               hammer_generate_redo(trans, NULL, redo_fifo_start,
                                     HAMMER_REDO_SYNC, NULL, 0);
                trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
        }
index 77691a5..f34e8aa 100644 (file)
@@ -275,7 +275,8 @@ mode1:
                 * We weren't running REDOs before now so we have to fall
                 * through and do a full fsync of what we have.
                 */
-               if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
+               if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
+                   (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
                        ip->flags |= HAMMER_INODE_REDO;
                        ip->redo_count = 0;
                }
@@ -2144,9 +2145,17 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                                break;
 
                        /*
-                        * Log the operation if in fast-fsync mode.
+                        * Log the operation if in fast-fsync mode or if
+                        * there are unterminated redo write records present.
+                        *
+                        * The second check is needed so the recovery code
+                        * properly truncates write redos even if nominal
+                        * REDO operation is turned off due to excessive
+                        * writes, because the related records might be
+                        * destroyed and never lay down a TERM_WRITE.
                         */
-                       if (ip->flags & HAMMER_INODE_REDO) {
+                       if ((ip->flags & HAMMER_INODE_REDO) ||
+                           (ip->flags & HAMMER_INODE_RDIRTY)) {
                                error = hammer_generate_redo(&trans, ip,
                                                             vap->va_size,
                                                             HAMMER_REDO_TRUNC,