HAMMER VFS - REDO implementation base code part 2/many
author  Matthew Dillon <dillon@apollo.backplane.com>
Tue, 12 Jan 2010 00:09:51 +0000 (16:09 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
Tue, 12 Jan 2010 00:15:58 +0000 (16:15 -0800)
* Move hammer_generate_redo() to its own source file, hammer_redo.c

* Fix a bug in REDO generation.  The tail type was not set the same
  as the head type, which caused recoveries to fail.

* Flesh out the REDO sequencing by adding REDO_TERM_* records during the
  meta-data flush, allowing REDO_WRITEs and REDO_TRUNCs to be matched
  against REDO_TERM_WRITEs and REDO_TERM_TRUNCs (see the sketch below).

* Interlock the writing of the root volume in the flusher.  Frontend
  code modifies the root volume when generating REDO records and
  can collide with the flusher.
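
A minimal user-space sketch of the WRITE/TERM matching described above;
all types and names here are hypothetical, not the actual HAMMER
recovery code:

    /*
     * Illustrative only: an out-of-span REDO_TERM_WRITE cancels any
     * matching earlier REDO_WRITE so that it is not replayed.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define REDO_WRITE      0x00000001
    #define REDO_TERM_WRITE 0x00000004

    struct rec {
            int64_t         objid;          /* file being written */
            uint64_t        off;            /* logical file offset */
            int32_t         bytes;          /* payload size */
            uint32_t        flags;
            int             cancelled;
    };

    int
    main(void)
    {
            struct rec log[] = {
                    { 100, 0,    4096, REDO_WRITE, 0 },
                    { 100, 4096, 4096, REDO_WRITE, 0 },
                    { 100, 0,    4096, REDO_TERM_WRITE, 0 },
            };
            int n = sizeof(log) / sizeof(log[0]);
            int i;
            int j;

            /* forward scan: a TERM cancels matching earlier WRITEs */
            for (i = 0; i < n; ++i) {
                    if ((log[i].flags & REDO_TERM_WRITE) == 0)
                            continue;
                    for (j = 0; j < i; ++j) {
                            if ((log[j].flags & REDO_WRITE) &&
                                log[j].objid == log[i].objid &&
                                log[j].off == log[i].off &&
                                log[j].bytes == log[i].bytes)
                                    log[j].cancelled = 1;
                    }
            }
            for (i = 0; i < n; ++i) {
                    if ((log[i].flags & REDO_WRITE) && !log[i].cancelled)
                            printf("replay objid=%lld off=%llu\n",
                                   (long long)log[i].objid,
                                   (unsigned long long)log[i].off);
            }
            return(0);
    }

Only the second WRITE survives and is replayed; the first is filtered
out by its matching TERM.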

sys/conf/files
sys/vfs/hammer/Makefile
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_disk.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_redo.c [new file with mode: 0644]
sys/vfs/hammer/hammer_undo.c
sys/vfs/hammer/hammer_vnops.c

index 1ff75a2..5137fdb 100644 (file)
@@ -1422,6 +1422,7 @@ vfs/hammer/hammer_signal.c        optional hammer
 vfs/hammer/hammer_subs.c       optional hammer
 vfs/hammer/hammer_transaction.c        optional hammer
 vfs/hammer/hammer_undo.c       optional hammer
+vfs/hammer/hammer_redo.c       optional hammer
 vfs/hammer/hammer_vfsops.c     optional hammer
 vfs/hammer/hammer_vnops.c      optional hammer
 vm/default_pager.c             standard
index 2b9d568..d548fef 100644 (file)
@@ -7,6 +7,7 @@ SRCS=   hammer_vfsops.c hammer_vnops.c hammer_inode.c \
        hammer_cursor.c hammer_btree.c hammer_transaction.c \
        hammer_object.c hammer_recover.c hammer_ioctl.c \
        hammer_blockmap.c hammer_freemap.c hammer_undo.c \
+       hammer_redo.c \
        hammer_reblock.c hammer_rebalance.c \
        hammer_flusher.c hammer_mirror.c \
        hammer_pfs.c hammer_prune.c hammer_volume.c
index 528b8b1..0db6d68 100644 (file)
@@ -342,6 +342,16 @@ typedef struct hammer_inode *hammer_inode_t;
  * NOTE: DDIRTY does not include atime or mtime and does not include
  *      write-append size changes.  SDIRTY handles write-append size
  *      changes.
+ *
+ *      REDO indicates that REDO logging is active, creating a definitive
+ *      stream of REDO records in the UNDO/REDO log for writes and
+ *      truncations, including boundary records when/if REDO is turned off.
+ *      REDO is typically enabled by fsync() and turned off if excessive
+ *      writes occur without an intervening fsync().
+ *
+ *      RDIRTY indicates that REDO records were laid down in the UNDO/REDO
+ *      FIFO (even if REDO is turned off some might still be active) and
+ *      are still being tracked for this inode.  See hammer_redo.c.
  */
                                        /* (not including atime/mtime) */
 #define HAMMER_INODE_DDIRTY    0x0001  /* in-memory ino_data is dirty */
@@ -368,6 +378,8 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_WOULDBLOCK 0x00400000 /* re-issue to new flush group */
 #define HAMMER_INODE_DUMMY     0x00800000 /* dummy inode covering bad file */
 #define HAMMER_INODE_SDIRTY    0x01000000 /* in-memory ino_data.size is dirty*/
+#define HAMMER_INODE_REDO      0x02000000 /* REDO logging active */
+#define HAMMER_INODE_RDIRTY    0x04000000 /* REDO records active in fifo */
 
 #define HAMMER_INODE_MODMASK   (HAMMER_INODE_DDIRTY|HAMMER_INODE_SDIRTY|   \
                                 HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS|     \
@@ -456,6 +468,7 @@ typedef struct hammer_record *hammer_record_t;
 #define HAMMER_RECF_DIRECT_IO          0x0200  /* related direct I/O running*/
 #define HAMMER_RECF_DIRECT_WAIT                0x0400  /* related direct I/O running*/
 #define HAMMER_RECF_DIRECT_INVAL       0x0800  /* buffer alias invalidation */
+#define HAMMER_RECF_REDO               0x1000  /* REDO was laid down */
 
 /*
  * hammer_create_at_cursor() and hammer_delete_at_cursor() flags.
@@ -816,6 +829,7 @@ typedef struct hammer_mount *hammer_mount_t;
 
 #define HAMMER_MOUNT_CRITICAL_ERROR    0x0001
 #define HAMMER_MOUNT_FLUSH_RECOVERY    0x0002
+#define HAMMER_MOUNT_REDO_SYNC         0x0004
 
 struct hammer_sync_info {
        int error;
@@ -1117,7 +1131,10 @@ void *hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
 int hammer_generate_undo(hammer_transaction_t trans,
                        hammer_off_t zone1_offset, void *base, int len);
 int hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
-                       hammer_off_t file_offset, void *base, int len);
+                       hammer_off_t file_offset, u_int32_t flags,
+                       void *base, int len);
+void hammer_generate_redo_sync(hammer_transaction_t trans);
+void hammer_format_undo(void *base, u_int32_t seqno);
 int hammer_upgrade_undo_4(hammer_transaction_t trans);
 
 void hammer_put_volume(struct hammer_volume *volume, int flush);
index 9e6afae..7de627a 100644 (file)
@@ -440,7 +440,7 @@ typedef struct hammer_fifo_tail *hammer_fifo_tail_t;
 /*
  * Misc FIFO structures.
  *
- * NOTE: redo records are for version 4+ filesystems.
+ * UNDO - Raw meta-data media updates.
  */
 struct hammer_fifo_undo {
        struct hammer_fifo_head head;
@@ -450,15 +450,59 @@ struct hammer_fifo_undo {
        /* followed by data */
 };
 
+/*
+ * REDO (HAMMER version 4+) - Logical file writes/truncates.
+ *
+ * REDOs contain information which will be duplicated in a later meta-data
+ * update, allowing fast write()+fsync() operations.  REDOs can be ignored
+ * without harming filesystem integrity but must be processed if fsync()
+ * semantics are desired.
+ *
+ * Unlike UNDOs which are processed backwards within the recovery span,
+ * REDOs must be processed forwards, starting further back (outside
+ * the recovery span).
+ *
+ *     WRITE   - Write logical file (with payload).  Executed both
+ *               out-of-span and in-span.  Out-of-span WRITEs may be
+ *               filtered out by TERMs.
+ *
+ *     TRUNC   - Truncate logical file (no payload).  Executed both
+ *               out-of-span and in-span.  Out-of-span TRUNCs may be
+ *               filtered out by TERMs.
+ *
+ *     TERM_*  - Indicates meta-data was committed (if out-of-span) or
+ *               will be rolled-back (in-span).  Any out-of-span TERMs
+ *               matching earlier WRITEs remove those WRITEs from
+ *               consideration as they might conflict with a later data
+ *               commit (which is not being rolled-back).
+ *
+ *     SYNC    - The earliest in-span SYNC (the last one when scanning
+ *               backwards) tells the recovery code how far out-of-span
+ *               it must go to run REDOs.
+ *
+ * NOTE: WRITEs do not always have matching TERMs even under
+ *      perfect conditions because truncations might remove the
+ *      buffers from consideration.  I/O problems can also remove
+ *      buffers from consideration.
+ *
+ *      TRUNCs do not always have matching TERMs because several
+ *      truncations may be aggregated together into a single TERM.
+ */
 struct hammer_fifo_redo {
        struct hammer_fifo_head head;
        int64_t                 redo_objid;     /* file being written */
        hammer_off_t            redo_offset;    /* logical offset in file */
        int32_t                 redo_data_bytes;
-       int32_t                 redo_reserved01;
+       u_int32_t               redo_flags;
        u_int64_t               redo_mtime;     /* set mtime */
 };
 
+#define HAMMER_REDO_WRITE      0x00000001
+#define HAMMER_REDO_TRUNC      0x00000002
+#define HAMMER_REDO_TERM_WRITE 0x00000004
+#define HAMMER_REDO_TERM_TRUNC 0x00000008
+#define HAMMER_REDO_SYNC       0x00000010
+
 union hammer_fifo_any {
        struct hammer_fifo_head head;
        struct hammer_fifo_undo undo;
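
A hedged sketch of discriminating these records by redo_flags
(hypothetical helper name; the real consumer is the version-4 recovery
code):

    /* Sketch: name a REDO record type from its redo_flags field. */
    static const char *
    hammer_redo_flags_name(u_int32_t flags)
    {
            switch (flags) {
            case HAMMER_REDO_WRITE:         return("WRITE");   /* payload */
            case HAMMER_REDO_TRUNC:         return("TRUNC");   /* no payload */
            case HAMMER_REDO_TERM_WRITE:    return("TERM_WRITE");
            case HAMMER_REDO_TERM_TRUNC:    return("TERM_TRUNC");
            case HAMMER_REDO_SYNC:          return("SYNC");
            default:                        return("unknown");
            }
    }
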
index 8b98e59..130a529 100644 (file)
@@ -694,6 +694,11 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
         *
         * vol0_last_tid is the highest fully-synchronized TID.  It is
         * set-up when the UNDO fifo is fully synced, later on (not here).
+        *
+        * The root volume can be open for modification by other threads
+        * generating UNDO or REDO records (for example reblocking,
+        * pruning, and REDO-mode fast fsyncs), so the write interlock
+        * is mandatory.
         */
        if (root_volume->io.modified) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
@@ -701,7 +706,9 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
                        root_volume->ondisk->vol0_next_tid = trans->tid;
                hammer_crc_set_volume(root_volume->ondisk);
                hammer_modify_volume_done(root_volume);
+               hammer_io_write_interlock(&root_volume->io);
                hammer_io_flush(&root_volume->io, 0);
+               hammer_io_done_interlock(&root_volume->io);
        }
 
        /*
@@ -773,6 +780,13 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
                        wakeup(&hmp->flush_tid1);
                }
                hmp->flush_tid2 = trans->tid;
+
+               /*
+                * Clear the REDO SYNC flag.  This flag is used to ensure
+                * that the recovery span in the UNDO/REDO FIFO contains
+                * at least one REDO SYNC record.
+                */
+               hmp->flags &= ~HAMMER_MOUNT_REDO_SYNC;
        }
 
        /*
index af18c29..9223857 100644 (file)
@@ -407,7 +407,6 @@ loop:
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;
-       ip->redo_count = SIZE_T_MAX;
        if (hmp->ronly)
                ip->flags |= HAMMER_INODE_RO;
        ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
@@ -590,7 +589,6 @@ loop:
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;
-       ip->redo_count = SIZE_T_MAX;
        ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
                0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
@@ -720,7 +718,6 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;
-       ip->redo_count = SIZE_T_MAX;
 
        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
        /* ip->save_trunc_off = 0; (already zero) */
@@ -2306,7 +2303,7 @@ hammer_flush_inode_done(hammer_inode_t ip, int error)
 
        /*
         * The backend may have adjusted nlinks, so if the adjusted nlinks
-        * does not match the fronttend set the frontend's RDIRTY flag again.
+        * does not match the frontend's, set the frontend's DDIRTY flag again.
         */
        if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
                ip->flags |= HAMMER_INODE_DDIRTY;
@@ -2567,6 +2564,28 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
                record->leaf.base.create_tid = trans->tid;
                record->leaf.create_ts = trans->time32;
        }
+
+       /*
+        * This actually moves the record to the on-media B-Tree.  We
+        * must also generate REDO_TERM entries in the UNDO/REDO FIFO
+        * indicating that the related REDO_WRITE(s) have been committed.
+        *
+        * During recovery any REDO_TERM's within the nominal recovery span
+        * are ignored since the related meta-data is being undone, causing
+        * any matching REDO_WRITEs to execute.  The REDO_TERMs outside
+        * the nominal recovery span will match against REDO_WRITEs and
+        * prevent them from being executed (because the meta-data has
+        * already been synchronized).
+        */
+       if (record->flags & HAMMER_RECF_REDO) {
+               KKASSERT(record->type == HAMMER_MEM_RECORD_DATA);
+               hammer_generate_redo(trans, record->ip,
+                                    record->leaf.base.key -
+                                        record->leaf.data_len,
+                                    HAMMER_REDO_TERM_WRITE,
+                                    NULL,
+                                    record->leaf.data_len);
+       }
        for (;;) {
                error = hammer_ip_sync_record_cursor(cursor, record);
                if (error != EDEADLK)
@@ -2626,7 +2645,7 @@ hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
        /*
         * Any directory records referencing this inode which are not in
         * our current flush group must adjust our nlink count for the
-        * purposes of synchronization to disk.
+        * purposes of synchronizing to disk.
         *
         * Records which are in our flush group can be unlinked from our
         * inode now, potentially allowing the inode to be physically
@@ -2732,8 +2751,22 @@ hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
                        goto done;
 
                /*
+                * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO.
+                *
+                * XXX we do this even if we did not previously generate
+                * a REDO_TRUNC record.  This operation may enclose the
+                * range for multiple prior truncation entries in the REDO
+                * log.
+                */
+               if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR) {
+                       hammer_generate_redo(trans, ip, aligned_trunc_off,
+                                            HAMMER_REDO_TERM_TRUNC,
+                                            NULL, 0);
+               }
+
+               /*
                 * Clear the truncation flag on the backend after we have
-                * complete the deletions.  Backend data is now good again
+                * completed the deletions.  Backend data is now good again
                 * (including new records we are about to sync, below).
                 *
                 * Leave sync_trunc_off intact.  As we write additional
@@ -2805,7 +2838,7 @@ hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
                        /*
                         * Set delete_tid in both the frontend and backend
                         * copy of the inode record.  The DELETED flag handles
-                        * this, do not set RDIRTY.
+                        * this, do not set DDIRTY.
                         */
                        ip->ino_leaf.base.delete_tid = trans->tid;
                        ip->sync_ino_leaf.base.delete_tid = trans->tid;
@@ -2896,7 +2929,7 @@ defer_buffer_flush:
        }
 
        /*
-        * If RDIRTY, DDIRTY, or SDIRTY is set, write out a new record.
+        * If DDIRTY or SDIRTY is set, write out a new record.
         * If the inode is already on-disk the old record is marked as
         * deleted.
         *
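
The offset passed to hammer_generate_redo() in the hunk above is
derived from the B-Tree leaf: for HAMMER data records leaf.base.key is
the file offset just past the end of the data, so key minus data_len
recovers the starting offset.  A stand-alone worked example:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
            int64_t key = 65536;            /* leaf.base.key: end offset */
            int32_t data_len = 16384;       /* leaf.data_len */
            int64_t file_offset = key - data_len;

            /* the REDO_TERM_WRITE covers [49152, 65536) */
            assert(file_offset == 49152);
            return(0);
    }
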
diff --git a/sys/vfs/hammer/hammer_redo.c b/sys/vfs/hammer/hammer_redo.c
new file mode 100644 (file)
index 0000000..06317c9
--- /dev/null
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@backplane.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
+ *
+ * See also hammer_undo.c
+ */
+
+#include "hammer.h"
+
+/*
+ * HAMMER version 4+ REDO support.
+ *
+ * REDO records are used to improve fsync() performance.  Instead of having
+ * to go through a complete double-flush cycle involving at least two disk
+ * synchronizations the fsync need only flush UNDO/REDO FIFO buffers through
+ * the related REDO records, which is a single synchronization requiring
+ * no track seeking.  If a recovery becomes necessary the recovery code
+ * will generate logical data writes based on the REDO records encountered.
+ * That is, the recovery code will UNDO any partial meta-data/data writes
+ * at the raw disk block level and then REDO the data writes at the logical
+ * level.
+ */
+int
+hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
+                    hammer_off_t file_off, u_int32_t flags,
+                    void *base, int len)
+{
+       hammer_mount_t hmp;
+       hammer_volume_t root_volume;
+       hammer_blockmap_t undomap;
+       hammer_buffer_t buffer = NULL;
+       hammer_fifo_redo_t redo;
+       hammer_fifo_tail_t tail;
+       hammer_off_t next_offset;
+       int error;
+       int bytes;
+       int n;
+
+       /*
+        * Setup
+        */
+       hmp = trans->hmp;
+
+       root_volume = trans->rootvol;
+       undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
+
+       /*
+        * No undo recursion when modifying the root volume
+        */
+       hammer_modify_volume(NULL, root_volume, NULL, 0);
+       hammer_lock_ex(&hmp->undo_lock);
+
+       /* undo had better not roll over (loose test) */
+       if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
+               panic("hammer: insufficient undo FIFO space!");
+
+       /*
+        * Loop until the undo for the entire range has been laid down.
+        */
+       while (len) {
+               /*
+                * Fetch the layout offset in the UNDO FIFO, wrap it as
+                * necessary.
+                */
+               if (undomap->next_offset == undomap->alloc_offset) {
+                       undomap->next_offset =
+                               HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
+               }
+               next_offset = undomap->next_offset;
+
+               /*
+                * This is a tail-chasing FIFO, when we hit the start of a new
+                * buffer we don't have to read it in.
+                */
+               if ((next_offset & HAMMER_BUFMASK) == 0) {
+                       redo = hammer_bnew(hmp, next_offset, &error, &buffer);
+                       hammer_format_undo(redo, hmp->undo_seqno ^ 0x40000000);
+               } else {
+                       redo = hammer_bread(hmp, next_offset, &error, &buffer);
+               }
+               if (error)
+                       break;
+               hammer_modify_buffer(NULL, buffer, NULL, 0);
+
+               /*
+                * Calculate how big a media structure fits up to the next
+                * alignment point and how large a data payload we can
+                * accommodate.
+                *
+                * If n calculates to 0 or negative there is no room for
+                * anything but a PAD.
+                */
+               bytes = HAMMER_UNDO_ALIGN -
+                       ((int)next_offset & HAMMER_UNDO_MASK);
+               n = bytes -
+                   (int)sizeof(struct hammer_fifo_redo) -
+                   (int)sizeof(struct hammer_fifo_tail);
+
+               /*
+                * If available space is insufficient for any payload
+                * we have to lay down a PAD.
+                *
+                * The minimum PAD is 8 bytes and the head and tail will
+                * overlap each other in that case.  PADs do not have
+                * sequence numbers or CRCs.
+                *
+                * A PAD may not start on a boundary.  That is, every
+                * 512-byte block in the UNDO/REDO FIFO must begin with
+                * a record containing a sequence number.
+                */
+               if (n <= 0) {
+                       KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
+                       KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
+                       tail = (void *)((char *)redo + bytes - sizeof(*tail));
+                       if ((void *)redo != (void *)tail) {
+                               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+                               tail->tail_type = HAMMER_HEAD_TYPE_PAD;
+                               tail->tail_size = bytes;
+                       }
+                       redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
+                       redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
+                       redo->head.hdr_size = bytes;
+                       /* NO CRC OR SEQ NO */
+                       undomap->next_offset += bytes;
+                       hammer_modify_buffer_done(buffer);
+                       hammer_stats_redo += bytes;
+                       continue;
+               }
+
+               /*
+                * Calculate the actual payload and recalculate the size
+                * of the media structure as necessary.  If no data buffer
+                * is supplied there is no payload.
+                */
+               if (base == NULL) {
+                       n = 0;
+               } else if (n > len) {
+                       n = len;
+               }
+               bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
+                        ~HAMMER_HEAD_ALIGN_MASK) +
+                       (int)sizeof(struct hammer_fifo_redo) +
+                       (int)sizeof(struct hammer_fifo_tail);
+               if (hammer_debug_general & 0x0080) {
+                       kprintf("redo %016llx %d %d\n",
+                               (long long)next_offset, bytes, n);
+               }
+
+               redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
+               redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
+               redo->head.hdr_size = bytes;
+               redo->head.hdr_seq = hmp->undo_seqno++;
+               redo->head.hdr_crc = 0;
+               if (ip)
+                       redo->redo_objid = ip->obj_id;
+               else
+                       redo->redo_objid = 0;   /* e.g. ip-less SYNC records */
+               redo->redo_mtime = trans->time;
+               redo->redo_offset = file_off;
+               redo->redo_flags = flags;
+
+               /*
+                * Incremental payload.  If no payload we throw the entire
+                * len into redo_data_bytes and will not loop.
+                */
+               if (base) {
+                       redo->redo_data_bytes = n;
+                       bcopy(base, redo + 1, n);
+                       len -= n;
+                       base = (char *)base + n;
+                       file_off += n;
+               } else {
+                       redo->redo_data_bytes = len;
+                       file_off += len;
+                       len = 0;
+               }
+
+               tail = (void *)((char *)redo + bytes - sizeof(*tail));
+               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+               tail->tail_type = HAMMER_HEAD_TYPE_REDO;
+               tail->tail_size = bytes;
+
+               KKASSERT(bytes >= sizeof(redo->head));
+               redo->head.hdr_crc = crc32(redo, HAMMER_FIFO_HEAD_CRCOFF) ^
+                            crc32(&redo->head + 1, bytes - sizeof(redo->head));
+               undomap->next_offset += bytes;
+               hammer_stats_redo += bytes;
+
+               /*
+                * Before we finish off the buffer we have to deal with any
+                * junk between the end of the media structure we just laid
+                * down and the UNDO alignment boundary.  We do this by laying
+                * down a dummy PAD.  Even though we will probably overwrite
+                * it almost immediately we have to do this so recovery runs
+                * can iterate the UNDO space without having to depend on
+                * the indices in the volume header.
+                *
+                * This dummy PAD will be overwritten on the next undo so
+                * we do not adjust undomap->next_offset.
+                */
+               bytes = HAMMER_UNDO_ALIGN -
+                       ((int)undomap->next_offset & HAMMER_UNDO_MASK);
+               if (bytes != HAMMER_UNDO_ALIGN) {
+                       KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
+                       redo = (void *)(tail + 1);
+                       tail = (void *)((char *)redo + bytes - sizeof(*tail));
+                       if ((void *)redo != (void *)tail) {
+                               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
+                               tail->tail_type = HAMMER_HEAD_TYPE_PAD;
+                               tail->tail_size = bytes;
+                       }
+                       redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
+                       redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
+                       redo->head.hdr_size = bytes;
+                       /* NO CRC OR SEQ NO */
+               }
+               hammer_modify_buffer_done(buffer);
+       }
+       hammer_modify_volume_done(root_volume);
+       hammer_unlock(&hmp->undo_lock);
+
+       if (buffer)
+               hammer_rel_buffer(buffer, 0);
+       return(error);
+}
+
+/*
+ * Generate a REDO SYNC record.  At least one such record must be generated
+ * in the nominal recovery span for the recovery code to be able to run
+ * REDOs outside of the span.
+ */
+void
+hammer_generate_redo_sync(hammer_transaction_t trans)
+{
+#if 0
+       hammer_generate_redo(trans, NULL, 0, HAMMER_REDO_SYNC, NULL, 0);
+#endif
+       trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
+}
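
The bytes/n arithmetic in the loop above is easiest to follow with
concrete numbers.  A stand-alone sketch, assuming HAMMER_UNDO_ALIGN is
512 (per the 512-byte-block comment in the code) and using mocked
structure sizes:

    #include <stdio.h>

    #define UNDO_ALIGN      512     /* assumed HAMMER_UNDO_ALIGN */
    #define UNDO_MASK       (UNDO_ALIGN - 1)
    #define REDO_HEAD_SIZE  32      /* mock sizeof(struct hammer_fifo_redo) */
    #define TAIL_SIZE       8       /* mock sizeof(struct hammer_fifo_tail) */

    int
    main(void)
    {
            /* 32 bytes short of the next 512-byte alignment point */
            unsigned next_offset = 3 * UNDO_ALIGN + 480;
            int bytes = UNDO_ALIGN - ((int)next_offset & UNDO_MASK);
            int n = bytes - REDO_HEAD_SIZE - TAIL_SIZE;

            if (n <= 0) {
                    /* no room for payload: lay down a PAD of 'bytes' */
                    printf("PAD %d bytes\n", bytes);    /* prints 32 */
            } else {
                    /* up to n payload bytes fit before the boundary */
                    printf("payload room: %d bytes\n", n);
            }
            return(0);
    }
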
index 14582b7..e7320cc 100644 (file)
@@ -41,7 +41,6 @@
 #include "hammer.h"
 
 static int hammer_und_rb_compare(hammer_undo_t node1, hammer_undo_t node2);
-static void hammer_format_undo(void *base, u_int32_t seqno);
 
 RB_GENERATE2(hammer_und_rb_tree, hammer_undo, rb_node,
              hammer_und_rb_compare, hammer_off_t, offset);
@@ -85,6 +84,8 @@ hammer_undo_lookup(hammer_mount_t hmp, hammer_off_t zone3_off, int *errorp)
  * will be laid down for any unused space.  UNDO FIFO media structures
  * will implement the hdr_seq field (it used to be reserved01), and
  * both flush and recovery mechanics will be very different.
+ *
+ * WARNING!  See also hammer_generate_redo() in hammer_redo.c
  */
 int
 hammer_generate_undo(hammer_transaction_t trans,
@@ -104,6 +105,17 @@ hammer_generate_undo(hammer_transaction_t trans,
        hmp = trans->hmp;
 
        /*
+        * A SYNC record may be required before we can lay down a general
+        * UNDO.  This ensures that the nominal recovery span contains
+        * at least one SYNC record telling the recovery code how far
+        * out-of-span it must go to run the REDOs.
+        */
+       if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
+           hmp->version >= HAMMER_VOL_VERSION_FOUR) {
+               hammer_generate_redo_sync(trans);
+       }
+
+       /*
         * Enter the offset into our undo history.  If there is an existing
         * undo we do not have to generate a new one.
         */
@@ -276,209 +288,6 @@ hammer_generate_undo(hammer_transaction_t trans,
 }
 
 /*
- * HAMMER version 4+ REDO support.
- *
- * Generate REDO record(s) for logical data writes to a file.  REDO records
- * are only created if the created inode was previously synced (such that
- * it will still exist after any recovery), and also only for a limited
- * amount of write data between fsyncs.
- *
- * REDO records are used to improve fsync() performance.  Instead of having
- * to go through a complete flush cycle involving at least two disk
- * synchronizations the fsync need only flush UNDO FIFO buffers through
- * the related REDO records, which is a single synchronization requiring
- * no track seeking.  If a recovery becomes necessary the recovery code
- * will generate logical data writes based on the REDO records encountered.
- * That is, the recovery code will UNDO any partial meta-data/data writes
- * at the raw disk block level and then REDO the data writes at the logical
- * level.
- */
-int
-hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
-                    hammer_off_t file_off, void *base, int len)
-{
-       hammer_mount_t hmp;
-       hammer_volume_t root_volume;
-       hammer_blockmap_t undomap;
-       hammer_buffer_t buffer = NULL;
-       hammer_fifo_redo_t redo;
-       hammer_fifo_tail_t tail;
-       hammer_off_t next_offset;
-       int error;
-       int bytes;
-       int n;
-
-       hmp = trans->hmp;
-
-       root_volume = trans->rootvol;
-       undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
-
-       /* no undo recursion */
-       hammer_modify_volume(NULL, root_volume, NULL, 0);
-       hammer_lock_ex(&hmp->undo_lock);
-
-       /* undo had better not roll over (loose test) */
-       if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
-               panic("hammer: insufficient undo FIFO space!");
-
-       /*
-        * Loop until the undo for the entire range has been laid down.
-        */
-       while (len) {
-               /*
-                * Fetch the layout offset in the UNDO FIFO, wrap it as
-                * necessary.
-                */
-               if (undomap->next_offset == undomap->alloc_offset) {
-                       undomap->next_offset =
-                               HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
-               }
-               next_offset = undomap->next_offset;
-
-               /*
-                * This is a tail-chasing FIFO, when we hit the start of a new
-                * buffer we don't have to read it in.
-                */
-               if ((next_offset & HAMMER_BUFMASK) == 0) {
-                       redo = hammer_bnew(hmp, next_offset, &error, &buffer);
-                       hammer_format_undo(redo, hmp->undo_seqno ^ 0x40000000);
-               } else {
-                       redo = hammer_bread(hmp, next_offset, &error, &buffer);
-               }
-               if (error)
-                       break;
-               hammer_modify_buffer(NULL, buffer, NULL, 0);
-
-               /*
-                * Calculate how big a media structure fits up to the next
-                * alignment point and how large a data payload we can
-                * accomodate.
-                *
-                * If n calculates to 0 or negative there is no room for
-                * anything but a PAD.
-                */
-               bytes = HAMMER_UNDO_ALIGN -
-                       ((int)next_offset & HAMMER_UNDO_MASK);
-               n = bytes -
-                   (int)sizeof(struct hammer_fifo_redo) -
-                   (int)sizeof(struct hammer_fifo_tail);
-
-               /*
-                * If available space is insufficient for any payload
-                * we have to lay down a PAD.
-                *
-                * The minimum PAD is 8 bytes and the head and tail will
-                * overlap each other in that case.  PADs do not have
-                * sequence numbers or CRCs.
-                *
-                * A PAD may not start on a boundary.  That is, every
-                * 512-byte block in the UNDO/REDO FIFO must begin with
-                * a record containing a sequence number.
-                */
-               if (n <= 0) {
-                       KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
-                       KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
-                       tail = (void *)((char *)redo + bytes - sizeof(*tail));
-                       if ((void *)redo != (void *)tail) {
-                               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
-                               tail->tail_type = HAMMER_HEAD_TYPE_PAD;
-                               tail->tail_size = bytes;
-                       }
-                       redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
-                       redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
-                       redo->head.hdr_size = bytes;
-                       /* NO CRC OR SEQ NO */
-                       undomap->next_offset += bytes;
-                       hammer_modify_buffer_done(buffer);
-                       hammer_stats_redo += bytes;
-                       continue;
-               }
-
-               /*
-                * Calculate the actual payload and recalculate the size
-                * of the media structure as necessary.
-                */
-               if (n > len) {
-                       n = len;
-                       bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
-                                ~HAMMER_HEAD_ALIGN_MASK) +
-                               (int)sizeof(struct hammer_fifo_redo) +
-                               (int)sizeof(struct hammer_fifo_tail);
-               }
-               if (hammer_debug_general & 0x0080) {
-                       kprintf("redo %016llx %d %d\n",
-                               (long long)next_offset, bytes, n);
-               }
-
-               redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
-               redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
-               redo->head.hdr_size = bytes;
-               redo->head.hdr_seq = hmp->undo_seqno++;
-               redo->head.hdr_crc = 0;
-               redo->redo_objid = ip->obj_id;
-               redo->redo_mtime = trans->time;
-               redo->redo_offset = file_off;
-               redo->redo_data_bytes = n;
-               redo->redo_reserved01 = 0;
-               bcopy(base, redo + 1, n);
-
-               tail = (void *)((char *)redo + bytes - sizeof(*tail));
-               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
-               tail->tail_type = HAMMER_HEAD_TYPE_UNDO;
-               tail->tail_size = bytes;
-
-               KKASSERT(bytes >= sizeof(redo->head));
-               redo->head.hdr_crc = crc32(redo, HAMMER_FIFO_HEAD_CRCOFF) ^
-                            crc32(&redo->head + 1, bytes - sizeof(redo->head));
-               undomap->next_offset += bytes;
-               hammer_stats_redo += bytes;
-
-               /*
-                * Before we finish off the buffer we have to deal with any
-                * junk between the end of the media structure we just laid
-                * down and the UNDO alignment boundary.  We do this by laying
-                * down a dummy PAD.  Even though we will probably overwrite
-                * it almost immediately we have to do this so recovery runs
-                * can iterate the UNDO space without having to depend on
-                * the indices in the volume header.
-                *
-                * This dummy PAD will be overwritten on the next undo so
-                * we do not adjust undomap->next_offset.
-                */
-               bytes = HAMMER_UNDO_ALIGN -
-                       ((int)undomap->next_offset & HAMMER_UNDO_MASK);
-               if (bytes != HAMMER_UNDO_ALIGN) {
-                       KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
-                       redo = (void *)(tail + 1);
-                       tail = (void *)((char *)redo + bytes - sizeof(*tail));
-                       if ((void *)redo != (void *)tail) {
-                               tail->tail_signature = HAMMER_TAIL_SIGNATURE;
-                               tail->tail_type = HAMMER_HEAD_TYPE_PAD;
-                               tail->tail_size = bytes;
-                       }
-                       redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
-                       redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
-                       redo->head.hdr_size = bytes;
-                       /* NO CRC OR SEQ NO */
-               }
-               hammer_modify_buffer_done(buffer);
-
-               /*
-                * Adjust for loop
-                */
-               len -= n;
-               base = (char *)base + n;
-               file_off += n;
-       }
-       hammer_modify_volume_done(root_volume);
-       hammer_unlock(&hmp->undo_lock);
-
-       if (buffer)
-               hammer_rel_buffer(buffer, 0);
-       return(error);
-}
-
-/*
  * Preformat a new UNDO block.  We could read the old one in but we get
  * better performance if we just pre-format a new one.
  *
@@ -488,8 +297,9 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
  *
  * The preformatted UNDO headers use the smallest possible sector size
  * (512) to ensure that any missed media writes are caught.
+ *
+ * NOTE: Also used by the REDO code.
  */
-static
 void
 hammer_format_undo(void *base, u_int32_t seqno)
 {
index 18e9922..9b27196 100644 (file)
@@ -218,13 +218,11 @@ hammer_vop_fsync(struct vop_fsync_args *ap)
                switch(hammer_fsync_mode) {
                case 0:
 mode0:
-                       /* disable REDO, full synchronous flush */
-                       ip->redo_count = SIZE_T_MAX;
+                       /* no REDO, full synchronous flush */
                        goto skip;
                case 1:
 mode1:
-                       /* disable REDO, full asynchronous flush */
-                       ip->redo_count = SIZE_T_MAX;
+                       /* no REDO, full asynchronous flush */
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        goto skip;
@@ -254,12 +252,11 @@ mode1:
                }
 
                /*
-                * redo_count is initialized to a maximal value and set
-                * to 0 after the first fsync() on a file, which enables
-                * REDO logging on the inode unless the number of bytes
-                * written exceeds the limit.
+                * Fast fsync only needs to flush the UNDO/REDO fifo if
+                * HAMMER_INODE_REDO is non-zero and the only modifications
+                * made to the file are writes or write-extends.
                 */
-               if (ip->redo_count < hammer_limit_redo &&
+               if ((ip->flags & HAMMER_INODE_REDO) &&
                    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
                ) {
                        ++hammer_count_fsyncs;
@@ -267,7 +264,21 @@ mode1:
                        ip->redo_count = 0;
                        return(0);
                }
-               ip->redo_count = 0;
+
+               /*
+                * REDO is enabled by fsync(), the idea being we really only
+                * want to lay down REDO records when programs are using
+                * fsync() heavily.  The first fsync() on the file starts
+                * the gravy train going and later fsync()s keep it hot by
+                * resetting the redo_count.
+                *
+                * We weren't running REDOs before now so we have to fall
+                * through and do a full fsync of what we have.
+                */
+               if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
+                       ip->flags |= HAMMER_INODE_REDO;
+                       ip->redo_count = 0;
+               }
        }
 skip:
 
@@ -504,11 +515,14 @@ hammer_vop_write(struct vop_write_args *ap)
         * atomicy and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         *
-        * Adjust redo_count early to avoid generating unnecessary redos.
+        * Preset redo_count so we stop generating REDOs early if the
+        * limit would be exceeded.
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
-       if (ip->redo_count < hammer_limit_redo)
+       if ((ip->flags & HAMMER_INODE_REDO) &&
+           ip->redo_count < hammer_limit_redo) {
                ip->redo_count += uio->uio_resid;
+       }
 
        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
@@ -663,21 +677,31 @@ hammer_vop_write(struct vop_write_args *ap)
                        error = uiomove(bp->b_data + offset, n, uio);
 
                /*
-                * Generate REDO records while redo_count has not exceeded
-                * the limit.  Note that redo_count is initialized to a
-                * maximal value until the first fsync(), and zerod on every
-                * fsync().  Thus at least one fsync() is required before we
-                * start generating REDO records for the ip.
+                * Generate REDO records if enabled and redo_count will not
+                * exceed the limit.
+                *
+                * If redo_count exceeds the limit we stop generating records
+                * and clear HAMMER_INODE_REDO.  This will cause the next
+                * fsync() to do a full meta-data sync instead of just an
+                * UNDO/REDO fifo update.
+                *
+                * When clearing HAMMER_INODE_REDO any pre-existing REDOs
+                * will still be tracked.  The tracking terminates
+                * when the related meta-data (including possible data
+                * modifications which are not tracked via REDO) is
+                * flushed.
                 */
-               if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
-                   ip->redo_count < hammer_limit_redo &&
-                   error == 0) {
-                       hammer_sync_lock_sh(&trans);
-                       error = hammer_generate_redo(&trans, ip,
+               if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
+                       if (ip->redo_count < hammer_limit_redo) {
+                               bp->b_flags |= B_VFSFLAG1;
+                               error = hammer_generate_redo(&trans, ip,
                                                     base_offset + offset,
+                                                    HAMMER_REDO_WRITE,
                                                     bp->b_data + offset,
                                                     (size_t)n);
-                       hammer_sync_unlock(&trans);
+                       } else {
+                               ip->flags &= ~HAMMER_INODE_REDO;
+                       }
                }
 
                /*
@@ -2090,12 +2114,23 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                case VREG:
                        if (vap->va_size == ip->ino_data.size)
                                break;
+
+                       /*
+                        * Log the operation if in fast-fsync mode.
+                        */
+                       if (ip->flags & HAMMER_INODE_REDO) {
+                               error = hammer_generate_redo(&trans, ip,
+                                                            vap->va_size,
+                                                            HAMMER_REDO_TRUNC,
+                                                            NULL, 0);
+                       }
+                       blksize = hammer_blocksize(vap->va_size);
+
                        /*
                         * XXX break atomicy, we can deadlock the backend
                         * if we do not release the lock.  Probably not a
                         * big deal here.
                         */
-                       blksize = hammer_blocksize(vap->va_size);
                        if (vap->va_size < ip->ino_data.size) {
                                vtruncbuf(ap->a_vp, vap->va_size, blksize);
                                truncating = 1;
@@ -2107,6 +2142,7 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                        }
                        ip->ino_data.size = vap->va_size;
                        ip->ino_data.mtime = trans.time;
+                       /* XXX safe to use SDIRTY instead of DDIRTY here? */
                        modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
 
                        /*
@@ -3024,7 +3060,17 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
 
        record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
                                    bytes, &error);
+
+       /*
+        * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
+        * in hammer_vop_write().  We must flag the record so the proper
+        * REDO_TERM_WRITE entry is generated during the flush.
+        */
        if (record) {
+               if (bp->b_flags & B_VFSFLAG1) {
+                       record->flags |= HAMMER_RECF_REDO;
+                       bp->b_flags &= ~B_VFSFLAG1;
+               }
                hammer_io_direct_write(hmp, record, bio);
                if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
                        hammer_flush_inode(ip, 0);
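
Taken together these vnops hunks hand a write off from the frontend to
the backend flush.  A stand-alone sketch of the flag flow (mocked flag
values and structures; the ordering mirrors the diff):

    #include <stdint.h>
    #include <stdio.h>

    #define VFSFLAG1_MOCK   0x0001  /* stands in for B_VFSFLAG1 */
    #define RECF_REDO_MOCK  0x1000  /* stands in for HAMMER_RECF_REDO */

    struct buf_sketch    { uint32_t b_flags; };
    struct record_sketch { uint32_t flags; };

    int
    main(void)
    {
            struct buf_sketch bp = { 0 };
            struct record_sketch rec = { 0 };

            /* hammer_vop_write(): REDO_WRITE laid down, tag the buffer */
            bp.b_flags |= VFSFLAG1_MOCK;

            /* hammer_vop_strategy_write(): move the tag to the record */
            if (bp.b_flags & VFSFLAG1_MOCK) {
                    rec.flags |= RECF_REDO_MOCK;
                    bp.b_flags &= ~VFSFLAG1_MOCK;
            }

            /* hammer_sync_record_callback(): close the REDO pairing */
            if (rec.flags & RECF_REDO_MOCK)
                    printf("generate REDO_TERM_WRITE\n");
            return(0);
    }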