HAMMER VFS - REDO implementation base code part 3/many
author     Matthew Dillon <dillon@apollo.backplane.com>
           Tue, 12 Jan 2010 04:46:08 +0000 (20:46 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Tue, 12 Jan 2010 04:57:06 +0000 (20:57 -0800)
* Track the oldest potentially uncommitted UNDO/REDO FIFO offset
  on an inode-by-inode basis and use a red-black tree to find
  the aggregate oldest offset.

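  The mechanism, condensed from the diff below: each inode carrying
  uncommitted REDOs records the UNDO/REDO FIFO offset of its earliest
  REDO in redo_fifo_start and is inserted into a per-mount red-black
  tree keyed on that offset, so the aggregate oldest offset is simply
  the leftmost node.  The comparator is taken from the patch; the
  lookup helper is only an illustrative sketch, not part of the patch.

    int
    hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
    {
            if (ip1->redo_fifo_start < ip2->redo_fifo_start)
                    return(-1);
            if (ip1->redo_fifo_start > ip2->redo_fifo_start)
                    return(1);
            return(0);
    }

    /*
     * Illustrative helper (not in the patch): the aggregate oldest
     * uncommitted REDO offset is the key of the leftmost tree node,
     * or 0 if no inode currently has REDOs in the FIFO.
     */
    static hammer_off_t
    hammer_redo_oldest_offset(hammer_mount_t hmp)
    {
            hammer_inode_t ip;

            ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
            return(ip ? ip->redo_fifo_start : 0);
    }
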
* If REDOs are present, generate a REDO SYNC entry in the UNDO/REDO FIFO
  within the recovery span, which indicates to the recovery code how
  far outside the span it must go to process REDOs.

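  In sketch form, using the names from the diff below, the SYNC path
  takes the leftmost inode on the redo tree and emits a HAMMER_REDO_SYNC
  record whose offset field points at that inode's earliest REDO,
  telling the recovery code how far before the nominal recovery span it
  must start scanning (debug printing omitted):

    void
    hammer_generate_redo_sync(hammer_transaction_t trans)
    {
            hammer_mount_t hmp = trans->hmp;
            hammer_inode_t ip;

            ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
            if (ip) {
                    hammer_generate_redo(trans, NULL, ip->redo_fifo_start,
                                         HAMMER_REDO_SYNC, NULL, 0);
                    hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
            }
    }
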
* Fix a bug in hammer_generate_redo() where the REDO would not be
  generated if the data length was 0 (SYNC records use a data length
  of 0 as a degenerate case).

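  Abridged shape of the fix: the lay-down loop in hammer_generate_redo()
  was "while (len)", which skipped SYNC records entirely because they
  pass a data length of 0.  It now always executes at least once and
  terminates at the bottom of the loop once no length remains:

    for (;;) {
            /* ... lay down one UNDO/REDO FIFO entry, advance base/len ... */
            if (len == 0)
                    break;
    }
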
* Print the REDO SYNC entries on the console if bit 2 is set in
  vfs.hammer.debug_io (0x04).

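  The gate, as it appears in hammer_generate_redo_sync() below:

    if (hammer_debug_io & 0x0004) {
            kprintf("SYNC IP %p %016jx\n",
                    ip, (uintmax_t)ip->redo_fifo_start);
    }

  From a shell, something like "sysctl vfs.hammer.debug_io=4" would set
  the bit (assuming no other debug bits are wanted at the same time).
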
* NOTE: The recovery code does not yet process REDOs.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_redo.c
sys/vfs/hammer/hammer_vfsops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 0db6d68..29c231a 100644
@@ -286,6 +286,11 @@ RB_HEAD(hammer_ino_rb_tree, hammer_inode);
 RB_PROTOTYPEX(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
              hammer_ino_rb_compare, hammer_inode_info_t);
 
+struct hammer_redo_rb_tree;
+RB_HEAD(hammer_redo_rb_tree, hammer_inode);
+RB_PROTOTYPE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
+             hammer_redo_rb_compare, hammer_off_t);
+
 struct hammer_rec_rb_tree;
 struct hammer_record;
 RB_HEAD(hammer_rec_rb_tree, hammer_record);
@@ -299,6 +304,7 @@ struct hammer_inode {
        hammer_inode_state_t    flush_state;
        hammer_flush_group_t    flush_group;
        RB_ENTRY(hammer_inode)  rb_flsnode;     /* when on flush list */
+       RB_ENTRY(hammer_inode)  rb_redonode;    /* when INODE_RDIRTY is set */
        struct hammer_record_list target_list;  /* target of dependant recs */
        int64_t                 obj_id;         /* (key) object identifier */
        hammer_tid_t            obj_asof;       /* (key) snapshot or 0 */
@@ -332,6 +338,16 @@ struct hammer_inode {
        struct hammer_btree_leaf_elm sync_ino_leaf; /* to-sync cache */
        struct hammer_inode_data sync_ino_data; /* to-sync cache */
        size_t          redo_count;
+
+       /*
+        * Track the earliest offset in the UNDO/REDO FIFO containing
+        * REDO records.  This is staged to the backend during flush
+        * sequences.  While the inode is staged redo_fifo_next is used
+        * to track the earliest offset for rotation into redo_fifo_start
+        * on completion of the flush.
+        */
+       hammer_off_t    redo_fifo_start;
+       hammer_off_t    redo_fifo_next;
 };
 
 typedef struct hammer_inode *hammer_inode_t;
@@ -750,6 +766,7 @@ struct hammer_mount {
        struct mount *mp;
        /*struct vnode *rootvp;*/
        struct hammer_ino_rb_tree rb_inos_root;
+       struct hammer_redo_rb_tree rb_redo_root;
        struct hammer_vol_rb_tree rb_vols_root;
        struct hammer_nod_rb_tree rb_nods_root;
        struct hammer_und_rb_tree rb_undo_root;
@@ -1134,6 +1151,9 @@ int hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
                        hammer_off_t file_offset, u_int32_t flags,
                        void *base, int len);
 void hammer_generate_redo_sync(hammer_transaction_t trans);
+void hammer_redo_fifo_start_flush(hammer_inode_t ip);
+void hammer_redo_fifo_end_flush(hammer_inode_t ip);
+
 void hammer_format_undo(void *base, u_int32_t seqno);
 int hammer_upgrade_undo_4(hammer_transaction_t trans);
 
@@ -1190,6 +1210,7 @@ int  hammer_create_inode(struct hammer_transaction *trans, struct vattr *vap,
 void hammer_rel_inode(hammer_inode_t ip, int flush);
 int hammer_reload_inode(hammer_inode_t ip, void *arg __unused);
 int hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2);
+int hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2);
 int hammer_destroy_inode_callback(hammer_inode_t ip, void *data __unused);
 
 int hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip);
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index 9223857..c70ac50 100644
@@ -76,6 +76,16 @@ hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
        return(0);
 }
 
+int
+hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
+{
+       if (ip1->redo_fifo_start < ip2->redo_fifo_start)
+               return(-1);
+       if (ip1->redo_fifo_start > ip2->redo_fifo_start)
+               return(1);
+       return(0);
+}
+
 /*
  * RB-Tree support for inode structures / special LOOKUP_INFO
  */
@@ -1436,6 +1446,10 @@ hammer_unload_inode(struct hammer_inode *ip)
        KKASSERT(RB_EMPTY(&ip->rec_tree));
        KKASSERT(TAILQ_EMPTY(&ip->target_list));
 
+       if (ip->flags & HAMMER_INODE_RDIRTY) {
+               RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
+               ip->flags &= ~HAMMER_INODE_RDIRTY;
+       }
        RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
 
        hammer_free_inode(ip);
@@ -1909,6 +1923,7 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
        ++ip->hmp->count_iqueued;
        ++hammer_count_iqueued;
        ++flg->total_count;
+       hammer_redo_fifo_start_flush(ip);
 
        /*
         * If the flush group reaches the autoflush limit we want to signal
@@ -2314,6 +2329,7 @@ hammer_flush_inode_done(hammer_inode_t ip, int error)
        if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
                ip->flags |= HAMMER_INODE_BUFS;
        }
+       hammer_redo_fifo_end_flush(ip);
 
        /*
         * Re-set the XDIRTY flag if some of the inode's in-memory records
@@ -2758,7 +2774,8 @@ hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
                 * range for multiple prior truncation entries in the REDO
                 * log.
                 */
-               if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR) {
+               if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR &&
+                   (ip->flags & HAMMER_INODE_RDIRTY)) {
                        hammer_generate_redo(trans, ip, aligned_trunc_off,
                                             HAMMER_REDO_TERM_TRUNC,
                                             NULL, 0);
diff --git a/sys/vfs/hammer/hammer_redo.c b/sys/vfs/hammer/hammer_redo.c
index 06317c9..c50792e 100644
@@ -40,6 +40,9 @@
 
 #include "hammer.h"
 
+RB_GENERATE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
+            hammer_redo_rb_compare, hammer_off_t, redo_fifo_start);
+
 /*
  * HAMMER version 4+ REDO support.
  *
@@ -89,8 +92,9 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
 
        /*
         * Loop until the undo for the entire range has been laid down.
+        * Loop at least once (len might be 0 as a degenerate case).
         */
-       while (len) {
+       for (;;) {
                /*
                 * Fetch the layout offset in the UNDO FIFO, wrap it as
                 * necessary.
@@ -160,6 +164,33 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
                        continue;
                }
 
+               /*
+                * When generating an inode-related REDO record we track
+                * the point in the UNDO/REDO FIFO containing the inode's
+                * earliest REDO record.  See hammer_generate_redo_sync().
+                *
+                * redo_fifo_next is cleared when an inode is staged to
+                * the backend and then used to determine how to reassign
+                * redo_fifo_start after the inode flush completes.
+                */
+               if (ip) {
+                       redo->redo_objid = ip->obj_id;
+                       if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
+                               ip->redo_fifo_start = next_offset;
+                               if (RB_INSERT(hammer_redo_rb_tree,
+                                             &hmp->rb_redo_root, ip)) {
+                                       panic("hammer_generate_redo: "
+                                             "cannot insert inode %p on "
+                                             "redo FIFO", ip);
+                               }
+                               ip->flags |= HAMMER_INODE_RDIRTY;
+                       }
+                       if (ip->redo_fifo_next == 0)
+                               ip->redo_fifo_next = next_offset;
+               } else {
+                       redo->redo_objid = 0;
+               }
+
                /*
                 * Calculate the actual payload and recalculate the size
                 * of the media structure as necessary.  If no data buffer
@@ -184,8 +215,6 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
                redo->head.hdr_size = bytes;
                redo->head.hdr_seq = hmp->undo_seqno++;
                redo->head.hdr_crc = 0;
-               if (ip)
-                       redo->redo_objid = ip->obj_id;
                redo->redo_mtime = trans->time;
                redo->redo_offset = file_off;
                redo->redo_flags = flags;
@@ -246,6 +275,8 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
                        /* NO CRC OR SEQ NO */
                }
                hammer_modify_buffer_done(buffer);
+               if (len == 0)
+                       break;
        }
        hammer_modify_volume_done(root_volume);
        hammer_unlock(&hmp->undo_lock);
@@ -259,12 +290,62 @@ hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
  * Generate a REDO SYNC record.  At least one such record must be generated
  * in the nominal recovery span for the recovery code to be able to run
  * REDOs outside of the span.
+ *
+ * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
+ * for all inodes with active REDOs.  This changes dynamically as inodes
+ * get flushed.
  */
 void
 hammer_generate_redo_sync(hammer_transaction_t trans)
 {
-#if 0
-       hammer_generate_redo(trans, NULL, 0, HAMMER_REDO_SYNC, NULL, 0);
-#endif
-       trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
+       hammer_mount_t hmp = trans->hmp;
+       hammer_inode_t ip;
+
+       ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
+       if (ip) {
+               if (hammer_debug_io & 0x0004) {
+                       kprintf("SYNC IP %p %016jx\n",
+                               ip, (uintmax_t)ip->redo_fifo_start);
+               }
+               hammer_generate_redo(trans, NULL, ip->redo_fifo_start,
+                                    HAMMER_REDO_SYNC, NULL, 0);
+               trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
+       }
+}
+
+/*
+ * This is called when an inode is queued to the backend.
+ */
+void
+hammer_redo_fifo_start_flush(hammer_inode_t ip)
+{
+       ip->redo_fifo_next = 0;
+}
+
+/*
+ * This is called when an inode backend flush is finished.  We have to make
+ * sure that RDIRTY is not set unless dirty bufs are present.  Dirty bufs
+ * can get destroyed through operations such as truncations and leave
+ * us with a stale redo_fifo_next.
+ */
+void
+hammer_redo_fifo_end_flush(hammer_inode_t ip)
+{
+       hammer_mount_t hmp = ip->hmp;
+
+       if (ip->flags & HAMMER_INODE_RDIRTY) {
+               RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
+               ip->flags &= ~HAMMER_INODE_RDIRTY;
+       }
+       if ((ip->flags & HAMMER_INODE_BUFS) == 0)
+               ip->redo_fifo_next = 0;
+       if (ip->redo_fifo_next) {
+               ip->redo_fifo_start = ip->redo_fifo_next;
+               if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
+                       panic("hammer_generate_redo: cannot reinsert "
+                             "inode %p on redo FIFO",
+                             ip);
+               }
+               ip->flags |= HAMMER_INODE_RDIRTY;
+       }
 }
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index fc500e9..d5c6ddf 100644
@@ -490,6 +490,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
 
        RB_INIT(&hmp->rb_vols_root);
        RB_INIT(&hmp->rb_inos_root);
+       RB_INIT(&hmp->rb_redo_root);
        RB_INIT(&hmp->rb_nods_root);
        RB_INIT(&hmp->rb_undo_root);
        RB_INIT(&hmp->rb_resv_root);