HAMMER VFS - REDO implementation base code part 1/many
author    Matthew Dillon <dillon@apollo.backplane.com>
Sun, 10 Jan 2010 04:03:06 +0000 (20:03 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sun, 10 Jan 2010 04:03:06 +0000 (20:03 -0800)
* Implement basic REDO record writing and an fsync heuristic in order to
  test operation and performance.  Note that the recovery code is not
  implemented as of this commit and additional REDO records will probably
  have to be written to manage the span.

  There was no easy way to place all REDOs in a single UNDO/REDO FIFO span
  because the span is not known until the inode's meta-data is actually
  flushed.  We can control the flush to ensure that all required REDOs
  are present in the UNDO/REDO FIFO.  So what we will likely do is track
  the span with additional REDO records.

* Redo vfs.hammer.fsync_mode:
  0     REDO disabled, synchronous fsync semantics (default)
  1     REDO disabled, asynchronous fsync semantics
  2     REDO enabled, synchronous (uses disk sync command)
  3     REDO enabled, asynchronous (no disk sync command)
  4     fsync is ignored
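
  In rough terms (a condensed sketch of the new hammer_vop_fsync() logic,
  not the literal patch; "redo_enabled" below is shorthand for modes 2/3
  running on a version-4 or later volume), the quick REDO path looks like:

	/*
	 * Sketch: quick REDO-based fsync vs. full flush.  redo_count is
	 * SIZE_T_MAX until the first fsync() zeroes it, so at least one
	 * fsync() (with fewer than hammer_limit_redo bytes written since)
	 * is required before the quick path can be taken, and then only
	 * when no non-REDO-able meta-data (HAMMER_INODE_MODMASK_NOREDO)
	 * is dirty on the inode.
	 */
	if (redo_enabled &&
	    ip->redo_count < hammer_limit_redo &&
	    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) {
		++hammer_count_fsyncs;
		hammer_flusher_flush_undos(hmp, mode);	/* push UNDO/REDO FIFO */
		ip->redo_count = 0;
		return(0);				/* skip the full flush */
	}
	ip->redo_count = 0;
	/* ...otherwise fall through to the normal vfsync()-based full flush */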

* Refactor hammer_flusher_flush_undos()
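
  The already_flushed boolean argument becomes an explicit mode.  Condensed
  from the diff below, the three modes at their call sites behave roughly
  as follows (a sketch of the patch, not additional API):

	/* Flusher finalize: always wait for the UNDO/REDO I/O to finish */
	hammer_flusher_flush_undos(hmp, HAMMER_FLUSH_UNDOS_FORCED);

	/* fsync_mode 2: wait only if UNDO/REDO buffers were actually pushed */
	hammer_flusher_flush_undos(hmp, HAMMER_FLUSH_UNDOS_AUTO);

	/* fsync_mode 3 (and fallback): queue the writes, do not wait */
	hammer_flusher_flush_undos(hmp, HAMMER_FLUSH_UNDOS_RELAXED);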

* Default operation is to disable the new features as they are not
  finished yet.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_vnops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 8931844..528b8b1 100644
@@ -57,6 +57,7 @@
 #include <sys/queue.h>
 #include <sys/ktr.h>
 #include <sys/globaldata.h>
+#include <sys/limits.h>
 
 #include <sys/buf2.h>
 #include <sys/signal2.h>
@@ -330,14 +331,20 @@ struct hammer_inode {
        off_t           save_trunc_off;         /* write optimization */
        struct hammer_btree_leaf_elm sync_ino_leaf; /* to-sync cache */
        struct hammer_inode_data sync_ino_data; /* to-sync cache */
+       size_t          redo_count;
 };
 
 typedef struct hammer_inode *hammer_inode_t;
 
 #define VTOI(vp)       ((struct hammer_inode *)(vp)->v_data)
 
-#define HAMMER_INODE_DDIRTY    0x0001  /* in-memory ino_data is dirty */
+/*
+ * NOTE: DDIRTY does not include atime or mtime and does not include
+ *      write-append size changes.  SDIRTY handles write-append size
+ *      changes.
+ */
                                        /* (not including atime/mtime) */
+#define HAMMER_INODE_DDIRTY    0x0001  /* in-memory ino_data is dirty */
 #define HAMMER_INODE_RSV_INODES        0x0002  /* hmp->rsv_inodes bumped */
 #define HAMMER_INODE_CONN_DOWN 0x0004  /* include in downward recursion */
 #define HAMMER_INODE_XDIRTY    0x0008  /* in-memory records */
@@ -360,17 +367,21 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_MTIME     0x00200000 /* in-memory mtime modified */
 #define HAMMER_INODE_WOULDBLOCK 0x00400000 /* re-issue to new flush group */
 #define HAMMER_INODE_DUMMY     0x00800000 /* dummy inode covering bad file */
-#define HAMMER_INODE_CLOSESYNC 0x01000000 /* synchronously fsync on close */
-#define HAMMER_INODE_CLOSEASYNC        0x02000000 /* asynchronously fsync on close */
+#define HAMMER_INODE_SDIRTY    0x01000000 /* in-memory ino_data.size is dirty*/
 
-#define HAMMER_INODE_MODMASK   (HAMMER_INODE_DDIRTY|                       \
+#define HAMMER_INODE_MODMASK   (HAMMER_INODE_DDIRTY|HAMMER_INODE_SDIRTY|   \
                                 HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS|     \
                                 HAMMER_INODE_ATIME|HAMMER_INODE_MTIME|     \
                                 HAMMER_INODE_TRUNCATED|HAMMER_INODE_DELETING)
 
-#define HAMMER_INODE_MODMASK_NOXDIRTY \
+#define HAMMER_INODE_MODMASK_NOXDIRTY  \
                                (HAMMER_INODE_MODMASK & ~HAMMER_INODE_XDIRTY)
 
+#define HAMMER_INODE_MODMASK_NOREDO    \
+                               (HAMMER_INODE_DDIRTY|                       \
+                                HAMMER_INODE_XDIRTY|                       \
+                                HAMMER_INODE_TRUNCATED|HAMMER_INODE_DELETING)
+
 #define HAMMER_FLUSH_SIGNAL    0x0001
 #define HAMMER_FLUSH_RECURSION 0x0002
 
@@ -716,6 +727,9 @@ struct hammer_flusher {
        struct hammer_flusher_info_list ready_list;
 };
 
+#define HAMMER_FLUSH_UNDOS_RELAXED     0
+#define HAMMER_FLUSH_UNDOS_FORCED      1
+#define HAMMER_FLUSH_UNDOS_AUTO                2
 /*
  * Internal hammer mount data structure
  */
@@ -877,6 +891,7 @@ extern int hammer_limit_dirtybufspace;
 extern int hammer_limit_recs;
 extern int hammer_limit_inode_recs;
 extern int hammer_limit_reclaim;
+extern int hammer_limit_redo;
 extern int hammer_bio_count;
 extern int hammer_verify_zone;
 extern int hammer_verify_data;
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index 708e6e0..8b98e59 100644
@@ -601,6 +601,9 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
         * Flush data buffers.  This can occur asynchronously and at any
         * time.  We must interlock against the frontend direct-data write
         * but do not have to acquire the sync-lock yet.
+        *
+        * These data buffers have already been collected prior to the
+        * related inode(s) getting queued to the flush group.
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) {
@@ -645,7 +648,7 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
         * Flush UNDOs.  This also waits for I/Os to complete and flushes
         * the cache on the target disk.
         */
-       hammer_flusher_flush_undos(hmp, 1);
+       hammer_flusher_flush_undos(hmp, HAMMER_FLUSH_UNDOS_FORCED);
 
        if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                goto failed;
@@ -793,18 +796,14 @@ done:
 }
 
 /*
- * Flush UNDOs.  If already_flushed is non-zero we force a disk sync
- * even if no UNDOs are present.
+ * Flush UNDOs.
  */
 void
-hammer_flusher_flush_undos(hammer_mount_t hmp, int already_flushed)
+hammer_flusher_flush_undos(hammer_mount_t hmp, int mode)
 {
        hammer_io_t io;
        int count;
 
-       if (already_flushed == 0 && TAILQ_EMPTY(&hmp->undo_list))
-               return;
-
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) {
                if (io->ioerror)
@@ -819,7 +818,10 @@ hammer_flusher_flush_undos(hammer_mount_t hmp, int already_flushed)
                ++count;
        }
        hammer_flusher_clean_loose_ios(hmp);
-       hammer_io_wait_all(hmp, "hmrfl1");
+       if (mode == HAMMER_FLUSH_UNDOS_FORCED ||
+           (mode == HAMMER_FLUSH_UNDOS_AUTO && count)) {
+               hammer_io_wait_all(hmp, "hmrfl1");
+       }
 }
 
 /*
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index 00cd207..af18c29 100644
@@ -407,6 +407,7 @@ loop:
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;
+       ip->redo_count = SIZE_T_MAX;
        if (hmp->ronly)
                ip->flags |= HAMMER_INODE_RO;
        ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
@@ -589,6 +590,7 @@ loop:
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;
+       ip->redo_count = SIZE_T_MAX;
        ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
                0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
@@ -718,6 +720,7 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;
+       ip->redo_count = SIZE_T_MAX;
 
        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
        /* ip->save_trunc_off = 0; (already zero) */
@@ -1253,6 +1256,7 @@ retry:
                        if (hammer_debug_inode)
                                kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
                        ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
+                                           HAMMER_INODE_SDIRTY |
                                            HAMMER_INODE_ATIME |
                                            HAMMER_INODE_MTIME);
                        ip->flags &= ~HAMMER_INODE_DELONDISK;
@@ -1283,6 +1287,7 @@ retry:
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
                ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
+                                   HAMMER_INODE_SDIRTY |
                                    HAMMER_INODE_ATIME |
                                    HAMMER_INODE_MTIME);
        }
@@ -1535,7 +1540,11 @@ hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
  * A transaction has modified an inode, requiring updates as specified by
  * the passed flags.
  *
- * HAMMER_INODE_DDIRTY: Inode data has been updated
+ * HAMMER_INODE_DDIRTY: Inode data has been updated, not incl mtime/atime,
+ *                     and not including size changes due to write-append
+ *                     (but other size changes are included).
+ * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
+ *                     write-append.
  * HAMMER_INODE_XDIRTY: Dirty in-memory records
  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
  * HAMMER_INODE_DELETED: Inode record/data must be deleted
@@ -1550,6 +1559,7 @@ hammer_modify_inode(hammer_inode_t ip, int flags)
         */
        KKASSERT(ip->hmp->ronly != 1 ||
                  (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 
+                           HAMMER_INODE_SDIRTY |
                            HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
                            HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
        if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
@@ -2840,6 +2850,7 @@ defer_buffer_flush:
                 * Clear flags which may have been set by the frontend.
                 */
                ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
+                                   HAMMER_INODE_SDIRTY |
                                    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
                                    HAMMER_INODE_DELETING);
                break;
@@ -2851,6 +2862,7 @@ defer_buffer_flush:
                 * Clear flags which may have been set by the frontend.
                 */
                ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
+                                   HAMMER_INODE_SDIRTY |
                                    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
                                    HAMMER_INODE_DELETING);
                while (RB_ROOT(&ip->rec_tree)) {
@@ -2884,8 +2896,9 @@ defer_buffer_flush:
        }
 
        /*
-        * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
-        * is already on-disk the old record is marked as deleted.
+        * If RDIRTY, DDIRTY, or SDIRTY is set, write out a new record.
+        * If the inode is already on-disk the old record is marked as
+        * deleted.
         *
         * If DELETED is set hammer_update_inode() will delete the existing
         * record without writing out a new one.
@@ -2895,11 +2908,12 @@ defer_buffer_flush:
        if (ip->flags & HAMMER_INODE_DELETED) {
                error = hammer_update_inode(&cursor, ip);
        } else 
-       if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
+       if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
            (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
                error = hammer_update_itimes(&cursor, ip);
        } else
-       if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
+       if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
+                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
                error = hammer_update_inode(&cursor, ip);
        }
 done:
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index b1e20dc..18e9922 100644
@@ -206,37 +206,73 @@ int
 hammer_vop_fsync(struct vop_fsync_args *ap)
 {
        hammer_inode_t ip = VTOI(ap->a_vp);
+       hammer_mount_t hmp = ip->hmp;
        int waitfor = ap->a_waitfor;
+       int mode;
 
        /*
-        * Fsync rule relaxation (default disabled)
+        * Fsync rule relaxation (default is either full synchronous flush
+        * or REDO semantics with synchronous flush).
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                switch(hammer_fsync_mode) {
                case 0:
-                       /* full semantics */
-                       break;
+mode0:
+                       /* disable REDO, full synchronous flush */
+                       ip->redo_count = SIZE_T_MAX;
+                       goto skip;
                case 1:
-                       /* asynchronous */
+mode1:
+                       /* disable REDO, full asynchronous flush */
+                       ip->redo_count = SIZE_T_MAX;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
-                       break;
+                       goto skip;
                case 2:
-                       /* synchronous fsync on close */
-                       ip->flags |= HAMMER_INODE_CLOSESYNC;
-                       return(0);
+                       /* REDO semantics, synchronous flush */
+                       if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+                               goto mode0;
+                       mode = HAMMER_FLUSH_UNDOS_AUTO;
+                       break;
                case 3:
-                       /* asynchronous fsync on close */
-                       ip->flags |= HAMMER_INODE_CLOSEASYNC;
+                       /* REDO semantics, relaxed asynchronous flush */
+                       if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+                               goto mode1;
+                       mode = HAMMER_FLUSH_UNDOS_RELAXED;
+                       if (waitfor == MNT_WAIT)
+                               waitfor = MNT_NOWAIT;
+                       break;
+               case 4:
+                       /* ignore the fsync() system call */
                        return(0);
                default:
-                       /* ignore the fsync() system call */
+                       /* we have to do something */
+                       mode = HAMMER_FLUSH_UNDOS_RELAXED;
+                       if (waitfor == MNT_WAIT)
+                               waitfor = MNT_NOWAIT;
+                       break;
+               }
+
+               /*
+                * redo_count is initialized to a maximal value and set
+                * to 0 after the first fsync() on a file, which enables
+                * REDO logging on the inode unless the number of bytes
+                * written exceeds the limit.
+                */
+               if (ip->redo_count < hammer_limit_redo &&
+                   (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
+               ) {
+                       ++hammer_count_fsyncs;
+                       hammer_flusher_flush_undos(hmp, mode);
+                       ip->redo_count = 0;
                        return(0);
                }
+               ip->redo_count = 0;
        }
+skip:
 
        /*
-        * Go do it
+        * Do a full flush sequence.
         */
        ++hammer_count_fsyncs;
        vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
@@ -467,8 +503,12 @@ hammer_vop_write(struct vop_write_args *ap)
         * If reading or writing a huge amount of data we have to break
         * atomicy and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
+        *
+        * Adjust redo_count early to avoid generating unnecessary redos.
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
+       if (ip->redo_count < hammer_limit_redo)
+               ip->redo_count += uio->uio_resid;
 
        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
@@ -619,9 +659,25 @@ hammer_vop_write(struct vop_write_args *ap)
                        if (error == 0)
                                bheavy(bp);
                }
-               if (error == 0) {
-                       error = uiomove((char *)bp->b_data + offset,
-                                       n, uio);
+               if (error == 0)
+                       error = uiomove(bp->b_data + offset, n, uio);
+
+               /*
+                * Generate REDO records while redo_count has not exceeded
+                * the limit.  Note that redo_count is initialized to a
+                * maximal value until the first fsync(), and zeroed on every
+                * fsync().  Thus at least one fsync() is required before we
+                * start generating REDO records for the ip.
+                */
+               if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
+                   ip->redo_count < hammer_limit_redo &&
+                   error == 0) {
+                       hammer_sync_lock_sh(&trans);
+                       error = hammer_generate_redo(&trans, ip,
+                                                    base_offset + offset,
+                                                    bp->b_data + offset,
+                                                    (size_t)n);
+                       hammer_sync_unlock(&trans);
                }
 
                /*
@@ -641,7 +697,7 @@ hammer_vop_write(struct vop_write_args *ap)
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
-                       flags = HAMMER_INODE_DDIRTY;
+                       flags = HAMMER_INODE_SDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
                } else {
                        flags = 0;
@@ -742,10 +798,10 @@ static
 int
 hammer_vop_close(struct vop_close_args *ap)
 {
+#if 0
        struct vnode *vp = ap->a_vp;
        hammer_inode_t ip = VTOI(vp);
        int waitfor;
-
        if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
                if (vn_islocked(vp) == LK_EXCLUSIVE &&
                    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
@@ -758,6 +814,7 @@ hammer_vop_close(struct vop_close_args *ap)
                        VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
                }
        }
+#endif
        return (vop_stdclose(ap));
 }