HAMMER: Mirroring, misc bug fixes
author Matthew Dillon <dillon@dragonflybsd.org>
Thu, 31 Jul 2008 04:42:04 +0000 (04:42 +0000)
committer Matthew Dillon <dillon@dragonflybsd.org>
Thu, 31 Jul 2008 04:42:04 +0000 (04:42 +0000)
* Adjust hammer_flusher_async() to queue an extra flush if called twice
  in quick succession.  This fixes the 'sync' command to properly sync
  the entire filesystem.  Previously two syncs were needed.

* Fix a bug where a user application could get stuck due to HAMMER
  losing track of an inode flush.

* Mirroring masters now use the most recent fully committed transaction
  id instead of the last flushed (but still subject to rollback) tid.

  This fixes an issue where a mirror could pass information still subject
  to crash recovery rollback to the slave.  Now only fully committed
  information is passed to the slave.

* Fix a transitory bug where the mirroring code would sometimes not
  sync the correct delete state to the slave.  The slave would always
  be corrected in the next pass, however.  Now the slave is correct
  at all times.

* Fix a bug in hammer_mirror_write() where a delete-to operation could
  livelock.

* Add a new HAMMER ioctl which waits for the committed data transaction
  id to change.  This will be used by the mirroring code to implement
  continuous streaming operation.

Reported-by: Francois Tigeot <ftigeot@wolfpond.org>,
     Michael Neumann <mneumann@crater.dragonflybsd.org>
     (the user application freeze bug)

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_disk.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_ioctl.c
sys/vfs/hammer/hammer_ioctl.h
sys/vfs/hammer/hammer_mirror.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_pfs.c
sys/vfs/hammer/hammer_vfsops.c

index 49d44ec..305993c 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.123 2008/07/27 23:01:25 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.124 2008/07/31 04:42:04 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -720,7 +720,9 @@ struct hammer_mount {
        int     error;                          /* critical I/O error */
        struct krate    krate;                  /* rate limited kprintf */
        hammer_tid_t    asof;                   /* snapshot mount */
-       hammer_off_t    next_tid;
+       hammer_tid_t    next_tid;
+       hammer_tid_t    flush_tid1;             /* flusher tid sequencing */
+       hammer_tid_t    flush_tid2;             /* flusher tid sequencing */
        int64_t copy_stat_freebigblocks;        /* number of free bigblocks */
 
        u_int32_t namekey_iterator;
@@ -1140,6 +1142,8 @@ int hammer_ioc_downgrade_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
                         struct hammer_ioc_pseudofs_rw *pfs);
 int hammer_ioc_upgrade_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
                         struct hammer_ioc_pseudofs_rw *pfs);
+int hammer_ioc_wait_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
+                        struct hammer_ioc_pseudofs_rw *pfs);
 
 int hammer_signal_check(hammer_mount_t hmp);
 
index 08c83cc..43e9996 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.51 2008/07/19 18:44:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.52 2008/07/31 04:42:04 dillon Exp $
  */
 
 #ifndef VFS_HAMMER_DISK_H_
@@ -498,7 +498,7 @@ struct hammer_volume_ondisk {
        int64_t vol0_stat_inodes;       /* for statfs only */
        int64_t vol0_stat_records;      /* total records in filesystem */
        hammer_off_t vol0_btree_root;   /* B-Tree root */
-       hammer_tid_t vol0_next_tid;     /* highest synchronized TID */
+       hammer_tid_t vol0_next_tid;     /* highest partially synchronized TID */
        hammer_off_t vol0_unused03;
 
        /*
index 7209e5c..c7fdfcf 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.44 2008/07/19 04:49:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.45 2008/07/31 04:42:04 dillon Exp $
  */
 /*
  * HAMMER dependancy flusher thread
@@ -231,7 +231,13 @@ hammer_flusher_master_thread(void *arg)
                        break;
                while (hmp->flusher.signal == 0)
                        tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
-               hmp->flusher.signal = 0;
+
+               /*
+                * Flush for each count on signal but only allow one extra
+                * flush request to build up.
+                */
+               if (--hmp->flusher.signal != 0)
+                       hmp->flusher.signal = 1;
        }
 
        /*
@@ -665,6 +671,13 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
                hammer_modify_volume_done(root_volume);
        }
 
+       /*
+        * vol0_next_tid is used for TID selection and is updated without
+        * an UNDO so we do not reuse a TID that may have been rolled-back.
+        *
+        * vol0_last_tid is the highest fully-synchronized TID.  It is
+        * set-up when the UNDO fifo is fully synced, later on (not here).
+        */
        if (root_volume->io.modified) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                if (root_volume->ondisk->vol0_next_tid < trans->tid)
@@ -722,6 +735,18 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final)
                        hmp->hflags |= HMNT_UNDO_DIRTY;
                }
                hammer_clear_undo_history(hmp);
+
+               /*
+                * Flush tid sequencing.  flush_tid1 is fully synchronized,
+                * meaning a crash will not roll it back.  flush_tid2 has
+                * been written out asynchronously and a crash will roll
+                * it back.  flush_tid1 is used for all mirroring masters.
+                */
+               if (hmp->flush_tid1 != hmp->flush_tid2) {
+                       hmp->flush_tid1 = hmp->flush_tid2;
+                       wakeup(&hmp->flush_tid1);
+               }
+               hmp->flush_tid2 = trans->tid;
        }
 
        /*
@@ -738,6 +763,7 @@ failed:
 
 done:
        hammer_unlock(&hmp->flusher.finalize_lock);
+
        if (--hmp->flusher.finalize_want == 0)
                wakeup(&hmp->flusher.finalize_want);
        hammer_stats_commits += final;
index 91bb2bc..48949de 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.106 2008/07/27 23:01:25 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.107 2008/07/31 04:42:04 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1360,8 +1360,9 @@ hammer_modify_inode(hammer_inode_t ip, int flags)
  * place the inode in a flushing state if it is currently idle and flag it
  * to reflush if it is currently flushing.
  *
- * If the HAMMER_FLUSH_SYNCHRONOUS flag is specified we will attempt to
- * flush the indoe synchronously using the caller's context.
+ * Upon return if the inode could not be flushed due to a setup
+ * dependancy, then it will be automatically flushed when the dependancy
+ * is satisfied.
  */
 void
 hammer_flush_inode(hammer_inode_t ip, int flags)
@@ -1440,10 +1441,14 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
                        hammer_flush_inode_core(ip, flg, flags);
                } else {
                        /*
-                        * parent has no connectivity, tell it to flush
+                        * Parent has no connectivity, tell it to flush
                         * us as soon as it does.
+                        *
+                        * The REFLUSH flag is also needed to trigger
+                        * dependancy wakeups.
                         */
-                       ip->flags |= HAMMER_INODE_CONN_DOWN;
+                       ip->flags |= HAMMER_INODE_CONN_DOWN |
+                                    HAMMER_INODE_REFLUSH;
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
                                hammer_flusher_async(ip->hmp, flg);
@@ -1454,6 +1459,9 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
                /*
                 * We are already flushing, flag the inode to reflush
                 * if needed after it completes its current flush.
+                *
+                * The REFLUSH flag is also needed to trigger
+                * dependancy wakeups.
                 */
                if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
                        ip->flags |= HAMMER_INODE_REFLUSH;
@@ -1706,17 +1714,22 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
         */
        if (go_count == 0) {
                if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
-                       ip->flags |= HAMMER_INODE_REFLUSH;
-
                        --ip->hmp->count_iqueued;
                        --hammer_count_iqueued;
 
+                       --flg->total_count;
                        ip->flush_state = HAMMER_FST_SETUP;
                        ip->flush_group = NULL;
                        if (ip->flags & HAMMER_INODE_VHELD) {
                                ip->flags &= ~HAMMER_INODE_VHELD;
                                vrele(ip->vp);
                        }
+
+                       /*
+                        * REFLUSH is needed to trigger dependancy wakeups
+                        * when an inode is in SETUP.
+                        */
+                       ip->flags |= HAMMER_INODE_REFLUSH;
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
                                hammer_flusher_async(ip->hmp, flg);
@@ -1909,8 +1922,8 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
                         * flush groups before it can be completely
                         * flushed.
                         */
-                       ip->flags |= HAMMER_INODE_REFLUSH;
-                       ip->flags |= HAMMER_INODE_RESIGNAL;
+                       ip->flags |= HAMMER_INODE_RESIGNAL |
+                                    HAMMER_INODE_REFLUSH;
                        r = -1;
                } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
                        /*
index 364dc18..b7f29e3 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.c,v 1.29 2008/07/16 18:30:59 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.c,v 1.30 2008/07/31 04:42:04 dillon Exp $
  */
 
 #include "hammer.h"
@@ -101,6 +101,12 @@ hammer_ioctl(hammer_inode_t ip, u_long com, caddr_t data, int fflag,
                                    (struct hammer_ioc_pseudofs_rw *)data);
                }
                break;
+       case HAMMERIOC_WAI_PSEUDOFS:
+               if (error == 0) {
+                       error = hammer_ioc_wait_pseudofs(&trans, ip,
+                                   (struct hammer_ioc_pseudofs_rw *)data);
+               }
+               break;
        case HAMMERIOC_MIRROR_READ:
                if (error == 0) {
                        error = hammer_ioc_mirror_read(&trans, ip,
index 2c31b8e..80d783f 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.21 2008/07/12 23:04:50 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.22 2008/07/31 04:42:04 dillon Exp $
  */
 /*
  * HAMMER ioctl's.  This file can be #included from userland
@@ -300,6 +300,7 @@ typedef union hammer_ioc_mrecord_any *hammer_ioc_mrecord_any_t;
 #define HAMMER_MREC_TYPE_SKIP          5       /* skip-range */
 #define HAMMER_MREC_TYPE_PASS          6       /* record for cmp only (pass) */
 #define HAMMER_MREC_TYPE_TERM          7       /* (userland only) */
+#define HAMMER_MREC_TYPE_IDLE          8       /* (userland only) */
 
 #define HAMMER_MREC_CRCOFF     (offsetof(struct hammer_ioc_mrecord_head, rec_size))
 #define HAMMER_MREC_HEADSIZE   sizeof(struct hammer_ioc_mrecord_head)
@@ -322,6 +323,7 @@ typedef union hammer_ioc_mrecord_any *hammer_ioc_mrecord_any_t;
 #define HAMMERIOC_UPG_PSEUDOFS _IOWR('h',9,struct hammer_ioc_pseudofs_rw)
 #define HAMMERIOC_DGD_PSEUDOFS _IOWR('h',10,struct hammer_ioc_pseudofs_rw)
 #define HAMMERIOC_RMR_PSEUDOFS _IOWR('h',11,struct hammer_ioc_pseudofs_rw)
+#define HAMMERIOC_WAI_PSEUDOFS _IOWR('h',12,struct hammer_ioc_pseudofs_rw)
 
 #endif
 
index 34c63c3..f752db2 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.15 2008/07/13 01:12:41 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.16 2008/07/31 04:42:04 dillon Exp $
  */
 /*
  * HAMMER mirroring ioctls - serialize and deserialize modifications made
@@ -246,7 +246,7 @@ retry:
                mrec.head.type = HAMMER_MREC_TYPE_REC;
                mrec.head.rec_size = bytes;
                mrec.rec.leaf = *elm;
-               if (elm->base.delete_tid >= mirror->tid_end)
+               if (elm->base.delete_tid > mirror->tid_end)
                        mrec.rec.leaf.base.delete_tid = 0;
                rec_crc = crc32(&mrec.head.rec_size,
                                sizeof(mrec.rec) - crc_start);
@@ -668,14 +668,13 @@ hammer_mirror_delete_to(hammer_cursor_t cursor,
        while (error == 0) {
                elm = &cursor->node->ondisk->elms[cursor->index].leaf;
                KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
+               cursor->flags |= HAMMER_CURSOR_ATEDISK;
                if (elm->base.delete_tid == 0) {
                        error = hammer_delete_at_cursor(cursor,
                                                        HAMMER_DELETE_ADJUST,
                                                        mirror->tid_end,
                                                        time_second,
                                                        1, NULL);
-                       if (error == 0)
-                               cursor->flags |= HAMMER_CURSOR_ATEDISK;
                }
                if (error == 0)
                        error = hammer_btree_iterate(cursor);
index f462c4a..44f267e 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.72 2008/07/27 21:34:04 mneumann Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.73 2008/07/31 04:42:04 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -1478,6 +1478,7 @@ hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
                 hammer_flusher_sync(hmp);
        } else {
                 hammer_flusher_async(hmp, NULL);
+                hammer_flusher_async(hmp, NULL);
        }
        return(info.error);
 }
index 639c53a..e92b05f 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_pfs.c,v 1.4 2008/07/19 18:44:49 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_pfs.c,v 1.5 2008/07/31 04:42:04 dillon Exp $
  */
 /*
  * HAMMER PFS ioctls - Manage pseudo-fs configurations
@@ -76,9 +76,13 @@ hammer_ioc_get_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
         * If the PFS is a master the sync tid is set by normal operation
         * rather then the mirroring code, and will always track the
         * real HAMMER filesystem.
+        *
+        * We use flush_tid1, which is the highest fully committed TID.
+        * flush_tid2 is the TID most recently flushed, but the UNDO hasn't
+        * caught up to it yet so a crash will roll us back to flush_tid1.
         */
        if ((pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) == 0)
-               pfsm->pfsd.sync_end_tid = trans->rootvol->ondisk->vol0_next_tid;
+               pfsm->pfsd.sync_end_tid = trans->hmp->flush_tid1;
 
        /*
         * Copy out to userland.
@@ -126,6 +130,11 @@ hammer_ioc_set_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
                        error = hammer_mkroot_pseudofs(trans, cred, pfsm);
                if (error == 0)
                        error = hammer_save_pseudofs(trans, pfsm);
+
+               /*
+                * Wakeup anyone waiting for a TID update for this PFS
+                */
+               wakeup(&pfsm->pfsd.sync_end_tid);
                hammer_rel_pseudofs(trans->hmp, pfsm);
        }
        return(error);
@@ -255,6 +264,48 @@ hammer_ioc_destroy_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
        return(error);
 }
 
+/*
+ * Wait for the PFS to sync past the specified TID.
+ *
+ * The TID to wait on is the sync_end_tid in the pseudofs data copied in
+ * from pfs->ondisk.  For a slave PFS it is compared against the PFS's own
+ * sync_end_tid, for a master against hmp->flush_tid1.  If the current TID
+ * has not yet passed the caller's TID we sleep once on the address of that
+ * field; the caller is expected to re-check and re-issue the ioctl, as no
+ * loop is performed here.
+ *
+ * Returns 0 on success or an errno.  A sleep interrupted with EINTR is
+ * not returned as an error: HAMMER_IOC_HEAD_INTR is set in pfs->head.flags
+ * and 0 is returned instead.
+ */
+int
+hammer_ioc_wait_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
+                        struct hammer_ioc_pseudofs_rw *pfs)
+{
+       hammer_pseudofs_inmem_t pfsm;
+       struct hammer_pseudofs_data pfsd;
+       u_int32_t localization;
+       hammer_tid_t tid;
+       void *waitp;
+       int error;
+
+       if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
+               return(error);
+       localization = (u_int32_t)pfs->pfs_id << 16;
+
+       /*
+        * Only sync_end_tid is consumed from the caller's copy.
+        */
+       if ((error = copyin(pfs->ondisk, &pfsd, sizeof(pfsd))) != 0)
+               return(error);
+
+       pfsm = hammer_load_pseudofs(trans, localization, &error);
+       if (error == 0) {
+               if (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) {
+                       tid = pfsm->pfsd.sync_end_tid;
+                       waitp = &pfsm->pfsd.sync_end_tid;
+               } else {
+                       tid = trans->hmp->flush_tid1;
+                       waitp = &trans->hmp->flush_tid1;
+               }
+
+               /*
+                * Capture the tsleep() result.  Without it a signal
+                * caught via PCATCH is indistinguishable from a normal
+                * wakeup and the EINTR handling below is dead code.
+                */
+               if (tid <= pfsd.sync_end_tid)
+                       error = tsleep(waitp, PCATCH, "hmrmwt", 0);
+       }
+       hammer_rel_pseudofs(trans->hmp, pfsm);
+       if (error == EINTR) {
+               pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
+               error = 0;
+       }
+       return(error);
+}
+
+
 /*
  * Auto-detect the pseudofs and do basic bounds checking.
  */
index 9951498..286fd5a 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.69 2008/07/27 23:01:25 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.70 2008/07/31 04:42:04 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -583,6 +583,8 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
         * on-disk first_offset represents the LAST flush cycle.
         */
        hmp->next_tid = rootvol->ondisk->vol0_next_tid;
+       hmp->flush_tid1 = hmp->next_tid;
+       hmp->flush_tid2 = hmp->next_tid;
        bcopy(rootvol->ondisk->vol0_blockmap, hmp->blockmap,
              sizeof(hmp->blockmap));
        hmp->copy_stat_freebigblocks = rootvol->ondisk->vol0_stat_freebigblocks;
@@ -872,8 +874,6 @@ hammer_vfs_sync(struct mount *mp, int waitfor)
 
        if (panicstr == NULL) {
                error = hammer_sync_hmp(hmp, waitfor);
-               if (error == 0)
-                       error = hammer_sync_hmp(hmp, waitfor);
        } else {
                error = EIO;
        }