hammer2 - Stabilization pass, more flush refactoring
author Matthew Dillon <dillon@apollo.backplane.com>
Fri, 1 Nov 2013 05:57:55 +0000 (22:57 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Fri, 1 Nov 2013 06:08:38 +0000 (23:08 -0700)
* Add voldata.inode_tid, separate inode TID allocations from
  transaction TID allocations in voldata.

* Rewrite the transaction management functions.

* Rewrite hammer2's filesystem sync code to reduce stalls.

* Keep track of a generation number on the hammer2_chain_core structure
  so the flush code can re-scan when it modifies elements within the
  flush transaction.

* Clean up the duplication and delete-duplication code and hardlink handling.
  The delete-duplication code now properly tags delete_tid when a flush is
  delete-duplicating a chain which is deleted in the live view but is still
  valid in the flush view.

* Correct numerous bugs in tracking the modified/deleted state of
  a chain.

* Correct numerous flush bugs.

* Separate the mirror TID for the freemap chain from the volume chain.
  This will allow freemap updates to be delayed.

* Implement a more stringent algorithm to determine when CHAIN_MOVED
  can be cleared in chain->flags.

* Do a better job limiting the flush scan when concurrent modifying
  operations are occurring in large volumes.

sbin/newfs_hammer2/newfs_hammer2.c
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index 6ea0b07..6de5f20 100644 (file)
@@ -646,7 +646,8 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
 
        vol->sroot_blockset.blockref[0] = sroot_blockref;
        vol->mirror_tid = 0;
-       vol->alloc_tid = 16;
+       vol->alloc_tid = 16;    /* first transaction id */
+       vol->inode_tid = 16;    /* first allocatable inode number */
        vol->icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
                        hammer2_icrc32((char *)vol + HAMMER2_VOLUME_ICRC1_OFF,
                                       HAMMER2_VOLUME_ICRC1_SIZE);
index 3c681f9..2b1b184 100644 (file)
@@ -129,7 +129,7 @@ struct hammer2_chain_layer {
        int             good;
        TAILQ_ENTRY(hammer2_chain_layer) entry;
        struct hammer2_chain_tree rbtree;
-       int     refs;           /* prevent destruction */
+       int             refs;           /* prevent destruction */
 };
 
 typedef struct hammer2_chain_layer hammer2_chain_layer_t;
@@ -145,11 +145,12 @@ struct hammer2_chain_core {
        u_int           sharecnt;
        u_int           flags;
        u_int           live_count;     /* live (not deleted) chains in tree */
+       int             generation;     /* generation number (inserts only) */
 };
 
 typedef struct hammer2_chain_core hammer2_chain_core_t;
 
-#define HAMMER2_CORE_INDIRECT          0x0001
+#define HAMMER2_CORE_UNUSED0001                0x0001
 #define HAMMER2_CORE_COUNTEDBREFS      0x0002
 
 struct hammer2_chain {
@@ -172,6 +173,7 @@ struct hammer2_chain {
        u_int           flags;
        u_int           refs;
        u_int           lockcnt;
+       int             debug_reason;
        hammer2_media_data_t *data;             /* data pointer shortcut */
        TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush deferral list */
 };
@@ -196,7 +198,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_MODIFIED         0x00000001      /* dirty chain data */
 #define HAMMER2_CHAIN_ALLOCATED                0x00000002      /* kmalloc'd chain */
 #define HAMMER2_CHAIN_DIRTYBP          0x00000004      /* dirty on unlock */
-#define HAMMER2_CHAIN_UNUSED00008      0x00000008
+#define HAMMER2_CHAIN_FORCECOW         0x00000008      /* force copy-on-wr */
 #define HAMMER2_CHAIN_DELETED          0x00000010      /* deleted chain */
 #define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial create */
 #define HAMMER2_CHAIN_FLUSHED          0x00000040      /* flush on unlock */
@@ -427,7 +429,9 @@ RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
 struct hammer2_trans {
        TAILQ_ENTRY(hammer2_trans) entry;
        struct hammer2_pfsmount *pmp;
+       hammer2_tid_t           real_tid;
        hammer2_tid_t           sync_tid;
+       hammer2_tid_t           inode_tid;
        thread_t                td;             /* pointer */
        int                     flags;
        int                     blocked;
@@ -440,7 +444,7 @@ typedef struct hammer2_trans hammer2_trans_t;
 #define HAMMER2_TRANS_ISFLUSH          0x0001  /* formal flush */
 #define HAMMER2_TRANS_RESTRICTED       0x0002  /* snapshot flush restrict */
 #define HAMMER2_TRANS_BUFCACHE         0x0004  /* from bioq strategy write */
-#define HAMMER2_TRANS_INVFSYNC         0x0008  /* with ISFLUSH */
+#define HAMMER2_TRANS_NEWINODE         0x0008  /* caller allocating inode */
 #define HAMMER2_TRANS_ISALLOCATING     0x0010  /* in allocator */
 
 #define HAMMER2_FREEMAP_HEUR_NRADIX    4       /* pwr 2 PBUFRADIX-MINIORADIX */
@@ -468,10 +472,6 @@ struct hammer2_mount {
        struct lock     alloclk;        /* lockmgr lock */
        struct lock     voldatalk;      /* lockmgr lock */
        struct hammer2_trans_queue transq; /* all in-progress transactions */
-       hammer2_trans_t *curflush;      /* current flush in progress */
-       hammer2_tid_t   topo_flush_tid; /* currently synchronizing flush pt */
-       hammer2_tid_t   last_flush_tid; /* previous synchronizing flush pt */
-       hammer2_tid_t   free_flush_tid; /* currently synchronizing flush pt */
        hammer2_off_t   heur_freemap[HAMMER2_FREEMAP_HEUR];
        int             flushcnt;       /* #of flush trans on the list */
 
index 9782443..4fb92dc 100644 (file)
  * flush synchronization boundary, allowing the flush code to continue flushing
  * the older version of the topology and not be disrupted by new frontend
  * operations.
+ *
+ *                             LIVE VS FLUSH VIEW
+ *
+ * All lookup and iterate operations and most modifications are done on the
+ * live view.  During flushes lookups are not normally done and modifications
+ * may be run on the flush view.  However, flushes often needs to allocate
+ * blocks and the freemap_alloc/free code issues lookups.  This code is
+ * special cased to use the live view when called from a flush.
+ *
+ * General chain lookup/iteration functions are NOT aware of the flush view,
+ * they only know about live views.
  */
 #include <sys/cdefs.h>
 #include <sys/param.h>
@@ -138,30 +149,41 @@ hammer2_isclusterable(hammer2_chain_t *chain)
  * 'above' core, but the core itself can be multi-homed with parents iterated
  * via core->ownerq.
  *
- * However, this function is only used on the live tree which we can locate
- * by finding the first non-DUPLICATED parent.  Note that this parent might
- * be deleted.
+ * This function is not used during a flush (except when the flush is
+ * allocating which requires the live tree).  The flush keeps track of its
+ * recursion itself.
  *
- * The live tree can be ripped out while *any* deeper node is held
- * (XXX is that still true?), so we can recurse upward without locks.
+ * XXX needs to be optimized to use roll-up TIDs.  update_tid is only really
+ * compared against bref.mirror_tid which itself is only updated by a flush.
  */
 void
 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
 {
        hammer2_chain_core_t *above;
 
+#if 0
        if ((trans->flags &
             (HAMMER2_TRANS_ISFLUSH | HAMMER2_TRANS_ISALLOCATING)) ==
            HAMMER2_TRANS_ISFLUSH) {
                return;
        }
+#endif
+
        while ((above = chain->above) != NULL) {
                spin_lock(&above->cst.spin);
+               /* XXX optimize */
                if (above->update_tid < trans->sync_tid)
                        above->update_tid = trans->sync_tid;
-               chain = TAILQ_FIRST(&above->ownerq);
-               while (chain->flags & HAMMER2_CHAIN_DUPLICATED)
-                       chain = TAILQ_NEXT(chain, core_entry);
+               chain = TAILQ_LAST(&above->ownerq, h2_core_list);
+#if 0
+               TAILQ_FOREACH_REVERSE(chain, &above->ownerq,
+                                     h2_core_list, core_entry) {
+                       if (trans->sync_tid >= chain->modify_tid &&
+                           trans->sync_tid <= chain->delete_tid) {
+                               break;
+                       }
+               }
+#endif
                spin_unlock(&above->cst.spin);
        }
 }
@@ -223,8 +245,8 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
 
        /*
         * Set modify_tid if a transaction is creating the chain.  When
-        * loading a chain from backing store modify_tid is left set to 0,
-        * which forces a delete-duplication.
+        * loading a chain from backing store trans is passed as NULL and
+        * modify_tid is left set to 0.
         */
        if (trans)
                chain->modify_tid = trans->sync_tid;
@@ -261,6 +283,10 @@ hammer2_chain_core_alloc(hammer2_trans_t *trans,
                TAILQ_INIT(&core->ownerq);
                core->sharecnt = 1;
                core->good = 0x1234;
+               if (trans)
+                       core->update_tid = trans->sync_tid;
+               else
+                       core->update_tid = nchain->bref.mirror_tid;
                nchain->core = core;
                ccms_cst_init(&core->cst, nchain);
                TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
@@ -287,9 +313,16 @@ hammer2_chain_core_alloc(hammer2_trans_t *trans,
                 * The flusher understands the blockref synchronization state
                 * for any stale chains by observing bref.mirror_tid, which
                 * delete-duplicate replicates.
+                *
+                * WARNING! However, the case is disallowed when the flusher
+                *          is allocating freemap space because this entails
+                *          more than just adjusting a block table.
                 */
                if (ochain->flags & HAMMER2_CHAIN_DUPLICATED) {
-                       KKASSERT(trans->flags & HAMMER2_TRANS_ISFLUSH);
+                       KKASSERT((trans->flags &
+                                 (HAMMER2_TRANS_ISFLUSH |
+                                  HAMMER2_TRANS_ISALLOCATING)) ==
+                                HAMMER2_TRANS_ISFLUSH);
                        atomic_set_int(&nchain->flags,
                                       HAMMER2_CHAIN_DUPLICATED);
                }
@@ -302,6 +335,8 @@ hammer2_chain_core_alloc(hammer2_trans_t *trans,
 
                spin_lock(&core->cst.spin);
                nchain->core = core;
+               if (core->update_tid < trans->sync_tid)
+                       core->update_tid = trans->sync_tid;
 
                /*
                 * Maintain ordering for refactor test so we don't skip over
@@ -381,6 +416,7 @@ hammer2_chain_insert(hammer2_chain_core_t *above, hammer2_chain_t *chain,
        }
        chain->inlayer = layer;
        ++above->chain_count;
+       ++above->generation;
 
        if ((flags & HAMMER2_CHAIN_INSERT_LIVE) &&
            (chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
@@ -1323,6 +1359,7 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
         */
        hammer2_freemap_alloc(trans, chain->hmp, &chain->bref, nbytes);
        chain->bytes = nbytes;
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
        /*ip->delta_dcount += (ssize_t)(nbytes - obytes);*/ /* XXX atomic */
 
        /*
@@ -1338,6 +1375,7 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
         */
        KKASSERT(chain->bp == NULL);
 
+#if 0
        /*
         * Make sure the chain is marked MOVED and propagate the update
         * to the root for flush.
@@ -1347,6 +1385,7 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }
        hammer2_chain_setsubmod(trans, chain);
+#endif
        *chainp = chain;
 }
 
@@ -1400,6 +1439,9 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        chain = *chainp;
        hmp = chain->hmp;
 
+#if 0
+       kprintf("MODIFY %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid);
+#endif
        /*
         * Data must be resolved if already assigned unless explicitly
         * flagged otherwise.
@@ -1424,21 +1466,24 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         * Determine if a delete-duplicate is needed.
         *
         * (a) Modify_tid is part of a prior flush
-        * (b) Concurrent post-flush sync and modify_tid is part of current
-        *     flush.
+        * (b) Transaction is concurrent with a flush (has higher tid)
         * (c) and chain is not in the initial state (freshly created)
         * (d) and caller didn't request an in-place modification.
         *
         * The freemap and volume header special chains are never D-Dd.
         */
-       if ((chain->modify_tid <= hmp->last_flush_tid ||
-            (trans->sync_tid > hmp->topo_flush_tid &&
-             chain->modify_tid <= hmp->topo_flush_tid)) &&
-           (chain->flags & HAMMER2_CHAIN_INITIAL) == 0 &&
-           (flags & HAMMER2_MODIFY_INPLACE) == 0) {
+       if (chain->modify_tid != trans->sync_tid &&        /* cross boundary */
+           (flags & HAMMER2_MODIFY_INPLACE) == 0) {       /* from d-d */
                if (chain != &hmp->fchain && chain != &hmp->vchain) {
                        KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0);
                        hammer2_chain_delete_duplicate(trans, chainp, 0);
+#if 0
+       kprintf("RET1A %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid);
+#endif
+#if 0
+                       chain = *chainp;
+       kprintf("RET1B %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid);
+#endif
                        return;
                }
                /* fall through if fchain or vchain */
@@ -1447,14 +1492,10 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        /*
         * Otherwise do initial-chain handling
         */
-       if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
-               chain->bref.modify_tid = trans->sync_tid;
-
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_chain_ref(chain);
        }
-       chain->modify_tid = trans->sync_tid;
 
        /*
         * The modification or re-modification requires an allocation and
@@ -1463,19 +1504,26 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         * We normally always allocate new storage here.  If storage exists
         * and MODIFY_NOREALLOC is passed in, we do not allocate new storage.
         */
-       if (chain != &hmp->vchain &&
-           chain != &hmp->fchain &&
-           ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
-            (flags & HAMMER2_MODIFY_NOREALLOC) == 0 ||
-            chain->modify_tid <= hmp->last_flush_tid ||
-            (trans->sync_tid > hmp->topo_flush_tid &&
-             chain->modify_tid <= hmp->topo_flush_tid))
-       ) {
-               hammer2_freemap_alloc(trans, chain->hmp,
-                                     &chain->bref, chain->bytes);
-               /* XXX failed allocation */
+       if (chain != &hmp->vchain && chain != &hmp->fchain) {
+               if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 ||
+                    ((flags & HAMMER2_MODIFY_NOREALLOC) == 0 &&
+                     chain->modify_tid != trans->sync_tid)
+               ) {
+                       hammer2_freemap_alloc(trans, chain->hmp,
+                                             &chain->bref, chain->bytes);
+                       /* XXX failed allocation */
+               } else if (chain->flags & HAMMER2_CHAIN_FORCECOW) {
+                       hammer2_freemap_alloc(trans, chain->hmp,
+                                             &chain->bref, chain->bytes);
+                       /* XXX failed allocation */
+               }
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
        }
 
+       chain->modify_tid = trans->sync_tid;
+       if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
+               chain->bref.modify_tid = trans->sync_tid;
+
        /*
         * Do not COW if OPTDATA is set.  INITIAL flag remains unchanged.
         * (OPTDATA does not prevent [re]allocation of storage, only the
@@ -1594,6 +1642,9 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
 
        }
 skip2:
+#if 0
+       kprintf("RET2 %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid);
+#endif
        hammer2_chain_setsubmod(trans, chain);
 }
 
@@ -1796,6 +1847,7 @@ hammer2_chain_get(hammer2_chain_t *parent, hammer2_blockref_t *bref)
        chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
        hammer2_chain_core_alloc(NULL, chain, NULL);
        /* ref'd chain returned */
+       chain->modify_tid = chain->bref.mirror_tid;
 
        /*
         * Link the chain into its parent.  A spinlock is required to safely
@@ -2484,8 +2536,8 @@ again:
                        hammer2_chain_ref(chain);
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
                }
-               hammer2_chain_setsubmod(trans, chain);
        }
+       hammer2_chain_setsubmod(trans, chain);
 
 done:
        *chainp = chain;
@@ -2494,11 +2546,14 @@ done:
 }
 
 /*
- * Replace (*chainp) with a duplicate.  The original *chainp is unlocked
- * and the replacement will be returned locked.  Both the original and the
- * new chain will share the same RBTREE (have the same chain->core), with
- * the new chain becoming the 'current' chain (meaning it is the first in
- * the linked list at core->chain_first).
+ * Replace (*chainp) with a duplicate in-memory chain structure which shares
+ * the same core and media state as the orignal.  The original *chainp is
+ * unlocked and the replacement will be returned locked.
+ *
+ * The old chain may or may not be in a DELETED state.  This new chain will
+ * be live (not deleted).
+ *
+ * The new chain will be marked modified for the current transaction.
  *
  * If (parent) is non-NULL then the new duplicated chain is inserted under
  * the parent.
@@ -2507,10 +2562,8 @@ done:
  * similar to if it had just been chain_alloc()'d (suitable for passing into
  * hammer2_chain_create() after this function returns).
  *
- * NOTE! Duplication is used in order to retain the original topology to
- *      support flush synchronization points.  Both the original and the
- *      new chain will have the same transaction id and thus the operation
- *      appears atomic w/regards to media flushes.
+ * WARNING! This is not a snapshot.  Changes made underneath either the old
+ *         or new chain will affect both.
  */
 static void hammer2_chain_dup_fixup(hammer2_chain_t *ochain,
                                    hammer2_chain_t *nchain);
@@ -2521,22 +2574,57 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                        int snapshot)
 {
        hammer2_mount_t *hmp;
-       hammer2_blockref_t *base;
        hammer2_chain_t *parent;
        hammer2_chain_t *ochain;
        hammer2_chain_t *nchain;
        hammer2_chain_core_t *above;
        size_t bytes;
-       int count;
-       int oflags;
 
        /*
-        * First create a duplicate of the chain structure, associating
-        * it with the same core, making it the same size, pointing it
-        * to the same bref (the same media block).
+        * We want nchain to be our go-to live chain, but ochain may be in
+        * a MODIFIED state within the current flush synchronization segment.
+        * Force any further modifications of ochain to do another COW
+        * operation even if modify_tid indicates that one is not needed.
+        *
+        * WARNING!  We should never resolve DATA to device buffers
+        *           (XXX allow it if the caller did?), and since
+        *           we currently do not have the logical buffer cache
+        *           buffer in-hand to fix its cached physical offset
+        *           we also force the modify code to not COW it. XXX
         */
        ochain = *chainp;
        hmp = ochain->hmp;
+       if (parentp)
+       ochain->debug_reason += 0x10000;
+       else
+       ochain->debug_reason += 0x100000;
+
+#if 0
+       if (ochain->bref.type == HAMMER2_BREF_TYPE_DATA) {
+               hammer2_chain_modify(trans, &ochain,
+                                    HAMMER2_MODIFY_OPTDATA |
+                                    HAMMER2_MODIFY_NOREALLOC);
+       } else if (ochain->flags & HAMMER2_CHAIN_INITIAL) {
+               hammer2_chain_modify(trans, &ochain,
+                                    HAMMER2_MODIFY_OPTDATA);
+       } else {
+               hammer2_chain_modify(trans, &ochain, 0);
+       }
+#endif
+       atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
+
+       /*
+        * Now create a duplicate of the chain structure, associating
+        * it with the same core, making it the same size, pointing it
+        * to the same bref (the same media block).
+        *
+        * Give the duplicate the same modify_tid that we previously
+        * ensured was sufficiently advanced to trigger a block table
+        * insertion on flush.
+        *
+        * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
+        *       hammer2_chain_alloc()
+        */
        if (bref == NULL)
                bref = &ochain->bref;
        if (snapshot) {
@@ -2550,66 +2638,55 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
        nchain->bytes = bytes;
        nchain->modify_tid = ochain->modify_tid;
+       if (ochain->flags & HAMMER2_CHAIN_INITIAL)
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
 
        /*
         * Fixup (copy) any embedded data.  Non-embedded data relies on the
-        * media block.
+        * media block.  We must unlock ochain before we can access nchain's
+        * media block because they might share the same bp and deadlock if
+        * we don't.
         */
        hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER |
                                   HAMMER2_RESOLVE_NOREF);
        hammer2_chain_dup_fixup(ochain, nchain);
        /* nchain has 1 ref */
+       hammer2_chain_unlock(ochain);
+       KKASSERT((ochain->flags & HAMMER2_CHAIN_EMBEDDED) ||
+                ochain->data == NULL);
+
+       /*
+        * Place nchain in the modified state, instantiate media data
+        * if necessary.  Because modify_tid is already completely
+        * synchronized this should not result in a delete-duplicate.
+        *
+        * We want nchain at the target to look like a new insertion.
+        * Forcing the modification to be INPLACE accomplishes this
+        * because we get the same nchain with an updated modify_tid.
+        */
+       if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
+               hammer2_chain_modify(trans, &nchain,
+                                    HAMMER2_MODIFY_OPTDATA |
+                                    HAMMER2_MODIFY_NOREALLOC |
+                                    HAMMER2_MODIFY_INPLACE);
+       } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
+               hammer2_chain_modify(trans, &nchain,
+                                    HAMMER2_MODIFY_OPTDATA |
+                                    HAMMER2_MODIFY_INPLACE);
+       } else {
+               hammer2_chain_modify(trans, &nchain,
+                                    HAMMER2_MODIFY_INPLACE);
+       }
 
        /*
-        * If parent is not NULL, insert the duplicated chain into the
-        * parent.  The newly duplicated chain must be marked MOVED and
-        * update_tid set in its parent(s).
+        * If parent is not NULL the duplicated chain will be entered under
+        * the parent and the MOVED bit set.
         *
         * Having both chains locked is extremely important for atomicy.
         */
        if (parentp && (parent = *parentp) != NULL) {
-               /*
-                * Locate a free blockref in the parent's array
-                */
                above = parent->core;
                KKASSERT(ccms_thread_lock_owned(&above->cst));
-
-               switch(parent->bref.type) {
-               case HAMMER2_BREF_TYPE_INODE:
-                       KKASSERT((parent->data->ipdata.op_flags &
-                                 HAMMER2_OPFLAG_DIRECTDATA) == 0);
-                       KKASSERT(parent->data != NULL);
-                       base = &parent->data->ipdata.u.blockset.blockref[0];
-                       count = HAMMER2_SET_COUNT;
-                       break;
-               case HAMMER2_BREF_TYPE_INDIRECT:
-               case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-                       if (parent->flags & HAMMER2_CHAIN_INITIAL) {
-                               base = NULL;
-                       } else {
-                               KKASSERT(parent->data != NULL);
-                               base = &parent->data->npdata[0];
-                       }
-                       count = parent->bytes / sizeof(hammer2_blockref_t);
-                       break;
-               case HAMMER2_BREF_TYPE_VOLUME:
-                       KKASSERT(parent->data != NULL);
-                       base = &hmp->voldata.sroot_blockset.blockref[0];
-                       count = HAMMER2_SET_COUNT;
-                       break;
-               case HAMMER2_BREF_TYPE_FREEMAP:
-                       KKASSERT(parent->data != NULL);
-                       base = &hmp->voldata.freemap_blockset.blockref[0];
-                       count = HAMMER2_SET_COUNT;
-                       break;
-               default:
-                       panic("hammer2_chain_create: unrecognized "
-                             "blockref type: %d",
-                             parent->bref.type);
-                       count = 0;
-                       break;
-               }
-
                KKASSERT((nchain->flags & HAMMER2_CHAIN_DELETED) == 0);
                KKASSERT(parent->refs > 0);
 
@@ -2617,10 +2694,6 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                                     nchain->bref.key, nchain->bref.keybits,
                                     nchain->bref.type, nchain->bytes);
                parent = NULL;
-#if 0
-               hammer2_chain_insert(above, nchain, HAMMER2_CHAIN_INSERT_SPIN |
-                                                   HAMMER2_CHAIN_INSERT_LIVE);
-#endif
 
                if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
                        hammer2_chain_ref(nchain);
@@ -2630,60 +2703,17 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
        }
 
        /*
-        * We have to unlock ochain before we can mark nchain modified to
-        * avoid deadlocking due to the media blocks currently being the
-        * same.
-        *
-        * Assert that (data == NULL) to catch any extra locks that might
-        * have been present.  Extra locks are not allowed.
+        * Unconditionally set MOVED to force the parent blockrefs to
+        * update, and adjust update_tid below nchain so nchain's
+        * blockrefs are updated with the new attachment.
         */
-       oflags = ochain->flags;
-       hammer2_chain_unlock(ochain);
-       KKASSERT((ochain->flags & HAMMER2_CHAIN_EMBEDDED) ||
-                ochain->data == NULL);
-
-       if (oflags & HAMMER2_CHAIN_INITIAL)
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
-
-       /*
-        * WARNING!  We should never resolve DATA to device buffers
-        *           (XXX allow it if the caller did?), and since
-        *           we currently do not have the logical buffer cache
-        *           buffer in-hand to fix its cached physical offset
-        *           we also force the modify code to not COW it. XXX
-        *
-        * WARNING!  nchain should have only one manual ref plus additional
-        *           refs related to flags or the hammer2_chain_modify()
-        *           replacement could leave a ref hanging.
-        */
-       if (oflags & HAMMER2_CHAIN_MODIFIED) {
-               if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
-                       hammer2_chain_modify(trans, &nchain,
-                                            HAMMER2_MODIFY_OPTDATA |
-                                            HAMMER2_MODIFY_NOREALLOC |
-                                            HAMMER2_MODIFY_INPLACE);
-               } else if (oflags & HAMMER2_CHAIN_INITIAL) {
-                       hammer2_chain_modify(trans, &nchain,
-                                            HAMMER2_MODIFY_OPTDATA |
-                                            HAMMER2_MODIFY_INPLACE);
-               } else {
-                       hammer2_chain_modify(trans, &nchain,
-                                            HAMMER2_MODIFY_INPLACE);
-               }
-       } else {
-               if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
-                       ;
-               } else if (oflags & HAMMER2_CHAIN_INITIAL) {
-                       ;
-               } else {
-                       hammer2_chain_lock(nchain, HAMMER2_RESOLVE_ALWAYS);
-                       hammer2_chain_unlock(nchain);
-               }
+       if (nchain->core->update_tid < trans->sync_tid) {
+               spin_lock(&nchain->core->cst.spin);
+               if (nchain->core->update_tid < trans->sync_tid)
+                       nchain->core->update_tid = trans->sync_tid;
+               spin_unlock(&nchain->core->cst.spin);
        }
-       spin_lock(&nchain->core->cst.spin);
-       if (nchain->core->update_tid < trans->sync_tid)
-               nchain->core->update_tid = trans->sync_tid;
-       spin_unlock(&nchain->core->cst.spin);
+
        *chainp = nchain;
 }
 
@@ -2693,9 +2723,13 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
  * with a duplicate.  Atomicy is at the very-fine spin-lock level in
  * order to ensure that lookups do not race us.
  *
- * If the input chain is already marked deleted the duplicated chain will
- * also be marked deleted.  This case can occur when an inode is removed
- * from the filesystem but programs still have an open descriptor to it.
+ * If the old chain is already marked deleted the new chain will also be
+ * marked deleted.  This case can occur when an inode is removed from the
+ * filesystem but programs still have an open descriptor to it, and during
+ * flushes when the flush needs to operate on a chain that is deleted in
+ * the live view but still alive in the flush view.
+ *
+ * The new chain will be marked modified for the current transaction.
  */
 void
 hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
@@ -2706,22 +2740,19 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        hammer2_chain_t *nchain;
        hammer2_chain_core_t *above;
        size_t bytes;
-       int oflags;
 
+       /*
+        * Note that we do not have to call setsubmod on ochain, calling it
+        * on nchain is sufficient.
+        */
        ochain = *chainp;
-       oflags = ochain->flags;
        hmp = ochain->hmp;
 
-#if 0
-       /*
-        * Shortcut DELETED case if possible (only if delete_tid already
-        * matches the transaction id).
-        */
-       if ((oflags & HAMMER2_CHAIN_DELETED) &&
-           ochain->delete_tid == trans->sync_tid) {
-               return;
+       ochain->debug_reason += 0x1000;
+       if ((ochain->debug_reason & 0xF000) > 0x1000) {
+               kprintf("ochain %p\n", ochain);
+               Debugger("shit2");
        }
-#endif
 
        /*
         * First create a duplicate of the chain structure.
@@ -2737,9 +2768,28 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        bytes = (hammer2_off_t)1 <<
                (int)(ochain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
        nchain->bytes = bytes;
-       nchain->modify_tid = trans->sync_tid;
+
+       /*
+        * Duplicate inherits ochain's live state including its modification
+        * state.  This function disposes of the original.  Because we are
+        * doing this in-place under the same parent the block array
+        * inserted/deleted state does not change.
+        *
+        * The caller isn't expected to make further modifications of ochain
+        * but set the FORCECOW bit anyway, just in case it does.  If ochain
+        * was previously marked FORCECOW we also flag nchain FORCECOW
+        * (used during hardlink splits).
+        *
+        * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
+        *       hammer2_chain_alloc()
+        */
        nchain->data_count += ochain->data_count;
        nchain->inode_count += ochain->inode_count;
+       nchain->modify_tid = ochain->modify_tid;
+       atomic_set_int(&nchain->flags,
+                      ochain->flags & (HAMMER2_CHAIN_INITIAL |
+                                       HAMMER2_CHAIN_FORCECOW));
+       atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
 
        /*
         * Lock nchain so both chains are now locked (extremely important
@@ -2767,14 +2817,15 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        spin_lock(&above->cst.spin);
        KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);
 
-       if (oflags & HAMMER2_CHAIN_DELETED) {
+       if (ochain->flags & HAMMER2_CHAIN_DELETED) {
                atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
-               nchain->delete_tid = trans->sync_tid;
-               /*nchain->delete_gen = ++trans->delete_gen;*/
+               /* very important to inherit ochain's delete_tid */
+               KKASSERT(ochain->delete_tid >= nchain->modify_tid);
+               nchain->delete_tid = ochain->delete_tid;
                hammer2_chain_insert(above, nchain, 0);
        } else {
+               KKASSERT(trans->sync_tid >= ochain->modify_tid);
                ochain->delete_tid = trans->sync_tid;
-               /*ochain->delete_gen = ++trans->delete_gen;*/
                atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
                atomic_add_int(&above->live_count, -1);
                hammer2_chain_insert(above, nchain, HAMMER2_CHAIN_INSERT_LIVE);
@@ -2794,18 +2845,15 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        KKASSERT(ochain->bp == NULL);
 
        /*
-        * Fixup nchain
+        * Finish fixing up nchain.  A new block will be allocated if
+        * crossing a synchronization point (meta-data only).
         */
-       if (oflags & HAMMER2_CHAIN_INITIAL)
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
-
-
        if (nchain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                hammer2_chain_modify(trans, &nchain,
                                     HAMMER2_MODIFY_OPTDATA |
                                     HAMMER2_MODIFY_NOREALLOC |
                                     HAMMER2_MODIFY_INPLACE);
-       } else if (oflags & HAMMER2_CHAIN_INITIAL) {
+       } else if (nchain->flags & HAMMER2_CHAIN_INITIAL) {
                hammer2_chain_modify(trans, &nchain,
                                     HAMMER2_MODIFY_OPTDATA |
                                     HAMMER2_MODIFY_INPLACE);
@@ -2816,17 +2864,20 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        hammer2_chain_drop(nchain);
 
        /*
-        * Unconditionally set the MOVED and update_tid bit to force
-        * update of parent bref and indirect blockrefs during flush.
+        * Unconditionally set MOVED to force the parent blockrefs to
+        * update, and adjust update_tid below nchain so nchain's
+        * blockrefs are updated with the new attachment.
         */
        if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
                atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
                hammer2_chain_ref(nchain);
        }
-       spin_lock(&nchain->core->cst.spin);
-       if (nchain->core->update_tid < trans->sync_tid)
-               nchain->core->update_tid = trans->sync_tid;
-       spin_unlock(&nchain->core->cst.spin);
+       if (nchain->core->update_tid < trans->sync_tid) {
+               spin_lock(&nchain->core->cst.spin);
+               if (nchain->core->update_tid < trans->sync_tid)
+                       nchain->core->update_tid = trans->sync_tid;
+               spin_unlock(&nchain->core->cst.spin);
+       }
        hammer2_chain_setsubmod(trans, nchain);
        *chainp = nchain;
 }
@@ -3277,10 +3328,12 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * recursively.
         */
        /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/
-       spin_lock(&ichain->core->cst.spin);
-       if (ichain->core->update_tid < trans->sync_tid)
-               ichain->core->update_tid = trans->sync_tid;
-       spin_unlock(&ichain->core->cst.spin);
+       if (ichain->core->update_tid < trans->sync_tid) {
+               spin_lock(&ichain->core->cst.spin);
+               if (ichain->core->update_tid < trans->sync_tid)
+                       ichain->core->update_tid = trans->sync_tid;
+               spin_unlock(&ichain->core->cst.spin);
+       }
        hammer2_chain_setsubmod(trans, ichain);
 
        /*
@@ -3626,10 +3679,11 @@ hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
        KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
        spin_lock(&chain->above->cst.spin);
 
+       KKASSERT(trans->sync_tid >= chain->modify_tid);
        chain->delete_tid = trans->sync_tid;
-       /*chain->delete_gen = ++trans->delete_gen;*/
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
        atomic_add_int(&chain->above->live_count, -1);
+       ++chain->above->generation;
 
        if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
                hammer2_chain_ref(chain);
@@ -3637,13 +3691,13 @@ hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
        }
        spin_unlock(&chain->above->cst.spin);
 
-       /*
-        * Mark the underlying block as possibly being free unless WILLDUP
-        * is set.  Duplication can occur in many situations, particularly
-        * when chains are moved to indirect blocks.
-        */
-       if ((flags & HAMMER2_DELETE_WILLDUP) == 0)
+       if (flags & HAMMER2_DELETE_WILLDUP)
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FORCECOW);
+
+       if ((chain->flags & HAMMER2_CHAIN_FORCECOW) == 0) {
                hammer2_freemap_free(trans, chain->hmp, &chain->bref, 0);
+               chain->bref.data_off &= ~HAMMER2_OFF_MASK_RADIX;
+       }
        hammer2_chain_setsubmod(trans, chain);
 }
 
@@ -3867,12 +3921,6 @@ hammer2_base_delete(hammer2_chain_t *chain,
        hammer2_key_t key_next;
        int i;
 
-#if 0
-       kprintf("base delete %p.%d %016jx/%d\n",
-               child, child->bref.type,
-               child->bref.key, child->bref.keybits);
-#endif
-
        /*
         * Delete element.  Expect the element to exist.
         *
@@ -3920,11 +3968,6 @@ hammer2_base_insert(hammer2_chain_t *parent,
        int l;
        int u = 1;
 
-#if 0
-       kprintf("base insert %p.%d %016jx/%d\n",
-               child, child->bref.type,
-               child->bref.key, child->bref.keybits);
-#endif
        /*
         * Insert new element.  Expect the element to not already exist
         * unless we are replacing it.
index 0ed606a..9e5e13a 100644 (file)
@@ -866,9 +866,11 @@ struct hammer2_volume_data {
        hammer2_off_t   allocator_size;         /* 0060 Total data space */
        hammer2_off_t   allocator_free;         /* 0068 Free space */
        hammer2_off_t   allocator_beg;          /* 0070 Initial allocations */
-       hammer2_tid_t   mirror_tid;             /* 0078 best committed tid */
+       hammer2_tid_t   mirror_tid;             /* 0078 committed tid (vol) */
        hammer2_tid_t   alloc_tid;              /* 0080 Alloctable modify tid */
-       hammer2_blockref_t reserved0088;        /* 0088-00C7 */
+       hammer2_tid_t   inode_tid;              /* 0088 Inode allocator tid */
+       hammer2_tid_t   freemap_tid;            /* 0090 committed tid (fmap) */
+       hammer2_tid_t   reserved0090[6];        /* 0098-00C7 */
 
        /*
         * Copyids are allocated dynamically from the copyexists bitmap.
index 36db9b9..3feaddc 100644 (file)
@@ -56,6 +56,7 @@ struct hammer2_flush_info {
        int             diddeferral;
        int             pass;
        int             cache_index;
+       int             domodify;
        struct h2_flush_deferral_list flush_list;
        hammer2_tid_t   sync_tid;       /* flush synchronization point */
        hammer2_tid_t   mirror_tid;     /* collect mirror TID updates */
@@ -117,7 +118,7 @@ void
 hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
 {
        hammer2_mount_t *hmp;
-       hammer2_trans_t *scan;
+       hammer2_trans_t *head;
 
        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
@@ -131,177 +132,95 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If multiple flushes are trying to run we have to
-                * wait until it is our turn, then set curflush to
-                * indicate that a flush is now pending (but not
-                * necessarily active yet).
+                * wait until it is our turn.  All flushes are serialized.
                 *
-                * NOTE: Do not set trans->blocked here.
+                * We queue ourselves and then wait to become the head
+                * of the queue, allowing all prior flushes to complete.
                 */
                ++hmp->flushcnt;
-               while (hmp->curflush != NULL) {
-                       lksleep(&hmp->curflush, &hmp->voldatalk,
-                               0, "h2multf", hz);
-               }
-               hmp->curflush = trans;
+               trans->sync_tid = hmp->voldata.alloc_tid++;
+               trans->real_tid = trans->sync_tid;
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
-
-               /*
-                * If we are a flush we have to wait for all transactions
-                * prior to our flush synchronization point to complete
-                * before we can start our flush.
-                *
-                * Most importantly, this includes bioq flushes.
-                *
-                * NOTE: Do not set trans->blocked here.
-                */
-               while (TAILQ_FIRST(&hmp->transq) != trans) {
-                       lksleep(&trans->sync_tid, &hmp->voldatalk,
-                               0, "h2syncw", hz);
+               if (TAILQ_FIRST(&hmp->transq) != trans) {
+                       trans->blocked = 1;
+                       while (trans->blocked) {
+                               lksleep(&trans->sync_tid, &hmp->voldatalk,
+                                       0, "h2multf", hz);
+                       }
                }
-
+       } else if (hmp->flushcnt == 0) {
                /*
-                * don't assign sync_tid until we become the running
-                * flush.  last_flush_tid and topo_flush_tid eare used
-                * to determine when a copy-on-write (aka delete-duplicate)
-                * is required.
+                * No flushes are pending, we can go.
                 */
+               TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                trans->sync_tid = hmp->voldata.alloc_tid;
-               hmp->voldata.alloc_tid += 2;
-               hmp->topo_flush_tid = trans->sync_tid;
+               trans->real_tid = trans->sync_tid;
 
-               /*
-                * Once we become the running flush we can wakeup anyone
-                * who blocked on us, up to the next flush.  That is,
-                * our flush can run concurrent with frontend operations.
-                */
-               scan = trans;
-               while ((scan = TAILQ_NEXT(scan, entry)) != NULL) {
-                       if (scan->flags & HAMMER2_TRANS_ISFLUSH)
-                               break;
-                       if (scan->blocked == 0)
-                               break;
-                       scan->blocked = 0;
-                       wakeup(&scan->blocked);
-               }
-       } else if ((flags & HAMMER2_TRANS_BUFCACHE) && hmp->curflush) {
-               /*
-                * We cannot block if we are the bioq thread.
-                *
-                * When possible we steal the flush's TID and flush buffers
-                * as part of the larger filesystem flush.  The flush will
-                * interlock against buffer cache transactions when INVFSYNC
-                * is set.
-                *
-                * NOTE: Transactions are not ordered by sync_tid on the
-                *       transq.  Append to avoid confusion.  Other waiting
-                *       flushes will have not added themselves to transq
-                *       yet.
-                */
-               TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
-               if ((scan = hmp->curflush) != NULL) {
-                       if (scan->flags & HAMMER2_TRANS_INVFSYNC) {
-                               trans->sync_tid = scan->sync_tid;
-                       } else {
-                               trans->sync_tid = hmp->voldata.alloc_tid++;
-                       }
-               } else {
-                       trans->sync_tid = hmp->voldata.alloc_tid++;
-               }
+               /* XXX improve/optimize inode allocation */
        } else {
                /*
-                * If this is a normal transaction and not a flush, or
-                * if this is a bioq transaction and no flush is pending,
-                * we can queue normally.
+                * One or more flushes are pending.  We insert after
+                * the current flush and may block.  We have priority
+                * over any flushes that are not the current flush.
                 *
-                * Normal transactions must block while a pending flush is
-                * waiting for prior transactions to complete.  Once the
-                * pending flush becomes active we can run concurrently
-                * with it.
+                * TRANS_BUFCACHE transactions cannot block.
                 */
-               TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
-               scan = TAILQ_FIRST(&hmp->transq);
-               if (hmp->curflush && hmp->curflush != scan) {
-                       trans->blocked = 1;
-                       while (trans->blocked) {
-                               lksleep(&trans->blocked, &hmp->voldatalk,
-                                       0, "h2trans", hz);
+               TAILQ_FOREACH(head, &hmp->transq, entry) {
+                       if (head->flags & HAMMER2_TRANS_ISFLUSH)
+                               break;
+               }
+               KKASSERT(head);
+               TAILQ_INSERT_AFTER(&hmp->transq, head, trans, entry);
+               trans->sync_tid = head->real_tid + 1;
+               trans->real_tid = trans->sync_tid;
+
+               if ((trans->flags & HAMMER2_TRANS_BUFCACHE) == 0) {
+                       if (TAILQ_FIRST(&hmp->transq) != head) {
+                               trans->blocked = 1;
+                               while (trans->blocked) {
+                                       lksleep(&trans->sync_tid,
+                                               &hmp->voldatalk, 0,
+                                               "h2multf", hz);
+                               }
                        }
                }
-               trans->sync_tid = hmp->voldata.alloc_tid++;
        }
+       if (flags & HAMMER2_TRANS_NEWINODE)
+               trans->inode_tid = hmp->voldata.inode_tid++;
        hammer2_voldata_unlock(hmp, 0);
 }
 
-/*
- * Clear the flag that allowed buffer cache flushes to steal the
- * main flush's transaction id and wait for any in-progress BC flushes
- * to finish.
- */
-void
-hammer2_trans_clear_invfsync(hammer2_trans_t *trans)
-{
-       hammer2_mount_t *hmp = trans->pmp->cluster.chains[0]->hmp;
-
-        hammer2_bioq_sync(trans->pmp);
-       atomic_clear_int(&trans->flags, HAMMER2_TRANS_INVFSYNC);
-       if (TAILQ_FIRST(&hmp->transq) != trans) {
-               hammer2_voldata_lock(hmp);
-               while (TAILQ_FIRST(&hmp->transq) != trans) {
-                       tsleep(&trans->sync_tid, 0, "h2flbw", 0);
-               }
-               hammer2_voldata_unlock(hmp, 0);
-       }
-       hammer2_bioq_sync(trans->pmp);
-       ++trans->sync_tid;
-       hmp->topo_flush_tid = trans->sync_tid;
-}
-
 void
 hammer2_trans_done(hammer2_trans_t *trans)
 {
        hammer2_mount_t *hmp;
+       hammer2_trans_t *head;
        hammer2_trans_t *scan;
-       int wasathead;
 
        hmp = trans->pmp->cluster.chains[0]->hmp;
 
+       /*
+        * Remove and adjust flushcnt
+        */
        hammer2_voldata_lock(hmp);
-       wasathead = (TAILQ_FIRST(&hmp->transq) == trans);
        TAILQ_REMOVE(&hmp->transq, trans, entry);
-
-       if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
+       if (trans->flags & HAMMER2_TRANS_ISFLUSH)
                --hmp->flushcnt;
-               if (hmp->flushcnt) {
-                       /*
-                        * If we were a flush then wakeup anyone waiting on
-                        * curflush (i.e. other flushes that want to run).
-                        */
-                       hmp->curflush = NULL;
-                       wakeup(&hmp->curflush);
-               } else {
-                       /*
-                        * Cycle the flush_tid.
-                        */
-                       hmp->curflush = NULL;
-               }
-               hmp->last_flush_tid = hmp->topo_flush_tid;
-               hmp->topo_flush_tid = HAMMER2_MAX_TID;
-       } else {
-               /*
-                * If we are not a flush but a flush is now at the head
-                * of the queue and we were previously blocking it,
-                * we can now unblock it.
-                *
-                * Special case where sync_tid == scan->sync_tid occurs
-                * when buffer flush is issued while a normal flush is
-                * running (and in the correct stager), which is typically
-                * semi-synchronous but not always.
-                */
-               if (hmp->flushcnt &&
-                   (scan = TAILQ_FIRST(&hmp->transq)) != NULL &&
-                   wasathead &&
-                   (scan->flags & HAMMER2_TRANS_ISFLUSH)) {
+
+       /*
+        * Unblock the head of the queue and any additional transactions
+        * up to the next flush.
+        */
+       head = TAILQ_FIRST(&hmp->transq);
+       if (head && head->blocked) {
+               head->blocked = 0;
+               wakeup(&head->sync_tid);
+
+               scan = TAILQ_NEXT(head, entry);
+               while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
+                       scan->blocked = 0;
                        wakeup(&scan->sync_tid);
+                       scan = TAILQ_NEXT(scan, entry);
                }
        }
        hammer2_voldata_unlock(hmp, 0);
@@ -357,6 +276,9 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
        info.cache_index = -1;
 
        core = chain->core;
+#if FLUSH_DEBUG
+       kprintf("CHAIN FLUSH trans %p.%016jx chain %p.%d mod %016jx upd %016jx\n", trans, trans->sync_tid, chain, chain->bref.type, chain->modify_tid, core->update_tid);
+#endif
 
        /*
         * Extra ref needed because flush_core expects it when replacing
@@ -460,20 +382,24 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
+       kprintf("PUSH   %p.%d %08x mod=%016jx del=%016jx mirror=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid, chain->bref.mirror_tid);
 #endif
+
        /*
         * Ignore chains modified beyond the current flush point.  These
-        * will be treated as if they did not exist.
+        * will be treated as if they did not exist.  Subchains with lower
+        * modify_tid's will still be accessible via other parents.
+        *
+        * (vchain and fchain are exceptions since they cannot be duplicated)
         */
-       if (chain->modify_tid > info->sync_tid)
+       if (chain->modify_tid > info->sync_tid &&
+           chain != &hmp->fchain && chain != &hmp->vchain) {
+               chain->debug_reason = (chain->debug_reason & ~255) | 5;
                return;
+       }
 
        core = chain->core;
 
-#if 0
-       kprintf("PUSH   %p.%d %08x mirror=%016jx\n", chain, chain->bref.type, chain->flags, chain->bref.mirror_tid);
-#endif
-
        /*
         * If update_tid triggers we recurse the flush and adjust the
         * blockrefs accordingly.
@@ -487,10 +413,13 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
         *       not fully updated and causes it to miss the non-DELETED
         *       path.
         */
-       if (chain->bref.mirror_tid < core->update_tid) {
+       if (chain->bref.mirror_tid < core->update_tid &&
+           chain->bref.mirror_tid < info->sync_tid) {
                hammer2_chain_t *saved_parent;
                hammer2_tid_t saved_mirror;
                hammer2_chain_layer_t *layer;
+               int saved_domodify;
+               int save_gen;
 
                /*
                 * Races will bump update_tid above trans->sync_tid causing
@@ -522,11 +451,18 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
                 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
                 *       because children can be physically removed during
                 *       the scan.
+                *
+                * NOTE: We would normally not care about insertions except
+                *       that some insertions might occur from the flush
+                *       itself, so loop on generation number changes.
                 */
                saved_parent = info->parent;
                saved_mirror = info->mirror_tid;
+               saved_domodify = info->domodify;
                info->parent = chain;
                info->mirror_tid = chain->bref.mirror_tid;
+               info->domodify = 0;
+               chain->debug_reason = (chain->debug_reason & ~255) | 6;
 
                if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                        if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
@@ -541,19 +477,47 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
                        info->diddeferral = 0;
                        spin_lock(&core->cst.spin);
                        KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
-                       TAILQ_FOREACH_REVERSE(layer, &core->layerq,
-                                             h2_layer_list, entry) {
-                               ++layer->refs;
-                               KKASSERT(layer->good == 0xABCD);
-                               RB_SCAN(hammer2_chain_tree, &layer->rbtree,
-                                       NULL, hammer2_chain_flush_scan1, info);
-                               --layer->refs;
-                               diddeferral += info->diddeferral;
-                       }
+                       do {
+                               save_gen = core->generation;
+                               TAILQ_FOREACH_REVERSE(layer, &core->layerq,
+                                                     h2_layer_list, entry) {
+                                       ++layer->refs;
+                                       KKASSERT(layer->good == 0xABCD);
+                                       RB_SCAN(hammer2_chain_tree,
+                                               &layer->rbtree,
+                                               NULL, hammer2_chain_flush_scan1,
+                                               info);
+                                       --layer->refs;
+                                       diddeferral += info->diddeferral;
+                               }
+                       } while (core->generation != save_gen);
                        spin_unlock(&core->cst.spin);
                }
 
-               KKASSERT(info->parent == chain);
+               /*
+                * Blockrefs are only updated on live chains.
+                *
+                * We are possibly causing a delete-duplicate from inside the
+                * flush itself.  The parent might be live or might have been
+                * deleted concurrently in a post-flush transaction.  If
+                * the parent was deleted our modified chain will also be
+                * marked deleted, but since it inherits the parent's
+                * delete_tid it will still appear to be 'live' for the
+                * purposes of the flush.
+                *
+                * There may also be a side-effect due to the freemap
+                * allocation.  See freemap_alloc()
+                */
+               if (info->domodify && chain->delete_tid > info->sync_tid) {
+                       hammer2_chain_modify(info->trans, &info->parent,
+                                            HAMMER2_MODIFY_NO_MODIFY_TID);
+                       if (info->parent != chain) {
+                               hammer2_chain_drop(chain);
+                               hammer2_chain_ref(info->parent);
+                       }
+                       chain = info->parent;
+               }
+               chain->debug_reason = (chain->debug_reason & ~255) | 7;
 
                /*
                 * Handle successfully flushed children who are in the MOVED
@@ -589,25 +553,17 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
                         * in scan2 to determine when a chain must be applied
                         * to the related block table.
                         */
-#if 0
-                       kprintf("chainA %p.%d set parent bref mirror_tid %016jx -> %016jx\n",
-                               info->parent, info->parent->bref.type,
-                               info->mirror_tid, info->parent->bref.mirror_tid);
-#endif
                        KKASSERT(info->parent->bref.mirror_tid <=
                                 info->mirror_tid);
-                       info->parent->bref.mirror_tid = info->mirror_tid;
+                       chain->bref.mirror_tid = info->mirror_tid;
                }
 
                /*
-                * chain may have been replaced.
+                * info->parent must not have been replaced again
                 */
-#if 0
-               if (info->parent != *chainp)
-                       kprintf("SWITCH PARENT %p->%p\n",
-                               *chainp, info->parent);
-#endif
-               chain = info->parent;
+               KKASSERT(info->parent == chain);
+
+               chain->debug_reason = (chain->debug_reason & ~255) | 8;
                *chainp = chain;
 
                hammer2_chain_layer_check_locked(chain->hmp, core);
@@ -615,10 +571,11 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
 
                info->mirror_tid = saved_mirror;
                info->parent = saved_parent;
+               info->domodify = saved_domodify;
                KKASSERT(chain->refs > 1);
        }
 
-#if 0
+#if FLUSH_DEBUG
        kprintf("POP    %p.%d\n", chain, chain->bref.type);
 #endif
 
@@ -632,6 +589,7 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
         * retried later after the deferrals are independently handled.
         */
        if (diddeferral) {
+               chain->debug_reason = (chain->debug_reason & ~255) | 99;
                if (hammer2_debug & 0x0008) {
                        kprintf("%*.*s} %p/%d %04x (deferred)",
                                info->depth, info->depth, "",
@@ -663,6 +621,7 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
         *        purposes.
         */
        if (chain->delete_tid <= info->sync_tid) {
+               chain->debug_reason = (chain->debug_reason & ~255) | 9;
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
@@ -717,8 +676,10 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                if (chain->bref.mirror_tid < info->sync_tid)
                        chain->bref.mirror_tid = info->sync_tid;
+               chain->debug_reason = (chain->debug_reason & ~255) | 10;
                return;
        }
+       chain->debug_reason = (chain->debug_reason & ~255) | 11;
 
        /*
         * Issue flush.
@@ -785,6 +746,7 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_FREEMAP:
                hammer2_modify_volume(hmp);
+               hmp->voldata.freemap_tid = chain->bref.mirror_tid;
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                /*
@@ -796,12 +758,13 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
                 */
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->voldata.mirror_tid < hmp->fchain.core->update_tid) {
+                   hmp->voldata.freemap_tid < hmp->fchain.core->update_tid) {
                        /* this will modify vchain as a side effect */
                        hammer2_chain_t *tmp = &hmp->fchain;
                        hammer2_chain_flush(info->trans, &tmp);
                        KKASSERT(tmp == &hmp->fchain);
                }
+               hmp->voldata.mirror_tid = chain->bref.mirror_tid;
 
                /*
                 * The volume header is flushed manually by the syncer, not
@@ -977,36 +940,44 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
        int diddeferral;
 
        /*
-        * We only need to recurse if MODIFIED is set or
-        * child->bref.mirror_tid has not caught up to update_tid.
+        * Child is beyond the flush synchronization zone, don't pursue.
+        * Remember that modifications generally delete-duplicate so if the
+        * sub-tree is dirty another child will get us there.  But not this
+        * one.
+        *
+        * Or MODIFIED is not set and child is already fully synchronized
+        * with its sub-tree.  Don't pursue.
         */
-       if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
-           child->bref.mirror_tid >= child->core->update_tid) {
+       if (child->modify_tid > trans->sync_tid) {
+               KKASSERT(child->delete_tid >= child->modify_tid);
+               child->debug_reason = (child->debug_reason & ~255) | 1;
                return (0);
        }
-       if (child->modify_tid > trans->sync_tid)
-               return (0);
-
-       hammer2_chain_ref(child);
-       spin_unlock(&parent->core->cst.spin);
 
        /*
+        * We must ref the child before unlocking the spinlock.
+        *
         * The caller has added a ref to the parent so we can temporarily
-        * unlock it in order to lock the child.  Re-check the flags before
-        * continuing.
+        * unlock it in order to lock the child.
         */
+       hammer2_chain_ref(child);
+       spin_unlock(&parent->core->cst.spin);
+
        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
 
        if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
-           child->bref.mirror_tid >= child->core->update_tid) {
-               hammer2_chain_unlock(child);
-               hammer2_chain_drop(child);
-               hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
-               spin_lock(&parent->core->cst.spin);
-               return (0);
+           (child->bref.mirror_tid >= child->core->update_tid ||
+            child->bref.mirror_tid >= info->sync_tid)) {
+               child->debug_reason = (child->debug_reason & ~255) | 2;
+               goto skip;
        }
+
+       /*
+        * Re-check the flags before continuing.
+        */
        if (child->modify_tid > trans->sync_tid) {
+               child->debug_reason = (child->debug_reason & ~255) | 3;
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
@@ -1014,6 +985,13 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
                return (0);
        }
 
+       if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
+           (child->bref.mirror_tid >= child->core->update_tid ||
+            child->bref.mirror_tid >= info->sync_tid)) {
+               child->debug_reason = (child->debug_reason & ~255) | 4;
+               goto skip;
+       }
+
        /*
         * The DESTROYED flag can only be initially set on an unreferenced
         * deleted inode and will propagate downward via the mechanic below.
@@ -1038,7 +1016,8 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
                 * at least sync_tid.  Parent's mirror_tid has not yet
                 * been updated.
                 *
-                * Vnode reclamation may have forced update_tid to MAX_TID.
+                * Vnode reclamation may have forced update_tid to MAX_TID
+                * (we do this because there was no transaction at the time).
                 * In this situation bring it down to something reasonable
                 * so the elements being destroyed can be retired.
                 */
@@ -1056,28 +1035,45 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
        diddeferral = info->diddeferral;
        ++info->depth;
        hammer2_chain_flush_core(info, &child);
-#if FLUSH_DEBUG
-       kprintf("flush_core_done parent=%p flags=%08x child=%p.%d %08x\n",
-               parent, parent->flags, child, child->bref.type, child->flags);
-#endif
+
        /*
         * NOTE: If child failed to fully synchronize, child's bref.mirror_tid
         *       will not have been updated.  Bumping diddeferral prevents
         *       the parent chain from updating bref.mirror_tid on the way
         *       back up in order to force a retry later.
         */
-       if (child->bref.mirror_tid < child->core->update_tid)
+       if (child->bref.mirror_tid < child->core->update_tid &&
+           child->bref.mirror_tid < info->sync_tid) {
                ++diddeferral;
+       }
 
        --info->depth;
        info->diddeferral += diddeferral;
-       hammer2_chain_unlock(child);
-       hammer2_chain_drop(child);
 
+skip:
+       /*
+        * Check the conditions that could cause SCAN2 to modify the parent.
+        * Modify the parent here instead of in SCAN2, which would cause
+        * rollup chicken-and-egg races.
+        */
+       if (child->delete_tid <= trans->sync_tid &&
+           child->delete_tid > parent->bref.mirror_tid &&
+           child->modify_tid <= parent->bref.mirror_tid) {
+               info->domodify = 1;
+       } else if (child->delete_tid > trans->sync_tid &&
+                  child->modify_tid > parent->bref.mirror_tid) {
+               info->domodify = 1;
+       }
+
+       /*
+        * Relock to continue the loop
+        */
+       hammer2_chain_unlock(child);
        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
+       hammer2_chain_drop(child);
+       KKASSERT(info->parent == parent);
 
        spin_lock(&parent->core->cst.spin);
-
        return (0);
 }
 
@@ -1102,6 +1098,10 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
  *       a rename).   So a chain marked for deletion is basically considered
  *       to be live until it is explicitly destroyed or until its ref-count
  *       reaches zero (also implying that MOVED and MODIFIED are clear).
+ *
+ * NOTE!  Info->parent will be locked but will only be instantiated/modified
+ *       if it is either MODIFIED or if scan1 determined that block table
+ *       updates will occur.
  */
 static int
 hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
@@ -1115,6 +1115,9 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
        int count;
        int ok;
 
+#if FLUSH_DEBUG
+       kprintf("SCAN2 %p.%d %08x mod=%016jx del=%016jx trans=%016jx\n", child, child->bref.type, child->flags, child->modify_tid, child->delete_tid, info->trans->sync_tid);
+#endif
        /*
         * Inodes with stale children that have been converted to DIRECTDATA
         * mode (file extension or hardlink conversion typically) need to
@@ -1145,6 +1148,7 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
         * a higher update_tid than we can set in the current flush.
         */
        if (child->modify_tid > trans->sync_tid) {
+               KKASSERT(child->delete_tid >= child->modify_tid);
                goto finalize;
        }
 
@@ -1159,6 +1163,7 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
         *     we need something better.
         */
        if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
+               KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
                goto finalize;
        }
 
@@ -1201,13 +1206,6 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
         *
         * XXX recursive deletions not optimized.
         */
-       hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_NO_MODIFY_TID);
-       if (info->parent != parent) {
-               /* extra ref from flush_core */
-               hammer2_chain_drop(info->parent);
-               info->parent = parent;
-               hammer2_chain_ref(info->parent);
-       }
 
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
@@ -1222,28 +1220,18 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
                 * set OPFLAG_DIRECTDATA to prevent the indirect and data
                 * blocks from syncing ot the hardlink pointer.
                 */
-#if 0
-               KKASSERT((parent->data->ipdata.op_flags &
-                         HAMMER2_OPFLAG_DIRECTDATA) == 0);
-#endif
-#if 0
-               if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
-                       base = NULL;
-               } else
-#endif
-               {
+               if (parent->data)
                        base = &parent->data->ipdata.u.blockset.blockref[0];
-                       count = HAMMER2_SET_COUNT;
-               }
+               else
+                       base = NULL;
+               count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-               if (parent->data) {
+               if (parent->data)
                        base = &parent->data->npdata[0];
-               } else {
+               else
                        base = NULL;
-                       KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
-               }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
@@ -1271,6 +1259,11 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
         * Otherwise, we need to be COUNTEDBREFS synchronized for the
         * hammer2_base_*() functions.
         */
+#if FLUSH_DEBUG
+       kprintf("SCAN2 base=%p pass=%d PARENT %p.%d DTID=%016jx SYNC=%016jx\n",
+               base,
+               info->pass, parent, parent->bref.type, parent->delete_tid, trans->sync_tid);
+#endif
        if (parent->delete_tid <= trans->sync_tid)
                base = NULL;
        else if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
@@ -1286,12 +1279,15 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
         * NOTE! Updates to a parent's blockref table do not adjust the
         *       parent's bref.modify_tid, only its bref.mirror_tid.
         *
+        *       SCAN1 has already put the parent in a modified state
+        *       so if it isn't we panic.
+        *
         * NOTE! chain->modify_tid vs chain->bref.modify_tid.  The chain's
         *       internal modify_tid is always updated based on creation
         *       or delete-duplicate.  However, the bref.modify_tid is NOT
         *       updated due to simple blockref updates.
         */
-#if 0
+#if FLUSH_DEBUG
        kprintf("chain %p->%p pass %d trans %016jx sync %p.%d %016jx/%d C=%016jx D=%016jx PMIRROR %016jx\n",
                parent, child,
                info->pass, trans->sync_tid,
@@ -1311,13 +1307,19 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
                 * (2) The creation occurred before or during the parent's
                 *     last block table synchronization.
                 */
+#if FLUSH_DEBUG
+               kprintf("S2A %p b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
+                       child, base, child->delete_tid, parent->bref.mirror_tid,
+                       child->modify_tid, parent->bref.mirror_tid);
+#endif
                ok = 1;
                if (base &&
                    child->delete_tid > parent->bref.mirror_tid &&
                    child->modify_tid <= parent->bref.mirror_tid) {
+                       KKASSERT(parent->modify_tid == trans->sync_tid);
                        hammer2_rollup_stats(parent, child, -1);
                        spin_lock(&above->cst.spin);
-#if 0
+#if FLUSH_DEBUG
                        kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx "
                                "flg=%08x %016jx/%d delete\n",
                                trans->sync_tid,
@@ -1346,11 +1348,11 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
                 */
                ok = 1;
                if (base &&
-                   child->modify_tid > parent->bref.mirror_tid &&
-                   child->delete_tid > trans->sync_tid) {
+                   child->modify_tid > parent->bref.mirror_tid) {
+                       KKASSERT(parent->modify_tid == trans->sync_tid);
                        hammer2_rollup_stats(parent, child, 1);
                        spin_lock(&above->cst.spin);
-#if 0
+#if FLUSH_DEBUG
                        kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx "
                                "flg=%08x %016jx/%d insert\n",
                                trans->sync_tid,
@@ -1371,13 +1373,9 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
        }
 
        if (info->mirror_tid < child->bref.mirror_tid) {
+               KKASSERT(child->bref.mirror_tid <= trans->sync_tid);
                info->mirror_tid = child->bref.mirror_tid;
        }
-       if ((parent->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
-            parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP) &&
-           hmp->voldata.mirror_tid < child->bref.mirror_tid) {
-               hmp->voldata.mirror_tid = child->bref.mirror_tid;
-       }
 
        /*
         * Only clear MOVED once all possible parents have been flushed.
@@ -1396,17 +1394,30 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
                spin_lock(&above->cst.spin);
                TAILQ_FOREACH(scan, &above->ownerq, core_entry) {
                        /*
-                        * Can't destroy the child until all parent's have
-                        * synchronized with its move.
+                        * Can't clear child's MOVED until all parents have
+                        * synchronized with it.
                         *
-                        * NOTE: A deleted parent will synchronize with a
-                        *       child's move without bothering to update
-                        *       its brefs.
+                        * Ignore our current parent (we use 'ok' from above),
+                        *
+                        * ignore any parents which have been deleted as-of
+                        * our transaction id (their block array doesn't get
+                        * updated).
                         */
                        if (scan == parent ||
                            scan->delete_tid <= trans->sync_tid)
                                continue;
-                       if (scan->bref.mirror_tid < child->modify_tid) {
+
+                       /*
+                        * parent not synchronized if child modified or
+                        * deleted after the parent's last sync point.
+                        *
+                        * (For the purpose of clearing the MOVED bit
+                        *  we do not restrict the tests to just flush
+                        *  transactions).
+                        */
+                       if (scan->bref.mirror_tid < child->modify_tid ||
+                           ((child->flags & HAMMER2_CHAIN_DELETED) &&
+                            scan->bref.mirror_tid < child->delete_tid)) {
                                if (hammer2_debug & 0x4000)
                                        kprintf("(fail scan %p %016jx/%016jx)",
                                                scan, scan->bref.mirror_tid,
@@ -1417,13 +1428,26 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
                if (hammer2_debug & 0x4000)
                        kprintf("\n");
                spin_unlock(&above->cst.spin);
+
+               /*
+                * Can we finally clear MOVED?
+                */
                if (ok) {
                        if (hammer2_debug & 0x4000)
                                kprintf("clear moved %p.%d %016jx/%d\n",
                                        child, child->bref.type,
                                        child->bref.key, child->bref.keybits);
-                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
-                       hammer2_chain_drop(child);      /* flag */
+                       if (child->modify_tid <= trans->sync_tid &&
+                           (child->delete_tid == HAMMER2_MAX_TID ||
+                            child->delete_tid <= trans->sync_tid)) {
+                               atomic_clear_int(&child->flags,
+                                                HAMMER2_CHAIN_MOVED);
+                               hammer2_chain_drop(child);      /* flag */
+                               KKASSERT((child->flags &
+                                               HAMMER2_CHAIN_MODIFIED) == 0);
+                       } else {
+                               kprintf("ok problem child %p %016jx/%016jx vs %016jx\n", child, child->modify_tid, child->delete_tid, trans->sync_tid);
+                       }
                } else {
                        if (hammer2_debug & 0x4000)
                                kprintf("keep  moved %p.%d %016jx/%d\n",
index fbb6445..230e828 100644 (file)
@@ -176,6 +176,12 @@ hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
  *
  * ip and bpref are only used as a heuristic to determine locality of
  * reference.  bref->key may also be used heuristically.
+ *
+ * WARNING! When called from a flush we have to use the 'live' sync_tid
+ *         and not the flush sync_tid.  The live sync_tid is the flush
+ *         sync_tid + 1.  That is, freemap allocations which occur during
+ *         a flush are not part of the flush.  Crash-recovery will restore
+ *         any lost allocations.
  */
 int
 hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_mount_t *hmp,
@@ -209,10 +215,14 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                hammer2_freemap_free(trans, hmp, bref, 0);
 
        /*
-        * Normal allocations
+        * Setting ISALLOCATING ensures correct operation even when the
+        * flusher itself is making allocations.
         */
        KKASSERT(bytes >= HAMMER2_MIN_ALLOC && bytes <= HAMMER2_MAX_ALLOC);
+       KKASSERT((trans->flags & HAMMER2_TRANS_ISALLOCATING) == 0);
        atomic_set_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
+       if (trans->flags & HAMMER2_TRANS_ISFLUSH)
+               ++trans->sync_tid;
 
        /*
         * Calculate the starting point for our allocation search.
@@ -275,6 +285,8 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_mount_t *hmp,
        hammer2_chain_unlock(parent);
 
        atomic_clear_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
+       if (trans->flags & HAMMER2_TRANS_ISFLUSH)
+               --trans->sync_tid;
 
        return (error);
 }
@@ -718,6 +730,12 @@ hammer2_freemap_iterate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
  * the moment we depend on the bulk freescan to actually free blocks.  It
  * will still call this routine with a non-zero how to stage possible frees
  * and to do the actual free.
+ *
+ * WARNING! When called from a flush we have to use the 'live' sync_tid
+ *         and not the flush sync_tid.  The live sync_tid is the flush
+ *         sync_tid + 1.  That is, freemap allocations which occur during
+ *         a flush are not part of the flush.  Crash-recovery will restore
+ *         any lost allocations.
  */
 void
 hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp,
@@ -756,11 +774,14 @@ hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp,
         * We can't free data allocated by newfs_hammer2.
         * Assert validity.
         */
+       KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
        if (data_off < hmp->voldata.allocator_beg)
                return;
-       KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
 
+       KKASSERT((trans->flags & HAMMER2_TRANS_ISALLOCATING) == 0);
        atomic_set_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
+       if (trans->flags & HAMMER2_TRANS_ISFLUSH)
+               ++trans->sync_tid;
 
        /*
         * Lookup the level1 freemap chain.  The chain must exist.
@@ -872,4 +893,6 @@ hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp,
        hammer2_chain_unlock(parent);
 
        atomic_clear_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
+       if (trans->flags & HAMMER2_TRANS_ISFLUSH)
+               --trans->sync_tid;
 }
index d341acf..2c40a9e 100644 (file)
@@ -632,14 +632,14 @@ retry:
         *
         * NOTE: nipdata will have chain's blockset data.
         */
-       chain->data->ipdata.inum = trans->sync_tid;
+       chain->data->ipdata.inum = trans->inode_tid;
        nip = hammer2_inode_get(dip->pmp, dip, chain);
        nipdata = &chain->data->ipdata;
 
        if (vap) {
                KKASSERT(trans->inodes_created == 0);
                nipdata->type = hammer2_get_obj_type(vap->va_type);
-               nipdata->inum = trans->sync_tid;
+               nipdata->inum = trans->inode_tid;
                ++trans->inodes_created;
 
                switch (nipdata->type) {
@@ -747,6 +747,8 @@ hammer2_chain_refactor(hammer2_chain_t **chainp)
  * then delete it to placemark where the duplicate will go.  Both of
  * these use the inode number for (lhc) (the key), generating the
  * invisible filename.
+ *
+ * The original ochain is deleted.
  */
 static
 hammer2_chain_t *
@@ -789,7 +791,7 @@ retry:
                hammer2_chain_unlock(nchain);
                nchain = NULL;
                *errorp = ENOSPC;
-#if 1
+#if 0
                Debugger("X3");
 #endif
        }
@@ -831,7 +833,7 @@ retry:
        }
 
        /*
-        * Use chain as a placeholder for (lhc), delete it and replace
+        * Use nchain as a placeholder for (lhc), delete it and replace
         * it with our duplication.
         *
         * Gain a second lock on ochain for the duplication function to
@@ -839,7 +841,7 @@ retry:
         *
         * This is a bit messy.
         */
-       hammer2_chain_delete(trans, nchain, HAMMER2_DELETE_WILLDUP);
+       hammer2_chain_delete(trans, nchain, 0);
        hammer2_chain_lock(ochain, HAMMER2_RESOLVE_ALWAYS);
        tmp = ochain;
        bref = tmp->bref;
@@ -851,7 +853,8 @@ retry:
        hammer2_chain_unlock(nchain);   /* no longer needed */
 
        /*
-        * Now set chain to our duplicate and modify it appropriately.
+        * Now set nchain to our duplicate and modify it appropriately.
+        * Note that this may result in a delete-duplicate.
         *
         * Directory entries are inodes but this is a hidden hardlink
         * target.  The name isn't used but to ease debugging give it
@@ -860,7 +863,7 @@ retry:
        nchain = tmp;
        tmp = NULL;     /* safety */
 
-       hammer2_chain_modify(trans, &nchain, HAMMER2_MODIFY_ASSERTNOCOPY);
+       hammer2_chain_modify(trans, &nchain, 0);
        nipdata = &nchain->data->ipdata;
        ksnprintf(nipdata->filename, sizeof(nipdata->filename),
                  "0x%016jx", (intmax_t)nipdata->inum);
index f78a093..78656f6 100644 (file)
@@ -510,7 +510,7 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
                return(EINVAL);
        pfs->name[sizeof(pfs->name) - 1] = 0;   /* ensure 0-termination */
 
-       hammer2_trans_init(&trans, ip->pmp, 0);
+       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE);
        nip = hammer2_inode_create(&trans, hmp->sroot, NULL, NULL,
                                     pfs->name, strlen(pfs->name),
                                     &nchain, &error);
@@ -569,7 +569,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
 
        hammer2_vfs_sync(ip->pmp->mp, MNT_WAIT);
 
-       hammer2_trans_init(&trans, ip->pmp, 0);
+       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE);
        parent = hammer2_inode_lock_ex(ip);
        error = hammer2_chain_snapshot(&trans, &parent, pfs);
        hammer2_inode_unlock_ex(ip, parent);
index 68a1e6c..ab7f86d 100644 (file)
@@ -457,8 +457,6 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
-               hmp->last_flush_tid = 0;
-               hmp->topo_flush_tid = HAMMER2_MAX_TID;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
 
@@ -511,6 +509,9 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                        return error;
                }
 
+               hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
+               hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
+
                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
@@ -728,11 +729,15 @@ hammer2_write_thread(void *arg)
 
                while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
                        /*
-                        * dummy bio for synchronization
+                        * dummy bio for synchronization.  The transaction
+                        * must be reinitialized.
                         */
                        if (bio->bio_buf == NULL) {
                                bio->bio_flags |= BIO_DONE;
                                wakeup(bio);
+                               hammer2_trans_done(&trans);
+                               hammer2_trans_init(&trans, pmp,
+                                                  HAMMER2_TRANS_BUFCACHE);
                                continue;
                        }
 
@@ -1430,7 +1435,8 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
                hammer2_voldata_lock(hmp);
                if (((hmp->vchain.flags | hmp->fchain.flags) &
                     HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->vchain.core->update_tid > hmp->voldata.mirror_tid) {
+                   hmp->vchain.core->update_tid > hmp->voldata.mirror_tid ||
+                   hmp->fchain.core->update_tid > hmp->voldata.freemap_tid) {
                        hammer2_voldata_unlock(hmp, 0);
                        hammer2_vfs_sync(mp, MNT_WAIT);
                        hammer2_vfs_sync(mp, MNT_WAIT);
@@ -1438,9 +1444,12 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags)
                        hammer2_voldata_unlock(hmp, 0);
                }
                if (hmp->pmp_count == 0) {
-                       if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                           hmp->vchain.core->update_tid >
-                            hmp->voldata.mirror_tid) {
+                       if (((hmp->vchain.flags | hmp->fchain.flags) &
+                            HAMMER2_CHAIN_MODIFIED) ||
+                           (hmp->vchain.core->update_tid >
+                            hmp->voldata.mirror_tid) ||
+                           (hmp->fchain.core->update_tid >
+                            hmp->voldata.freemap_tid)) {
                                kprintf("hammer2_unmount: chains left over "
                                        "after final sync\n");
                                if (hammer2_debug & 0x0010)
@@ -1668,22 +1677,18 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
         * The reclamation code interlocks with the sync list's token
         * (by removing the vnode from the scan list) before unlocking
         * the inode, giving us time to ref the inode.
-        *
-        * INVFSYNC allows the bioq to drain using the flush transaction's
-        * TID while the ISFLUSH transaction is active.
         */
        /*flags = VMSC_GETVP;*/
        flags = 0;
        if (waitfor & MNT_LAZY)
                flags |= VMSC_ONEPASS;
 
-       hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH |
-                                            HAMMER2_TRANS_INVFSYNC);
-
        /*
-        * vfsync the vnodes.  XXX This will also catch writes for
-        * transactions beyond the current flush.  XXX
+        * Initialize a normal transaction and sync everything out, then
+        * wait for pending I/O to finish (so it gets a transaction id
+        * that the meta-data flush will catch).
         */
+       hammer2_trans_init(&info.trans, pmp, 0);
        info.error = 0;
        info.waitfor = MNT_NOWAIT;
        vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
@@ -1693,23 +1698,13 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                    vsyncscan(mp, flags, hammer2_sync_scan2, &info);
 
        }
+       hammer2_trans_done(&info.trans);
+       hammer2_bioq_sync(info.trans.pmp);
 
        /*
-        * Wait for pending work to complete, then clear INVFSYNC.  Further
-        * buffer cache synchronization is allowed to run concurrently but
-        * will use a higher sync_tid and is not part of the normal flush.
-        *
-        * These waits are important because
+        * Start the flush transaction and flush all meta-data.
         */
-       hammer2_trans_clear_invfsync(&info.trans);
-
-#if 0
-       if (waitfor == MNT_WAIT) {
-               /* XXX */
-       } else {
-               /* XXX */
-       }
-#endif
+       hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH);
 
        total_error = 0;
        for (i = 0; i < pmp->cluster.nchains; ++i) {
@@ -1738,7 +1733,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
 
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->vchain.core->update_tid > hmp->voldata.mirror_tid ||
+                   hmp->fchain.core->update_tid > hmp->voldata.freemap_tid ||
                    force_fchain) {
                        /* this will also modify vchain as a side effect */
                        chain = &hmp->fchain;
@@ -1808,8 +1803,8 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                if (error)
                        total_error = error;
        }
-
        hammer2_trans_done(&info.trans);
+
        return (total_error);
 }
 
index 647b1db..4e4ee29 100644 (file)
@@ -380,17 +380,15 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
        ip = VTOI(vp);
 
        /*
-        * TRANS_ISFLUSH allocates two transaction ids, one for concurrent
-        * buffer syncs, and one for our flush.
-        *
-        * WARNING: The vfsync interacts with the buffer cache and might
-        *          block, we can't hold the inode lock and we can't
-        *          have a flush transaction pending.
+        * WARNING: Cannot use TRANS_ISFLUSH for partial syncs.
         */
-       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_ISFLUSH |
-                                           HAMMER2_TRANS_INVFSYNC);
+#if 0
+       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_ISFLUSH);
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
        hammer2_trans_clear_invfsync(&trans);
+#endif
+       hammer2_trans_init(&trans, ip->pmp, 0);
+       vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
 
        /*
         * Calling chain_flush here creates a lot of duplicative
@@ -406,9 +404,12 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
        if (ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MTIME))
                hammer2_inode_fsync(&trans, ip, &chain);
 
+#if 0
+       /* XXX creates discontinuity w/modify_tid */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                hammer2_chain_flush(&trans, &chain);
        }
+#endif
        hammer2_inode_unlock_ex(ip, chain);
        hammer2_trans_done(&trans);
 
@@ -1396,7 +1397,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
        name_len = ncp->nc_nlen;
 
        hammer2_chain_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, 0);
+       hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
                                   name, name_len, &chain, &error);
        if (error) {
@@ -1507,7 +1508,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         */
        ip = VTOI(ap->a_vp);
        hammer2_chain_memory_wait(ip->pmp);
-       hammer2_trans_init(&trans, ip->pmp, 0);
+       hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE);
 
        chain = hammer2_inode_lock_ex(ip);
        error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1);
@@ -1564,7 +1565,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        hammer2_chain_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, 0);
+       hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
 
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
                                   name, name_len, &nchain, &error);
@@ -1608,7 +1609,7 @@ hammer2_vop_nmknod(struct vop_nmknod_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        hammer2_chain_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, 0);
+       hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
 
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
                                   name, name_len, &nchain, &error);
@@ -1652,7 +1653,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        hammer2_chain_memory_wait(dip->pmp);
-       hammer2_trans_init(&trans, dip->pmp, 0);
+       hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
 
        ap->a_vap->va_type = VLNK;      /* enforce type */