hammer2 - refactor filesystem sync 2/N
author Matthew Dillon <dillon@apollo.backplane.com>
Fri, 9 Nov 2018 01:10:07 +0000 (17:10 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Wed, 5 Dec 2018 18:28:39 +0000 (10:28 -0800)
* Flesh out the flush partitioning code, fixing a number of issues.

* Refactor hammer2_inode_lock() and add hammer2_inode_lock4() to
  interlock against flushes.  This is handled by blocking inode locks
  against SYNCQ, and reordering the inode to the front of the SYNCQ list
  in order to unblock as quickly as possible as the filesystem sync
  progresses.  The result should be relatively few frontend stalls
  during a filesystem sync.

* Disable resource caps for the moment.  The synchronous operations
  used to keep resource limits from blowing out break the current
  inode_lock*() code and allow vnode deadlocks to occur.

* To avoid deadlocks, the filesystem sync currently must clear SYNCQ
  before locking the inode & vnode, and if it cannot lock a vnode it
  must continue on with the next inode and then restart.  Retried
  vnodes introduce a short delay to give the frontend time to work
  through the blocking operation.

  This is necessary because the kernel locks vnodes before entering the
  H2 frontend, and we cannot safely unlock/relock them to work around
  this.  Nor do we necessarily even have full knowledge of which vnodes
  the current thread has locked.

* Does not yet guarantee complete filesystem consistency on-crash.

12 files changed:
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_admin.c
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_strategy.c
sys/vfs/hammer2/hammer2_synchro.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c
sys/vfs/hammer2/hammer2_xops.c

index 173ef70..99f3901 100644 (file)
@@ -361,12 +361,12 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_DELETED          0x00000010      /* deleted chain */
 #define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial create */
 #define HAMMER2_CHAIN_UPDATE           0x00000040      /* need parent update */
-#define HAMMER2_CHAIN_DEFERRED         0x00000080      /* flush depth defer */
+#define HAMMER2_CHAIN_UNUSED0080       0x00000080
 #define HAMMER2_CHAIN_TESTEDGOOD       0x00000100      /* crc tested good */
 #define HAMMER2_CHAIN_ONFLUSH          0x00000200      /* on a flush list */
 #define HAMMER2_CHAIN_FICTITIOUS       0x00000400      /* unsuitable for I/O */
 #define HAMMER2_CHAIN_VOLUMESYNC       0x00000800      /* needs volume sync */
-#define HAMMER2_CHAIN_DELAYED          0x00001000      /* delayed flush */
+#define HAMMER2_CHAIN_UNUSED1000       0x00001000
 #define HAMMER2_CHAIN_COUNTEDBREFS     0x00002000      /* block table stats */
 #define HAMMER2_CHAIN_ONRBTREE         0x00004000      /* on parent RB tree */
 #define HAMMER2_CHAIN_ONLRU            0x00008000      /* on LRU list */
@@ -778,6 +778,12 @@ typedef struct hammer2_inode hammer2_inode_t;
 #define HAMMER2_INODE_DELETING         0x1000  /* sync interlock, chain topo */
 #define HAMMER2_INODE_CREATING         0x2000  /* sync interlock, chain topo */
 #define HAMMER2_INODE_SYNCQ_WAKEUP     0x4000  /* sync interlock wakeup */
+#define HAMMER2_INODE_SYNCQ_PASS2      0x8000  /* force retry delay */
+
+#define HAMMER2_INODE_DIRTY            (HAMMER2_INODE_MODIFIED |       \
+                                        HAMMER2_INODE_DIRTYDATA |      \
+                                        HAMMER2_INODE_DELETING |       \
+                                        HAMMER2_INODE_CREATING)
 
 int hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2);
 RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
@@ -789,15 +795,14 @@ RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
 struct hammer2_trans {
        uint32_t                flags;
        uint32_t                sync_wait;
-       int                     fticks;                 /* FPENDING start */
 };
 
 typedef struct hammer2_trans hammer2_trans_t;
 
 #define HAMMER2_TRANS_ISFLUSH          0x80000000      /* flush code */
 #define HAMMER2_TRANS_BUFCACHE         0x40000000      /* bio strategy */
-#define HAMMER2_TRANS_UNUSED20         0x20000000
-#define HAMMER2_TRANS_FPENDING         0x10000000      /* flush pending */
+#define HAMMER2_TRANS_SIDEQ            0x20000000      /* run sideq */
+#define HAMMER2_TRANS_COPYQ            0x10000000      /* sideq->syncq */
 #define HAMMER2_TRANS_WAITING          0x08000000      /* someone waiting */
 #define HAMMER2_TRANS_MASK             0x00FFFFFF      /* count mask */
 
@@ -812,6 +817,7 @@ typedef struct hammer2_trans hammer2_trans_t;
 #define HAMMER2_FLUSH_TOP              0x0001
 #define HAMMER2_FLUSH_ALL              0x0002
 #define HAMMER2_FLUSH_INODE_STOP       0x0004  /* stop at sub-inode */
+#define HAMMER2_FLUSH_FSSYNC           0x0008  /* part of filesystem sync */
 
 
 /*
@@ -1091,6 +1097,7 @@ typedef struct hammer2_xop_group hammer2_xop_group_t;
 #define HAMMER2_XOP_STRATEGY           0x00000002
 #define HAMMER2_XOP_INODE_STOP         0x00000004
 #define HAMMER2_XOP_VOLHDR             0x00000008
+#define HAMMER2_XOP_FSSYNC             0x00000010
 
 /*
  * Global (per partition) management structure, represents a hard block
@@ -1120,7 +1127,6 @@ struct hammer2_dev {
        hammer2_chain_t vchain;         /* anchor chain (topology) */
        hammer2_chain_t fchain;         /* anchor chain (freemap) */
        struct spinlock list_spin;
-       struct h2_flush_list flushq;    /* flush seeds */
        struct hammer2_pfs *spmp;       /* super-root pmp for transactions */
        struct lock     vollk;          /* lockmgr lock */
        struct lock     bulklk;         /* bulkfree operation lock */
@@ -1458,6 +1464,8 @@ int hammer2_signal_check(time_t *timep);
 const char *hammer2_error_str(int error);
 
 void hammer2_inode_lock(hammer2_inode_t *ip, int how);
+void hammer2_inode_lock4(hammer2_inode_t *ip1, hammer2_inode_t *ip2,
+                       hammer2_inode_t *ip3, hammer2_inode_t *ip4);
 void hammer2_inode_unlock(hammer2_inode_t *ip);
 hammer2_chain_t *hammer2_inode_chain(hammer2_inode_t *ip, int clindex, int how);
 hammer2_chain_t *hammer2_inode_chain_and_parent(hammer2_inode_t *ip,
@@ -1514,8 +1522,9 @@ hammer2_inode_t *hammer2_inode_create_normal(hammer2_inode_t *pip,
 hammer2_inode_t *hammer2_inode_create_pfs(hammer2_pfs_t *spmp,
                        const uint8_t *name, size_t name_len,
                        int *errorp);
+int hammer2_inode_chain_ins(hammer2_inode_t *ip);
 int hammer2_inode_chain_sync(hammer2_inode_t *ip);
-int hammer2_inode_chain_flush(hammer2_inode_t *ip);
+int hammer2_inode_chain_flush(hammer2_inode_t *ip, int flags);
 int hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen);
 int hammer2_dirent_create(hammer2_inode_t *dip, const char *name,
                        size_t name_len, hammer2_key_t inum, uint8_t type);
@@ -1575,8 +1584,8 @@ int hammer2_chain_scan(hammer2_chain_t *parent,
                                int *firstp, int flags);
 
 int hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
-                               hammer2_pfs_t *pmp, int methods,
-                               hammer2_key_t key, int keybits,
+                               hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
+                               int methods, hammer2_key_t key, int keybits,
                                int type, size_t bytes, hammer2_tid_t mtid,
                                hammer2_off_t dedup_off, int flags);
 void hammer2_chain_rename(hammer2_chain_t **parentp,
@@ -1603,7 +1612,8 @@ void hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp);
 
 void hammer2_base_delete(hammer2_chain_t *parent,
                                hammer2_blockref_t *base, int count,
-                               hammer2_chain_t *chain);
+                               hammer2_chain_t *chain,
+                               hammer2_blockref_t *obref);
 void hammer2_base_insert(hammer2_chain_t *parent,
                                hammer2_blockref_t *base, int count,
                                hammer2_chain_t *chain,
@@ -1619,8 +1629,10 @@ void hammer2_delayed_flush(hammer2_chain_t *chain);
  * hammer2_trans.c
  */
 void hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags);
+void hammer2_trans_setflags(hammer2_pfs_t *pmp, uint32_t flags);
+void hammer2_trans_clearflags(hammer2_pfs_t *pmp, uint32_t flags);
 hammer2_tid_t hammer2_trans_sub(hammer2_pfs_t *pmp);
-void hammer2_trans_done(hammer2_pfs_t *pmp, int quicksideq);
+void hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags);
 hammer2_tid_t hammer2_trans_newinum(hammer2_pfs_t *pmp);
 void hammer2_trans_assert_strategy(hammer2_pfs_t *pmp);
 void hammer2_dedup_record(hammer2_chain_t *chain, hammer2_io_t *dio,
@@ -1728,6 +1740,10 @@ void hammer2_xop_lookup(hammer2_xop_t *xop, void *scratch, int clindex);
 void hammer2_xop_delete(hammer2_xop_t *xop, void *scratch, int clindex);
 void hammer2_xop_inode_mkdirent(hammer2_xop_t *xop, void *scratch, int clindex);
 void hammer2_xop_inode_create(hammer2_xop_t *xop, void *scratch, int clindex);
+void hammer2_xop_inode_create_det(hammer2_xop_t *xop,
+                               void *scratch, int clindex);
+void hammer2_xop_inode_create_ins(hammer2_xop_t *xop,
+                               void *scratch, int clindex);
 void hammer2_xop_inode_destroy(hammer2_xop_t *xop, void *scratch, int clindex);
 void hammer2_xop_inode_chain_sync(hammer2_xop_t *xop, void *scratch,
                                int clindex);
@@ -1793,6 +1809,8 @@ extern hammer2_xop_desc_t hammer2_lookup_desc;
 extern hammer2_xop_desc_t hammer2_delete_desc;
 extern hammer2_xop_desc_t hammer2_inode_mkdirent_desc;
 extern hammer2_xop_desc_t hammer2_inode_create_desc;
+extern hammer2_xop_desc_t hammer2_inode_create_det_desc;
+extern hammer2_xop_desc_t hammer2_inode_create_ins_desc;
 extern hammer2_xop_desc_t hammer2_inode_destroy_desc;
 extern hammer2_xop_desc_t hammer2_inode_chain_sync_desc;
 extern hammer2_xop_desc_t hammer2_inode_unlinkall_desc;
index 4bd9c48..24063db 100644 (file)
@@ -54,6 +54,8 @@ H2XOPDESCRIPTOR(lookup);
 H2XOPDESCRIPTOR(delete);
 H2XOPDESCRIPTOR(inode_mkdirent);
 H2XOPDESCRIPTOR(inode_create);
+H2XOPDESCRIPTOR(inode_create_det);
+H2XOPDESCRIPTOR(inode_create_ins);
 H2XOPDESCRIPTOR(inode_destroy);
 H2XOPDESCRIPTOR(inode_chain_sync);
 H2XOPDESCRIPTOR(inode_unlinkall);
index 1efa337..ba35bc5 100644 (file)
@@ -68,6 +68,13 @@ static hammer2_chain_t *hammer2_chain_create_indirect(
                hammer2_chain_t *parent,
                hammer2_key_t key, int keybits,
                hammer2_tid_t mtid, int for_type, int *errorp);
+static void hammer2_chain_rename_obref(hammer2_chain_t **parentp,
+               hammer2_chain_t *chain, hammer2_tid_t mtid,
+               int flags, hammer2_blockref_t *obref);
+static int hammer2_chain_delete_obref(hammer2_chain_t *parent,
+               hammer2_chain_t *chain,
+               hammer2_tid_t mtid, int flags,
+               hammer2_blockref_t *obref);
 static hammer2_io_t *hammer2_chain_drop_data(hammer2_chain_t *chain);
 static hammer2_chain_t *hammer2_combined_find(
                hammer2_chain_t *parent,
@@ -127,14 +134,15 @@ hammer2_chain_assert_no_data(hammer2_chain_t *chain)
 }
 
 /*
- * Make a chain visible to the flusher.  The flusher needs to be able to
- * do flushes of subdirectory chains or single files so it does a top-down
- * recursion using the ONFLUSH flag for the recursion.  It locates MODIFIED
- * or UPDATE chains and flushes back up the chain to the volume root.
- *
- * This routine sets ONFLUSH upward until it hits the volume root.  For
- * simplicity we ignore PFSROOT boundaries whos rules can be complex.
- * Extra ONFLUSH flagging doesn't hurt the filesystem.
+ * Make a chain visible to the flusher.  The flusher operates using a top-down
+ * recursion based on the ONFLUSH flag.  It locates MODIFIED and UPDATE chains,
+ * flushes them, and updates blocks back to the volume root.
+ *
+ * This routine sets the ONFLUSH flag upward from the triggering chain until
+ * it hits an inode root or the volume root.  Inode chains serve as inflection
+ * points, requiring the flusher to bridge across trees.  Inodes include
+ * regular inodes, PFS roots (pmp->iroot), and the media super root
+ * (spmp->iroot).
  */
 void
 hammer2_chain_setflush(hammer2_chain_t *chain)
@@ -145,6 +153,8 @@ hammer2_chain_setflush(hammer2_chain_t *chain)
                hammer2_spin_sh(&chain->core.spin);
                while ((chain->flags & HAMMER2_CHAIN_ONFLUSH) == 0) {
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
+                       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
+                               break;
                        if ((parent = chain->parent) == NULL)
                                break;
                        hammer2_spin_sh(&parent->core.spin);
@@ -488,31 +498,7 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
        hammer2_dev_t *hmp;
        hammer2_chain_t *parent;
        hammer2_chain_t *rdrop;
-#if 0
-       hammer2_io_t *dio;
-#endif
 
-#if 0
-       /*
-        * On last drop if there is no parent and data_off is good (at
-        * least does not represent the volume root), the modified chain
-        * is probably going to be destroyed.  We have to make sure that
-        * the data area is not registered for dedup.
-        *
-        * XXX removed. In fact, we do not have to make sure that the
-        *     data area is not registered for dedup.  The data area
-        *     can, in fact, still be used for dedup because it is
-        *     still allocated in the freemap and the underlying I/O
-        *     will still be flushed.
-        */
-       if (chain->parent == NULL &&
-           (chain->flags & HAMMER2_CHAIN_MODIFIED) &&
-           (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) {
-               hmp = chain->hmp;
-               hammer2_io_dedup_delete(hmp, chain->bref.type,
-                                       chain->bref.data_off, chain->bytes);
-       }
-#endif
        /*
         * We need chain's spinlock to interlock the sub-tree test.
         * We already have chain's mutex, protecting chain->parent.
@@ -538,11 +524,6 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
                                    HAMMER2_CHAIN_MODIFIED)) {
                        if (atomic_cmpset_int(&chain->refs, 1, 0)) {
                                hammer2_spin_unex(&chain->core.spin);
-#if 0
-                               dio = hammer2_chain_drop_data(chain, 0);
-                               if (dio)
-                                       hammer2_io_bqrelse(&dio);
-#endif
                                hammer2_chain_assert_no_data(chain);
                                hammer2_mtx_unlock(&chain->lock);
                                chain = NULL;
@@ -571,22 +552,18 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
 
                /*
-                * If the chain has children we must still flush the chain.
+                * If the chain has children we must propagate the DESTROY
+                * flag downward and rip the disconnected topology apart.
+                * This is accomplished by calling hammer2_flush() on the
+                * chain.
+                *
                 * Any dedup is already handled by the underlying DIO, so
                 * we do not have to specifically flush it here.
-                *
-                * In the case where it has children, the DESTROY flag test
-                * in the flush code will prevent unnecessary flushes of
-                * MODIFIED chains that are not flagged DEDUP so don't worry
-                * about that here.
                 */
                if (chain->core.chain_count) {
-                       /*
-                        * Put on flushq (should ensure refs > 1), retry
-                        * the drop.
-                        */
                        hammer2_spin_unex(&chain->core.spin);
-                       hammer2_delayed_flush(chain);
+                       hammer2_flush(chain, HAMMER2_FLUSH_TOP |
+                                            HAMMER2_FLUSH_ALL);
                        hammer2_mtx_unlock(&chain->lock);
 
                        return(chain);  /* retry drop */
@@ -610,9 +587,6 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
        }
 
        /* spinlock still held */
-#if 0
-       dio = NULL;
-#endif
 
        /*
         * If any children exist we must leave the chain intact with refs == 0.
@@ -689,9 +663,6 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
                /*
                 * Success
                 */
-#if 0
-               dio = hammer2_chain_drop_data(chain, 1);
-#endif
                hammer2_chain_assert_no_data(chain);
 
                /*
@@ -727,10 +698,6 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
                }
                hammer2_spin_unex(&chain->core.spin);
                hammer2_mtx_unlock(&chain->lock);
-#if 0
-               if (dio)
-                       hammer2_io_bqrelse(&dio);
-#endif
 
                /*
                 * lru_list hysteresis (see above for depth overrides).
@@ -767,14 +734,6 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
        if (parent) {
                hammer2_spin_ex(&parent->core.spin);
                if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
-#if 0
-                       /* XXX remove, don't try to drop data on fail */
-                       hammer2_spin_unex(&parent->core.spin);
-                       dio = hammer2_chain_drop_data(chain, 0);
-                       hammer2_spin_unex(&chain->core.spin);
-                       if (dio)
-                               hammer2_io_bqrelse(&dio);
-#endif
                        /*
                         * 1->0 transition failed, retry.
                         */
@@ -860,11 +819,6 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, int depth)
         */
        KKASSERT((chain->flags & (HAMMER2_CHAIN_UPDATE |
                                  HAMMER2_CHAIN_MODIFIED)) == 0);
-#if 0
-       dio = hammer2_chain_drop_data(chain, 1);
-       if (dio)
-               hammer2_io_bqrelse(&dio);
-#endif
 
        /*
         * Once chain resources are gone we can use the now dead chain
@@ -3290,6 +3244,12 @@ done:
  * insertion, based on the supplied key/keybits, and may involve creating
  * indirect blocks and moving other chains around via delete/duplicate.
  *
+ * This call can be made with parent == NULL as long as methods is not -1.
+ * hmp must also be supplied in this situation (otherwise hmp is extracted
+ * from the supplied parent).  The chain is then created detached from the
+ * topology.  A later call with both parent and chain can be made to
+ * attach it.
+ *
  * THE CALLER MUST HAVE ALREADY PROPERLY SEEKED (*parentp) TO THE INSERTION
  * POINT SANS ANY REQUIRED INDIRECT BLOCK CREATIONS DUE TO THE ARRAY BEING
  * FULL.  This typically means that the caller is creating the chain after
@@ -3320,11 +3280,10 @@ done:
  */
 int
 hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
-                    hammer2_pfs_t *pmp, int methods,
+                    hammer2_dev_t *hmp, hammer2_pfs_t *pmp, int methods,
                     hammer2_key_t key, int keybits, int type, size_t bytes,
                     hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags)
 {
-       hammer2_dev_t *hmp;
        hammer2_chain_t *chain;
        hammer2_chain_t *parent;
        hammer2_blockref_t *base;
@@ -3338,9 +3297,11 @@ hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
         * Topology may be crossing a PFS boundary.
         */
        parent = *parentp;
-       KKASSERT(hammer2_mtx_owned(&parent->lock));
-       KKASSERT(parent->error == 0);
-       hmp = parent->hmp;
+       if (parent) {
+               KKASSERT(hammer2_mtx_owned(&parent->lock));
+               KKASSERT(parent->error == 0);
+               hmp = parent->hmp;
+       }
        chain = *chainp;
 
        if (chain == NULL) {
@@ -3453,9 +3414,13 @@ hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
        if (flags & HAMMER2_INSERT_PFSROOT)
                chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
 
+       if (parent == NULL)
+               goto skip;
+
        /*
         * Calculate how many entries we have in the blockref array and
-        * determine if an indirect block is required.
+        * determine if an indirect block is required when inserting into
+        * the parent.
         */
 again:
        if (--maxloops == 0)
@@ -3544,6 +3509,10 @@ again:
                goto again;
        }
 
+       /*
+        * fall through if parent, or skip to here if no parent.
+        */
+skip:
        if (chain->flags & HAMMER2_CHAIN_DELETED)
                kprintf("Inserting deleted chain @%016jx\n",
                        chain->bref.key);
@@ -3554,11 +3523,13 @@ again:
        if (chain->parent != NULL)
                panic("hammer2: hammer2_chain_create: chain already connected");
        KKASSERT(chain->parent == NULL);
-       KKASSERT(parent->core.live_count < count);
-       hammer2_chain_insert(parent, chain,
-                            HAMMER2_CHAIN_INSERT_SPIN |
-                            HAMMER2_CHAIN_INSERT_LIVE,
-                            0);
+       if (parent) {
+               KKASSERT(parent->core.live_count < count);
+               hammer2_chain_insert(parent, chain,
+                                    HAMMER2_CHAIN_INSERT_SPIN |
+                                    HAMMER2_CHAIN_INSERT_LIVE,
+                                    0);
+       }
 
        if (allocated) {
                /*
@@ -3610,7 +3581,8 @@ again:
         * already set in the chain (so it won't recurse up to set it in the
         * parent).
         */
-       hammer2_chain_setflush(parent);
+       if (parent)
+               hammer2_chain_setflush(parent);
 
 done:
        *chainp = chain;
@@ -3696,8 +3668,8 @@ hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
                KKASSERT(parent->refs > 0);
                KKASSERT(parent->error == 0);
 
-               hammer2_chain_create(parentp, &chain,
-                                    chain->pmp, HAMMER2_METH_DEFAULT,
+               hammer2_chain_create(parentp, &chain, NULL, chain->pmp,
+                                    HAMMER2_METH_DEFAULT,
                                     bref->key, bref->keybits, bref->type,
                                     chain->bytes, mtid, 0, flags);
                KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
@@ -3705,6 +3677,54 @@ hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
        }
 }
 
+/*
+ * This works in tandem with delete_obref() to install a blockref in
+ * (typically) an indirect block that is associated with the chain being
+ * moved to *parentp.
+ *
+ * The reason we need this function is that the caller needs to maintain
+ * the blockref as it was, and not generate a new blockref for what might
+ * be a modified chain.  Otherwise stuff will leak into the flush that
+ * the flush code's FLUSH_INODE_STOP flag is unable to catch.
+ *
+ * It is EXTREMELY important that we properly set CHAIN_BMAPUPD and
+ * CHAIN_UPDATE.  We must set BMAPUPD (and UPDATE) if the bref does not
+ * match, and we must clear CHAIN_UPDATE (that was likely set by the
+ * chain_rename) if it does.  Otherwise we can end up in a situation where
+ * H2 is unable to clean up the in-memory chain topology.
+ *
+ * The reason for this is that flushes do not generally flush through
+ * BREF_TYPE_INODE chains and depend on a hammer2_inode_t queued to syncq
+ * or sideq to properly flush and dispose of the related inode chain's flags.
+ * Situations where the inode is not actually modified by the frontend,
+ * but where we have to move the related chains around as we insert or cleanup
+ * indirect blocks, can leave us with a 'dirty' (non-disposable) in-memory
+ * inode chain that does not have a hammer2_inode_t associated with it.
+ */
+void
+hammer2_chain_rename_obref(hammer2_chain_t **parentp, hammer2_chain_t *chain,
+                          hammer2_tid_t mtid, int flags,
+                          hammer2_blockref_t *obref)
+{
+       hammer2_chain_rename(parentp, chain, mtid, flags);
+
+       if (obref->type) {
+               hammer2_blockref_t *tbase;
+               int tcount;
+
+               KKASSERT((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0);
+               hammer2_chain_modify(*parentp, mtid, 0, 0);
+               tbase = hammer2_chain_base_and_count(*parentp, &tcount);
+               hammer2_base_insert(*parentp, tbase, tcount, chain, obref);
+               if (bcmp(obref, &chain->bref, sizeof(chain->bref))) {
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPUPD |
+                                                     HAMMER2_CHAIN_UPDATE);
+               } else {
+                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
+               }
+       }
+}
+
 /*
  * Helper function for deleting chains.
  *
@@ -3715,7 +3735,8 @@ hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
  */
 static int
 _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
-                            hammer2_tid_t mtid, int flags)
+                            hammer2_tid_t mtid, int flags,
+                            hammer2_blockref_t *obref)
 {
        hammer2_dev_t *hmp;
        int error = 0;
@@ -3812,7 +3833,7 @@ _hammer2_chain_delete_helper(hammer2_chain_t *parent, hammer2_chain_t *chain,
                 * undone.  XXX split update possible w/delete in middle?
                 */
                if (base) {
-                       hammer2_base_delete(parent, base, count, chain);
+                       hammer2_base_delete(parent, base, count, chain, obref);
                }
                hammer2_spin_unex(&parent->core.spin);
                hammer2_spin_unex(&chain->core.spin);
@@ -3940,6 +3961,9 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
        /*
         * Pre-modify the parent now to avoid having to deal with error
         * processing if we tried to later (in the middle of our loop).
+        *
+        * We are going to be moving bref's around, the indirect blocks
+        * cannot be in an initial state.  Do not pass MODIFY_OPTDATA.
         */
        *errorp = hammer2_chain_modify(parent, mtid, 0, 0);
        if (*errorp) {
@@ -3947,6 +3971,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                        *errorp, hammer2_error_str(*errorp));
                return NULL;
        }
+       KKASSERT((parent->flags & HAMMER2_CHAIN_INITIAL) == 0);
 
        /*hammer2_chain_modify(&parent, HAMMER2_MODIFY_OPTDATA);*/
        base = hammer2_chain_base_and_count(parent, &count);
@@ -4065,8 +4090,11 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
         * We have to mark it modified to allocate its block, but use
         * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
         * it won't be acted upon by the flush code.
+        *
+        * XXX remove OPTDATA, we need a fully initialized indirect block to
+        * be able to move the original blockref.
         */
-       *errorp = hammer2_chain_modify(ichain, mtid, 0, HAMMER2_MODIFY_OPTDATA);
+       *errorp = hammer2_chain_modify(ichain, mtid, 0, 0);
        if (*errorp) {
                kprintf("hammer2_alloc_indirect: error %08x %s\n",
                        *errorp, hammer2_error_str(*errorp));
@@ -4074,6 +4102,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                hammer2_chain_drop(ichain);
                return NULL;
        }
+       KKASSERT((ichain->flags & HAMMER2_CHAIN_INITIAL) == 0);
 
        /*
         * Iterate the original parent and move the matching brefs into
@@ -4200,9 +4229,10 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                 *          in-progress can continue at the current parent
                 *          and will be able to properly find its next key.
                 */
-               error = hammer2_chain_delete(parent, chain, mtid, 0);
+               error = hammer2_chain_delete_obref(parent, chain, mtid, 0,
+                                                  &bcopy);
                KKASSERT(error == 0);
-               hammer2_chain_rename(&ichain, chain, mtid, 0);
+               hammer2_chain_rename_obref(&ichain, chain, mtid, 0, &bcopy);
                hammer2_chain_unlock(chain);
                hammer2_chain_drop(chain);
                KKASSERT(parent->refs > 0);
@@ -4421,12 +4451,13 @@ hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
                        sub = NULL;     /* safety */
                        continue;
                }
-               error = hammer2_chain_delete(chain, sub,
-                                            sub->bref.modify_tid, 0);
+               error = hammer2_chain_delete_obref(chain, sub,
+                                                  sub->bref.modify_tid, 0,
+                                                  &bcopy);
                KKASSERT(error == 0);
-               hammer2_chain_rename(&parent, sub,
+               hammer2_chain_rename_obref(&parent, sub,
                                     sub->bref.modify_tid,
-                                    HAMMER2_INSERT_SAMEPARENT);
+                                    HAMMER2_INSERT_SAMEPARENT, &bcopy);
                hammer2_chain_unlock(sub);
                hammer2_chain_drop(sub);
                hammer2_spin_ex(&chain->core.spin);
@@ -5066,7 +5097,47 @@ hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
                KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
                         chain->parent == parent);
                error = _hammer2_chain_delete_helper(parent, chain,
-                                                    mtid, flags);
+                                                    mtid, flags, NULL);
+       }
+
+       /*
+        * Permanent deletions mark the chain as destroyed.
+        *
+        * NOTE: We do not setflush the chain unless the deletion is
+        *       permanent, since the deletion of a chain does not actually
+        *       require it to be flushed.
+        */
+       if (error == 0) {
+               if (flags & HAMMER2_DELETE_PERMANENT) {
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
+                       hammer2_chain_setflush(chain);
+               }
+       }
+
+       return error;
+}
+
+static int
+hammer2_chain_delete_obref(hammer2_chain_t *parent, hammer2_chain_t *chain,
+                    hammer2_tid_t mtid, int flags,
+                    hammer2_blockref_t *obref)
+{
+       int error = 0;
+
+       KKASSERT(hammer2_mtx_owned(&chain->lock));
+
+       /*
+        * Nothing to do if already marked.
+        *
+        * We need the spinlock on the core whose RBTREE contains chain
+        * to protect against races.
+        */
+       obref->type = 0;
+       if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
+               KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0 &&
+                        chain->parent == parent);
+               error = _hammer2_chain_delete_helper(parent, chain,
+                                                    mtid, flags, obref);
        }
 
        /*
@@ -5289,7 +5360,8 @@ found:
 void
 hammer2_base_delete(hammer2_chain_t *parent,
                    hammer2_blockref_t *base, int count,
-                   hammer2_chain_t *chain)
+                   hammer2_chain_t *chain,
+                   hammer2_blockref_t *obref)
 {
        hammer2_blockref_t *elm = &chain->bref;
        hammer2_blockref_t *scan;
@@ -5369,6 +5441,8 @@ hammer2_base_delete(hammer2_chain_t *parent,
                break;
        }
 
+       if (obref)
+               *obref = *scan;
        bzero(scan, sizeof(*scan));
 
        /*
@@ -5827,15 +5901,13 @@ hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
  *
  * The flags passed in are LOOKUP flags, not RESOLVE flags.
  *
- * If we are unable to locate the hardlink, INVAL is returned and *chainp
- * will be NULL.  *parentp may still be set error or not, or NULL if the
- * parent itself could not be resolved.
+ * If we are unable to locate the inode, HAMMER2_ERROR_EIO is returned and
+ * *chainp will be NULL.  *parentp may still be set, error or not, or NULL
+ * if the parent itself could not be resolved.
  *
- * Caller must pass-in a valid (and locked), or NULL *parentp or *chainp.
- * This function replaces *parentp and *chainp.  Generally speaking, if
- * the caller found a directory entry and wants the inode, the caller should
- * pass the parent,chain representing the directory entry so this function
- * can dispose of it properly to avoid any possible lock order reversals.
+ * The caller may pass in a locked *parentp and/or *chainp, or neither.
+ * They will be unlocked and released by this function.  The *parentp and
+ * *chainp representing the located inode are returned locked.
  */
 int
 hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum,
@@ -5845,6 +5917,7 @@ hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum,
        hammer2_chain_t *parent;
        hammer2_chain_t *rchain;
        hammer2_key_t key_dummy;
+       hammer2_inode_t *ip;
        int resolve_flags;
        int error;
 
@@ -5865,6 +5938,30 @@ hammer2_chain_inode_find(hammer2_pfs_t *pmp, hammer2_key_t inum,
                *parentp = NULL;
        }
 
+       /*
+        * Be very careful, this is a backend function and we CANNOT
+        * lock any frontend inode structure we find.  But we have to
+        * look the inode up this way first in case it exists but is
+        * detached from the radix tree.
+        */
+       ip = hammer2_inode_lookup(pmp, inum);
+       if (ip) {
+               *chainp = hammer2_inode_chain_and_parent(ip, clindex,
+                                                      parentp,
+                                                      resolve_flags);
+               hammer2_inode_drop(ip);
+               if (*chainp)
+                       return 0;
+               /*
+                * *chainp is NULL here, so there is no chain to unlock
+                * or drop; only *parentp may still need to be released.
+                */
+               if (*parentp) {
+                       hammer2_chain_unlock(*parentp);
+                       hammer2_chain_drop(*parentp);
+                       *parentp = NULL;
+               }
+       }
+
        /*
         * Inodes hang off of the iroot (bit 63 is clear, differentiating
         * inodes from root directory entries in the key lookup).
index 7e9a0dd..258096f 100644 (file)
@@ -67,7 +67,6 @@
 struct hammer2_flush_info {
        hammer2_chain_t *parent;
        int             depth;
-       long            diddeferral;
        int             error;                  /* cumulative error */
        int             flags;
 #ifdef HAMMER2_SCAN_DEBUG
@@ -77,15 +76,13 @@ struct hammer2_flush_info {
        long            scan_onf_count;
        long            scan_del_count;
        long            scan_btype[7];
-       long            flushq_count;
 #endif
-       struct h2_flush_list flushq;
        hammer2_chain_t *debug;
 };
 
 typedef struct hammer2_flush_info hammer2_flush_info_t;
 
-static void hammer2_flush_core(hammer2_flush_info_t *info,
+static int hammer2_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain, int flags);
 static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
 
@@ -101,30 +98,19 @@ hammer2_trans_manage_init(hammer2_pfs_t *pmp)
  * Transaction support for any modifying operation.  Transactions are used
  * in the pmp layer by the frontend and in the spmp layer by the backend.
  *
- * 0                   - Normal transaction, interlocked against flush
- *                       transaction.
+ * 0                   - Normal transaction.  No interlock currently.
  *
- * TRANS_ISFLUSH       - Flush transaction, interlocked against normal
- *                       transaction.
+ * TRANS_ISFLUSH       - Flush transaction.  Interlocks against other flush
+ *                       transactions.
  *
- * TRANS_BUFCACHE      - Buffer cache transaction, no interlock.
+ * TRANS_BUFCACHE      - Buffer cache transaction.  No interlock.
+ *
+ * TRANS_SIDEQ         - Run the sideq (only tested in trans_done())
  *
  * Initializing a new transaction allocates a transaction ID.  Typically
  * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
  * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
  * media target.  The latter mode is used by the recovery code.
- *
- * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
- * other is a set of any number of concurrent filesystem operations.  We
- * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
- * or we can have <running_flush> + <concurrent_fs_ops>.
- *
- * During a flush, new fs_ops are only blocked until the fs_ops prior to
- * the flush complete.  The new fs_ops can then run concurrent with the flush.
- *
- * Buffer-cache transactions operate as fs_ops but never block.  A
- * buffer-cache flush will run either before or after the current pending
- * flush depending on its state.
  */
 void
 hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
@@ -140,11 +126,7 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
 
                if (flags & HAMMER2_TRANS_ISFLUSH) {
                        /*
-                        * Requesting flush transaction.  This interlocks
-                        * only with other flush transactions.  Note that
-                        * non-flush modifying transactions can run
-                        * concurrently, but will interlock on any inode
-                        * that are on the SYNCQ.
+                        * Interlock against other flush transactions.
                         */
                        if (oflags & HAMMER2_TRANS_ISFLUSH) {
                                nflags = oflags | HAMMER2_TRANS_WAITING;
@@ -152,15 +134,6 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                        } else {
                                nflags = (oflags | flags) + 1;
                        }
-#if 0
-                       if (oflags & HAMMER2_TRANS_MASK) {
-                               nflags = oflags | HAMMER2_TRANS_FPENDING |
-                                                 HAMMER2_TRANS_WAITING;
-                               dowait = 1;
-                       } else {
-                               nflags = (oflags | flags) + 1;
-                       }
-#endif
                } else if (flags & HAMMER2_TRANS_BUFCACHE) {
                        /*
                         * Requesting strategy transaction from buffer-cache,
@@ -171,47 +144,36 @@ hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
                        nflags = (oflags | flags) + 1;
                } else {
                        /*
-                        * Requesting a normal modifying transaction.
-                        * Does not interlock with flushes.  Multiple
-                        * modifying transactions can run concurrently.
-                        * These do not mess with the on-media topology
-                        * above the inode.
+                        * Normal transaction.  We currently only interlock
+                        * against COPYQ.  We do not interlock against
+                        * BUFCACHE or ISFLUSH.  COPYQ is used to interlock
+                        * the transfer of SIDEQ into SYNCQ.
                         *
-                        * If a flush is pending for more than one second
-                        * but can't run because many modifying transactions
-                        * are active, we wait for the flush to be granted.
+                        * Note that vnode locks may be held going into
+                        * this call.
                         *
                         * NOTE: Remember that non-modifying operations
                         *       such as read, stat, readdir, etc, do
                         *       not use transactions.
                         */
-#if 0
-                       if ((oflags & HAMMER2_TRANS_FPENDING) &&
-                           (u_int)(ticks - pmp->trans.fticks) >= (u_int)hz) {
-                               nflags = oflags | HAMMER2_TRANS_WAITING;
-                               dowait = 1;
-                       } else if (oflags & HAMMER2_TRANS_ISFLUSH) {
+                       if (oflags & HAMMER2_TRANS_COPYQ) {
                                nflags = oflags | HAMMER2_TRANS_WAITING;
                                dowait = 1;
-                       } else
-#endif
-                       {
+                       } else {
                                nflags = (oflags | flags) + 1;
                        }
                }
                if (dowait)
                        tsleep_interlock(&pmp->trans.sync_wait, 0);
                if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
-                       if ((oflags & HAMMER2_TRANS_FPENDING) == 0 &&
-                           (nflags & HAMMER2_TRANS_FPENDING)) {
-                               pmp->trans.fticks = ticks;
-                       }
                        if (dowait == 0)
                                break;
                        tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
                               "h2trans", hz);
+                       /* retry */
                } else {
                        cpu_pause();
+                       /* retry */
                }
                /* retry */
        }
@@ -239,7 +201,33 @@ hammer2_trans_sub(hammer2_pfs_t *pmp)
 }
 
 void
-hammer2_trans_done(hammer2_pfs_t *pmp, int quicksideq)
+hammer2_trans_setflags(hammer2_pfs_t *pmp, uint32_t flags)
+{
+       atomic_set_int(&pmp->trans.flags, flags);
+}
+/*
+ * Atomically clear the bits given in (flags) from pmp->trans.flags using
+ * a compare-and-set retry loop.  If HAMMER2_TRANS_WAITING is one of the
+ * bits that actually transitions from set to clear, a wakeup is issued
+ * on pmp->trans.sync_wait.
+ */
+void
+hammer2_trans_clearflags(hammer2_pfs_t *pmp, uint32_t flags)
+{
+       uint32_t oflags;
+       uint32_t nflags;
+
+       for (;;) {
+               oflags = pmp->trans.flags;
+               cpu_ccfence();
+               nflags = oflags & ~flags;
+               if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
+                       if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
+                               wakeup(&pmp->trans.sync_wait);
+                       break;
+               }
+               cpu_pause();
+               /* cmpset failed (trans.flags != oflags), reload and retry */
+       }
+}
+
+void
+hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
 {
        uint32_t oflags;
        uint32_t nflags;
@@ -250,11 +238,11 @@ hammer2_trans_done(hammer2_pfs_t *pmp, int quicksideq)
         * due to potential deadlocks, so we have to deal with them from
         * inside other nominal modifying front-end transactions.
         */
-       if (quicksideq && pmp->sideq_count > (pmp->inum_count >> 3) && pmp->mp)
+       if ((flags & HAMMER2_TRANS_SIDEQ) &&
+           pmp->sideq_count > (pmp->inum_count >> 3) &&
+           pmp->mp) {
                speedup_syncer(pmp->mp);
-#if 0
-               hammer2_inode_run_sideq(pmp, 0);
-#endif
+       }
 
        /*
         * Clean-up the transaction
@@ -263,29 +251,17 @@ hammer2_trans_done(hammer2_pfs_t *pmp, int quicksideq)
                oflags = pmp->trans.flags;
                cpu_ccfence();
                KKASSERT(oflags & HAMMER2_TRANS_MASK);
-               if ((oflags & HAMMER2_TRANS_MASK) == 1) {
-                       /*
-                        * This was the last transaction
-                        */
-                       nflags = (oflags - 1) & ~(HAMMER2_TRANS_ISFLUSH |
-                                                 HAMMER2_TRANS_BUFCACHE |
-                                                 HAMMER2_TRANS_FPENDING |
-                                                 HAMMER2_TRANS_WAITING);
-               } else {
-                       /*
-                        * Still transactions pending
-                        */
-                       nflags = oflags - 1;
+
+               nflags = (oflags - 1) & ~flags;
+               if (flags & HAMMER2_TRANS_ISFLUSH) {
+                       nflags &= ~HAMMER2_TRANS_WAITING;
                }
                if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
-                       if ((nflags & HAMMER2_TRANS_MASK) == 0 &&
-                           (oflags & HAMMER2_TRANS_WAITING)) {
+                       if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
                                wakeup(&pmp->trans.sync_wait);
-                       }
                        break;
-               } else {
-                       cpu_pause();
                }
+               cpu_pause();
                /* retry */
        }
 }
@@ -318,38 +294,6 @@ hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
 #endif
 }
 
-
-/*
- * Chains undergoing destruction are removed from the in-memory topology.
- * To avoid getting lost these chains are placed on the delayed flush
- * queue which will properly dispose of them.
- *
- * We do this instead of issuing an immediate flush in order to give
- * recursive deletions (rm -rf, etc) a chance to remove more of the
- * hierarchy, potentially allowing an enormous amount of write I/O to
- * be avoided.
- *
- * NOTE: The flush code tests HAMMER2_CHAIN_DESTROY to differentiate
- *      between these chains and the deep-recursion requeue.
- */
-void
-hammer2_delayed_flush(hammer2_chain_t *chain)
-{
-       if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) {
-               hammer2_spin_ex(&chain->hmp->list_spin);
-               if ((chain->flags & (HAMMER2_CHAIN_DELAYED |
-                                    HAMMER2_CHAIN_DEFERRED)) == 0) {
-                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELAYED |
-                                                     HAMMER2_CHAIN_DEFERRED);
-                       TAILQ_INSERT_TAIL(&chain->hmp->flushq,
-                                         chain, flush_node);
-                       hammer2_chain_ref(chain);
-               }
-               hammer2_spin_unex(&chain->hmp->list_spin);
-               hammer2_voldata_modify(chain->hmp);
-       }
-}
-
 /*
  * Flush the chain and all modified sub-chains through the specified
  * synchronization point, propagating blockref updates back up.  As
@@ -383,7 +327,6 @@ hammer2_delayed_flush(hammer2_chain_t *chain)
 int
 hammer2_flush(hammer2_chain_t *chain, int flags)
 {
-       hammer2_chain_t *scan;
        hammer2_flush_info_t info;
        hammer2_dev_t *hmp;
        int loops;
@@ -397,7 +340,6 @@ hammer2_flush(hammer2_chain_t *chain, int flags)
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
-       TAILQ_INIT(&info.flushq);
        info.flags = flags & ~HAMMER2_FLUSH_TOP;
 
        /*
@@ -417,65 +359,10 @@ hammer2_flush(hammer2_chain_t *chain, int flags)
        loops = 0;
 
        for (;;) {
-               /*
-                * Move hmp->flushq to info.flushq if non-empty so it can
-                * be processed.
-                */
-               if (TAILQ_FIRST(&hmp->flushq) != NULL) {
-                       hammer2_spin_ex(&chain->hmp->list_spin);
-                       TAILQ_CONCAT(&info.flushq, &hmp->flushq, flush_node);
-                       hammer2_spin_unex(&chain->hmp->list_spin);
-               }
-
-               /*
-                * Unwind deep recursions which had been deferred.  This
-                * can leave the FLUSH_* bits set for these chains, which
-                * will be handled when we [re]flush chain after the unwind.
-                */
-               while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
-                       KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
-                       TAILQ_REMOVE(&info.flushq, scan, flush_node);
-#ifdef HAMMER2_SCAN_DEBUG
-                       ++info.flushq_count;
-#endif
-                       atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED |
-                                                      HAMMER2_CHAIN_DELAYED);
-
-                       /*
-                        * Now that we've popped back up we can do a secondary
-                        * recursion on the deferred elements.
-                        *
-                        * NOTE: hmp->flushq chains (marked DESTROY) must be
-                        *       handled unconditionally so they can be cleaned
-                        *       out.
-                        *
-                        * NOTE: hammer2_flush() may replace scan.
-                        */
-                       if (hammer2_debug & 0x0040)
-                               kprintf("deferred flush %p\n", scan);
-                       hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
-                       if (scan->error == 0) {
-                               if (scan->flags & HAMMER2_CHAIN_DESTROY) {
-                                       hammer2_flush(scan,
-                                                   flags |
-                                                   HAMMER2_FLUSH_TOP |
-                                                   HAMMER2_FLUSH_ALL);
-                               } else {
-                                       hammer2_flush(scan,
-                                                   flags & ~HAMMER2_FLUSH_TOP);
-                               }
-                       } else {
-                               info.error |= scan->error;
-                       }
-                       hammer2_chain_unlock(scan);
-                       hammer2_chain_drop(scan);/* ref from defer */
-               }
-
                /*
                 * [re]flush chain as the deep recursion may have generated
                 * additional modifications.
                 */
-               info.diddeferral = 0;
                if (info.parent != chain->parent) {
                        if (hammer2_debug & 0x0040) {
                                kprintf("LOST CHILD4 %p->%p "
@@ -486,12 +373,7 @@ hammer2_flush(hammer2_chain_t *chain, int flags)
                        info.parent = chain->parent;
                        hammer2_chain_ref(info.parent);
                }
-               hammer2_flush_core(&info, chain, flags);
-
-               /*
-                * Only loop if deep recursions have been deferred.
-                */
-               if (TAILQ_EMPTY(&info.flushq))
+               if (hammer2_flush_core(&info, chain, flags) == 0)
                        break;
 
                if (++loops % 1000 == 0) {
@@ -504,7 +386,7 @@ hammer2_flush(hammer2_chain_t *chain, int flags)
 #ifdef HAMMER2_SCAN_DEBUG
        if (info.scan_count >= 10)
        kprintf("hammer2_flush: scan_count %ld (%ld,%ld,%ld,%ld) "
-               "bt(%ld,%ld,%ld,%ld,%ld,%ld) flushq %ld\n",
+               "bt(%ld,%ld,%ld,%ld,%ld,%ld)\n",
                info.scan_count,
                info.scan_mod_count,
                info.scan_upd_count,
@@ -515,8 +397,7 @@ hammer2_flush(hammer2_chain_t *chain, int flags)
                info.scan_btype[3],
                info.scan_btype[4],
                info.scan_btype[5],
-               info.scan_btype[6],
-               info.flushq_count);
+               info.scan_btype[6]);
 #endif
        hammer2_chain_drop(chain);
        if (info.parent)
@@ -533,6 +414,9 @@ hammer2_flush(hammer2_chain_t *chain, int flags)
  * Upon return, the caller can test the UPDATE bit on the chain to determine
  * if the parent needs updating.
  *
+ * If non-zero is returned, the chain's parent changed during the flush and
+ * the caller must retry the operation.
+ *
  * (1) Determine if this node is a candidate for the flush, return if it is
  *     not.  fchain and vchain are always candidates for the flush.
  *
@@ -565,13 +449,16 @@ hammer2_flush(hammer2_chain_t *chain, int flags)
  * consistent during synchronization.  mirror_tid is consistent across the
  * block device regardless of the PFS.
  */
-static void
+static int
 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                   int flags)
 {
        hammer2_chain_t *parent;
        hammer2_dev_t *hmp;
        int save_error;
+       int retry;
+
+       retry = 0;
 
        /*
         * (1) Optimize downward recursion to locate nodes needing action.
@@ -582,7 +469,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                        if (info->debug == NULL)
                                info->debug = chain;
                } else {
-                       return;
+                       return 0;
                }
        }
 
@@ -598,19 +485,13 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
         * Downward search recursion
         *
         * We must be careful on cold stops.  If CHAIN_UPDATE is set and
-        * we stop cold (verses a deferral which will re-run the chain later),
-        * the update can wind up never being applied.  This situation most
-        * typically occurs on inode boundaries due to the way
+        * we stop cold, the update can wind up never being applied.  This
+        * situation most typically occurs on inode boundaries due to the way
         * hammer2_vfs_sync() breaks-up the flush.  As a safety, we
-        * flush-through such situations.
+        * flush-through such situations. XXX removed
         */
-       if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) {
-               /*
-                * Already deferred.
-                */
-               ++info->diddeferral;
-       } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
-                  (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
+       if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
+                  /* (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && */
                   (flags & HAMMER2_FLUSH_ALL) == 0 &&
                   (flags & HAMMER2_FLUSH_TOP) == 0 &&
                   chain->pmp && chain->pmp->mp) {
@@ -632,9 +513,6 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                 *
                 * NOTE: The volume root, vchain, does not set PFSBOUNDARY.
                 *
-                * NOTE: This test must be done before the depth-limit test,
-                *       else it might become the top on a flushq iteration.
-                *
                 * NOTE: We must re-set ONFLUSH in the parent to retain if
                 *       this chain (that we are skipping) requires work.
                 */
@@ -643,12 +521,21 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                                    HAMMER2_CHAIN_MODIFIED)) {
                        hammer2_chain_setflush(parent);
                }
+               goto done;
        } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
-                  (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
+                  /* (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && */
                   (flags & HAMMER2_FLUSH_INODE_STOP) &&
                   (flags & HAMMER2_FLUSH_ALL) == 0 &&
                   (flags & HAMMER2_FLUSH_TOP) == 0 &&
                   chain->pmp && chain->pmp->mp) {
+               /*
+                * When FLUSH_INODE_STOP is specified we are being asked not
+                * to include any inode changes for inodes we encounter,
+                * with the exception of the inode that the flush began with.
+                * So: INODE, INODE_STOP, and TOP==0 basically.
+                */
+               goto done;
+#if 0
                /*
                 * If FLUSH_INODE_STOP is specified and both ALL and TOP
                 * are clear, we must not flush the chain.  The chain should
@@ -664,15 +551,12 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                        if (parent)
                                hammer2_chain_setflush(parent);
                }
+#endif
        } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                /*
                 * Recursion depth reached.
                 */
-               KKASSERT((chain->flags & HAMMER2_CHAIN_DELAYED) == 0);
-               hammer2_chain_ref(chain);
-               TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
-               ++info->diddeferral;
+               panic("hammer2: flush depth limit");
        } else if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
                                   HAMMER2_CHAIN_DESTROY)) {
                /*
@@ -696,9 +580,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
 
                /*
                 * We may have to do this twice to catch any indirect
-                * block maintenance that occurs.  Other conditions which
-                * can keep setting ONFLUSH (such as deferrals) ought to
-                * be handled by the flushq code.  XXX needs more help
+                * block maintenance that occurs.
                 */
                hammer2_spin_ex(&chain->core.spin);
                RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
@@ -719,8 +601,6 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                if (info->error)
                        hammer2_chain_setflush(chain);
                info->error |= save_error;
-               if (info->diddeferral)
-                       hammer2_chain_setflush(chain);
 
                /*
                 * If we lost the parent->chain association we have to
@@ -738,15 +618,12 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
        /*
         * Now we are in the bottom-up part of the recursion.
         *
-        * Do not update chain if lower layers were deferred.  We continue
-        * to try to update the chain on lower-level errors, but the flush
-        * code may decide not to flush the volume root.
+        * We continue to try to update the chain on lower-level errors, but
+        * the flush code may decide not to flush the volume root.
         *
         * XXX should we continue to try to update the chain if an error
         *     occurred?
         */
-       if (info->diddeferral)
-               goto done;
 
        /*
         * Both parent and chain must be locked in order to flush chain,
@@ -783,12 +660,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                }
                KKASSERT(parent != NULL);
                hammer2_chain_unlock(parent);
-               if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) {
-                       hammer2_chain_ref(chain);
-                       TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
-                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
-                       ++info->diddeferral;
-               }
+               retry = 1;
                goto done;
        }
 
@@ -1063,9 +935,14 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
         * This can fail if the hammer2_chain_modify() fails.
         *
         * NOTE: UPDATE may be set on vchain or fchain in which case
-        *       parent could be NULL.  It's easiest to allow the case
-        *       and test for NULL.  parent can also wind up being NULL
-        *       due to a deletion so we need to handle the case anyway.
+        *       parent could be NULL, or on an inode that has not yet
+        *       been inserted into the radix tree.  It's easiest to allow
+        *       the case and test for NULL.  parent can also wind up being
+        *       NULL due to a deletion so we need to handle the case anyway.
+        *
+        * NOTE: UPDATE can be set when chains are renamed into or out of
+        *       an indirect block, without the chain itself being flagged
+        *       MODIFIED.
         *
         * If no parent exists we can just clear the UPDATE bit.  If the
         * chain gets reattached later on the bit will simply get set
@@ -1075,7 +952,24 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
 
        /*
-        * The chain may need its blockrefs updated in the parent.
+        * When flushing an inode outside of a FLUSH_FSSYNC we must NOT
+        * update the parent block table to point at the flushed inode.
+        * The block table should only ever be updated by the filesystem
+        * sync code.  If we do, inode<->inode dependencies (such as
+        * directory entries vs inode nlink count) can wind up not being
+        * flushed together and result in a broken topology if a crash/reboot
+        * occurs at the wrong time.
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
+           (flags & HAMMER2_FLUSH_FSSYNC) == 0 &&
+           (flags & HAMMER2_FLUSH_ALL) == 0 &&
+           chain->pmp && chain->pmp->mp) {
+               goto skipupdate;
+       }
+
+       /*
+        * The chain may need its blockrefs updated in the parent, normal
+        * path.
         */
        if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                hammer2_blockref_t *base;
@@ -1201,7 +1095,8 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
                        if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
                                hammer2_spin_ex(&parent->core.spin);
-                               hammer2_base_delete(parent, base, count, chain);
+                               hammer2_base_delete(parent, base, count, chain,
+                                                   NULL);
                                hammer2_spin_unex(&parent->core.spin);
                                /* base_delete clears both bits */
                        } else {
@@ -1230,6 +1125,7 @@ done:
                if (info->debug == chain)
                        info->debug = NULL;
        }
+       return retry;
 }
 
 /*
@@ -1310,6 +1206,28 @@ hammer2_flush_recurse(hammer2_chain_t *child, void *data)
        if (child->flags & HAMMER2_CHAIN_DESTROY)
                ++info->scan_del_count;
 #endif
+       /*
+        * Special handling of the root inode.  Because the root inode
+        * contains an index of all the inodes in the PFS in addition to
+        * its normal directory entries, any flush that is not part of a
+        * filesystem sync must only flush the directory entries, and not
+        * anything else.
+        *
+        * The child might be an indirect block, but H2 guarantees that
+        * the key-range will fully partition the inode index from the
+        * directory entries so the case just works naturally.
+        */
+       if ((parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
+           (child->flags & HAMMER2_CHAIN_DESTROY) == 0 &&
+           parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
+           (info->flags & HAMMER2_FLUSH_FSSYNC) == 0) {
+               if ((child->bref.key & HAMMER2_DIRHASH_VISIBLE) == 0) {
+                       if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
+                               hammer2_chain_setflush(parent);
+                       }
+                       goto done;
+               }
+       }
 
        /*
         * Recurse and collect deferral data.  We're in the media flush,
@@ -1357,20 +1275,16 @@ done:
  *
  * Flushes chain topology for the specified inode.
  *
- * If HAMMER2_XOP_FLUSH is set we flush all chains from the current inode
- * through but stop at sub-inodes (we flush the inode chains for sub-inodes,
- * but do not go further as deeper modifications do not belong to the current
- * flush cycle).
- *
- * If HAMMER2_XOP_FLUSH is not set we flush the current inode's chains only
- * and do not recurse through sub-inodes, including not including those
- * sub-inodes.
+ * HAMMER2_XOP_INODE_STOP      The flush recursion stops at inode boundaries.
+ *                             Inodes belonging to the same flush are flushed
+ *                             separately.
  *
- * Remember that HAMMER2 is currently using a flat inode model, so directory
- * hierarchies do not translate to inode hierarchies.  PFS ROOTs, however,
- * do.
+ * HAMMER2_XOP_PARENTONFLUSH   After flushing if the starting chain indicates
+ *                             a parent update is needed, we setflush the
+ *                             parent to propagate the flush request across
+ *                             the inode.
  *
- * chain->parent can be NULL, usually due to destroy races.
+ * chain->parent can be NULL, usually due to destroy races or detached inodes.
  *
  * Primarily called from vfs_sync().
  */
@@ -1379,7 +1293,6 @@ hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
 {
        hammer2_xop_flush_t *xop = &arg->xop_flush;
        hammer2_chain_t *chain;
-       hammer2_chain_t *parent;
        hammer2_dev_t *hmp;
        int flush_error = 0;
        int fsync_error = 0;
@@ -1391,6 +1304,8 @@ hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
        xflags = HAMMER2_FLUSH_TOP;
        if (xop->head.flags & HAMMER2_XOP_INODE_STOP)
                xflags |= HAMMER2_FLUSH_INODE_STOP;
+       if (xop->head.flags & HAMMER2_XOP_FSSYNC)
+               xflags |= HAMMER2_FLUSH_FSSYNC;
 
        /*
         * Flush core chains
@@ -1399,12 +1314,30 @@ hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
                                    HAMMER2_RESOLVE_ALWAYS);
        if (chain) {
                hmp = chain->hmp;
-               if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) ||
-                   TAILQ_FIRST(&hmp->flushq) != NULL) {
+               if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
+                       /*
+                        * Due to flush partitioning the chain topology
+                        * above the inode's chain may no longer be flagged.
+                        * When asked to flush an inode, re-mark the topology
+                        * leading to that inode.
+                        */
+                       if (chain->parent)
+                               hammer2_chain_setflush(chain->parent);
                        hammer2_flush(chain, xflags);
-                       parent = chain->parent;
-                       if (parent)
-                               hammer2_chain_setflush(parent);
+
+#if 0
+                       /*
+                        * Propagate upwards but only cross an inode boundary
+                        * for inodes associated with the current filesystem
+                        * sync.
+                        */
+                       if ((xop->head.flags & HAMMER2_XOP_PARENTONFLUSH) ||
+                           chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
+                               parent = chain->parent;
+                               if (parent)
+                                       hammer2_chain_setflush(parent);
+                       }
+#endif
                }
                if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
                        ispfsroot = 1;
@@ -1560,7 +1493,8 @@ hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
        if (fsync_error)
                total_error = hammer2_errno_to_error(fsync_error);
 
-       hammer2_trans_done(hmp->spmp, 0);  /* spmp trans */
+       /* spmp trans */
+       hammer2_trans_done(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
 skip:
        hammer2_xop_feed(&xop->head, NULL, clindex, total_error);
 }
index a7fd903..30e87b4 100644 (file)
@@ -369,8 +369,8 @@ hammer2_freemap_try_alloc(hammer2_chain_t **parentp,
                kprintf("freemap create L1 @ %016jx bpref %016jx\n",
                        key, iter->bpref);
 #endif
-               error = hammer2_chain_create(parentp, &chain,
-                                    hmp->spmp, HAMMER2_METH_DEFAULT,
+               error = hammer2_chain_create(parentp, &chain, NULL, hmp->spmp,
+                                    HAMMER2_METH_DEFAULT,
                                     key, HAMMER2_FREEMAP_LEVEL1_RADIX,
                                     HAMMER2_BREF_TYPE_FREEMAP_LEAF,
                                     HAMMER2_FREEMAP_LEVELN_PSIZE,
@@ -1061,8 +1061,8 @@ hammer2_freemap_adjust(hammer2_dev_t *hmp, hammer2_blockref_t *bref,
         * bref.check.freemap structure.
         */
        if (chain == NULL && how == HAMMER2_FREEMAP_DORECOVER) {
-               error = hammer2_chain_create(&parent, &chain,
-                                    hmp->spmp, HAMMER2_METH_DEFAULT,
+               error = hammer2_chain_create(&parent, &chain, NULL, hmp->spmp,
+                                    HAMMER2_METH_DEFAULT,
                                     key, HAMMER2_FREEMAP_LEVEL1_RADIX,
                                     HAMMER2_BREF_TYPE_FREEMAP_LEAF,
                                     HAMMER2_FREEMAP_LEVELN_PSIZE,
index c4818bb..12044c7 100644 (file)
@@ -86,13 +86,15 @@ hammer2_inode_delayed_sideq(hammer2_inode_t *ip)
 }
 
 /*
- * HAMMER2 inode locks
+ * Lock an inode, with SYNCQ semantics.
  *
  * HAMMER2 offers shared and exclusive locks on inodes.  Pass a mask of
  * flags for options:
  *
  *     - pass HAMMER2_RESOLVE_SHARED if a shared lock is desired.  The
  *       inode locking function will automatically set the RDONLY flag.
+ *       Shared locks are not subject to SYNCQ semantics; exclusive locks
+ *       are.
  *
  *     - pass HAMMER2_RESOLVE_ALWAYS if you need the inode's meta-data.
  *       Most front-end inode locks do.
@@ -146,20 +148,25 @@ hammer2_inode_lock(hammer2_inode_t *ip, int how)
        /*
         * Inode structure mutex - Exclusive lock
         *
-        * The exclusive lock must wait for inodes on SYNCQ to flush
-        * first, to ensure that meta-data dependencies such as the
-        * nlink count and related directory entries are not split
+        * An exclusive lock (if not recursive) must wait for inodes on
+        * SYNCQ to flush first, to ensure that meta-data dependencies such
+        * as the nlink count and related directory entries are not split
         * across flushes.
+        *
+        * If the vnode is locked by the current thread it must be unlocked
+        * across the tsleep() to avoid a deadlock.
         */
        hammer2_mtx_ex(&ip->lock);
+       if (hammer2_mtx_refs(&ip->lock) > 1)
+               return;
        while ((ip->flags & HAMMER2_INODE_SYNCQ) && pmp) {
                hammer2_spin_ex(&pmp->list_spin);
                if (ip->flags & HAMMER2_INODE_SYNCQ) {
+                       tsleep_interlock(&ip->flags, 0);
                        atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ_WAKEUP);
                        TAILQ_REMOVE(&pmp->syncq, ip, entry);
                        TAILQ_INSERT_HEAD(&pmp->syncq, ip, entry);
                        hammer2_spin_unex(&pmp->list_spin);
-                       tsleep_interlock(&ip->flags, 0);
                        hammer2_mtx_unlock(&ip->lock);
                        tsleep(&ip->flags, PINTERLOCKED, "h2sync", 0);
                        hammer2_mtx_ex(&ip->lock);
@@ -170,6 +177,76 @@ hammer2_inode_lock(hammer2_inode_t *ip, int how)
        }
 }
 
+/*
+ * Exclusively lock up to four inodes, in order, with SYNCQ semantics.
+ * ip1 and ip2 must not be NULL.  ip3 and ip4 may be NULL, but if ip3 is
+ * NULL then ip4 must also be NULL.
+ */
+void
+hammer2_inode_lock4(hammer2_inode_t *ip1, hammer2_inode_t *ip2,
+                   hammer2_inode_t *ip3, hammer2_inode_t *ip4)
+{
+       hammer2_inode_t *ips[4];
+       hammer2_inode_t *iptmp;
+       hammer2_pfs_t *pmp;
+       size_t count;
+       size_t i;
+       size_t j;
+
+       pmp = ip1->pmp;                 /* may be NULL */
+       KKASSERT(pmp == ip2->pmp);
+
+       ips[0] = ip1;
+       ips[1] = ip2;
+       if (ip3 == NULL) {
+               count = 2;
+       } else if (ip4 == NULL) {
+               count = 3;
+               ips[2] = ip3;
+               KKASSERT(pmp == ip3->pmp);
+       } else {
+               count = 4;
+               ips[2] = ip3;
+               ips[3] = ip4;
+               KKASSERT(pmp == ip3->pmp);
+               KKASSERT(pmp == ip4->pmp);
+       }
+
+       for (i = 0; i < count; ++i)
+               hammer2_inode_ref(ips[i]);
+
+restart:
+       for (i = 0; i < count; ++i) {
+               iptmp = ips[i];
+               hammer2_mtx_ex(&iptmp->lock);
+               if (hammer2_mtx_refs(&iptmp->lock) > 1)
+                       continue;
+               if ((iptmp->flags & HAMMER2_INODE_SYNCQ) == 0 || pmp == NULL)
+                       continue;
+               tsleep_interlock(&iptmp->flags, 0);
+               hammer2_spin_ex(&pmp->list_spin);
+               if ((iptmp->flags & HAMMER2_INODE_SYNCQ) == 0) {
+                       hammer2_spin_unex(&pmp->list_spin);
+                       continue;
+               }
+               atomic_set_int(&iptmp->flags, HAMMER2_INODE_SYNCQ_WAKEUP);
+               TAILQ_REMOVE(&pmp->syncq, iptmp, entry);
+               TAILQ_INSERT_HEAD(&pmp->syncq, iptmp, entry);
+               hammer2_spin_unex(&pmp->list_spin);
+
+               /*
+                * Unlock everything (including the current index) and wait
+                * for our wakeup.
+                */
+               for (j = 0; j <= i; ++j)
+                       hammer2_mtx_unlock(&ips[j]->lock);
+               tsleep(&iptmp->flags, PINTERLOCKED, "h2sync", 0);
+               /*tsleep(&iptmp->flags, 0, "h2sync2", 1);*/
+
+               goto restart;
+       }
+}
+
 /*
  * Release an inode lock.  If another thread is blocked on SYNCQ_WAKEUP
  * we wake them up.
@@ -862,6 +939,13 @@ done2:
        return (nip);
 }
 
+/*
+ * Create a new, normal inode.  This function will create the inode and
+ * the media chains, but will not insert the chains onto the media topology
+ * (doing so would require a flush transaction and cause long stalls).
+ *
+ * Caller must be in a normal transaction.
+ */
 hammer2_inode_t *
 hammer2_inode_create_normal(hammer2_inode_t *pip,
                            struct vattr *vap, struct ucred *cred,
@@ -882,7 +966,6 @@ hammer2_inode_create_normal(hammer2_inode_t *pip,
 
        dip = pip->pmp->iroot;
        KKASSERT(dip != NULL);
-       nip = NULL;
 
        *errorp = 0;
 
@@ -896,82 +979,101 @@ hammer2_inode_create_normal(hammer2_inode_t *pip,
        pip_inum = (pip == pip->pmp->iroot) ? 1 : pip->meta.inum;
 
        /*
-        * Create the inode using (inum) as the key.
+        * Create the in-memory hammer2_inode structure for the specified
+        * inode.
         */
-       xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
-       xop->lhc = inum;
-       xop->flags = 0;
-       bzero(&xop->meta, sizeof(xop->meta));
-       KKASSERT(vap);
+       nip = hammer2_inode_get(dip->pmp, NULL, inum, -1);
+       nip->comp_heuristic = 0;
+       KKASSERT((nip->flags & HAMMER2_INODE_CREATING) == 0 &&
+                nip->cluster.nchains == 0);
+       atomic_set_int(&nip->flags, HAMMER2_INODE_CREATING);
 
        /*
         * Setup the inode meta-data
         */
-       xop->meta.type = hammer2_get_obj_type(vap->va_type);
+       nip->meta.type = hammer2_get_obj_type(vap->va_type);
 
-       switch (xop->meta.type) {
+       switch (nip->meta.type) {
        case HAMMER2_OBJTYPE_CDEV:
        case HAMMER2_OBJTYPE_BDEV:
-               xop->meta.rmajor = vap->va_rmajor;
-               xop->meta.rminor = vap->va_rminor;
+               nip->meta.rmajor = vap->va_rmajor;
+               nip->meta.rminor = vap->va_rminor;
                break;
        default:
                break;
        }
-       type = xop->meta.type;
+       type = nip->meta.type;
 
-       xop->meta.inum = inum;
-       xop->meta.iparent = pip_inum;
+       KKASSERT(nip->meta.inum == inum);
+       nip->meta.iparent = pip_inum;
        
        /* Inherit parent's inode compression mode. */
-       xop->meta.comp_algo = pip_comp_algo;
-       xop->meta.check_algo = pip_check_algo;
-       xop->meta.version = HAMMER2_INODE_VERSION_ONE;
-       hammer2_update_time(&xop->meta.ctime);
-       xop->meta.mtime = xop->meta.ctime;
-       xop->meta.mode = vap->va_mode;
-       xop->meta.nlinks = 1;
+       nip->meta.comp_algo = pip_comp_algo;
+       nip->meta.check_algo = pip_check_algo;
+       nip->meta.version = HAMMER2_INODE_VERSION_ONE;
+       hammer2_update_time(&nip->meta.ctime);
+       nip->meta.mtime = nip->meta.ctime;
+       nip->meta.mode = vap->va_mode;
+       nip->meta.nlinks = 1;
 
        xuid = hammer2_to_unix_xid(&pip_uid);
        xuid = vop_helper_create_uid(dip->pmp->mp, pip_mode,
                                     xuid, cred,
                                     &vap->va_mode);
        if (vap->va_vaflags & VA_UID_UUID_VALID)
-               xop->meta.uid = vap->va_uid_uuid;
+               nip->meta.uid = vap->va_uid_uuid;
        else if (vap->va_uid != (uid_t)VNOVAL)
-               hammer2_guid_to_uuid(&xop->meta.uid, vap->va_uid);
+               hammer2_guid_to_uuid(&nip->meta.uid, vap->va_uid);
        else
-               hammer2_guid_to_uuid(&xop->meta.uid, xuid);
+               hammer2_guid_to_uuid(&nip->meta.uid, xuid);
 
        if (vap->va_vaflags & VA_GID_UUID_VALID)
-               xop->meta.gid = vap->va_gid_uuid;
+               nip->meta.gid = vap->va_gid_uuid;
        else if (vap->va_gid != (gid_t)VNOVAL)
-               hammer2_guid_to_uuid(&xop->meta.gid, vap->va_gid);
+               hammer2_guid_to_uuid(&nip->meta.gid, vap->va_gid);
        else
-               xop->meta.gid = pip_gid;
+               nip->meta.gid = pip_gid;
 
        /*
         * Regular files and softlinks allow a small amount of data to be
         * directly embedded in the inode.  This flag will be cleared if
         * the size is extended past the embedded limit.
         */
-       if (xop->meta.type == HAMMER2_OBJTYPE_REGFILE ||
-           xop->meta.type == HAMMER2_OBJTYPE_SOFTLINK) {
-               xop->meta.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
+       if (nip->meta.type == HAMMER2_OBJTYPE_REGFILE ||
+           nip->meta.type == HAMMER2_OBJTYPE_SOFTLINK) {
+               nip->meta.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
        }
 
+       /*
+        * Create the inode using (inum) as the key.  Pass pip for
+        * method inheritance.
+        */
+       xop = hammer2_xop_alloc(pip, HAMMER2_XOP_MODIFYING);
+       xop->lhc = inum;
+       xop->flags = 0;
+       xop->meta = nip->meta;
+       KKASSERT(vap);
+
        xop->meta.name_len = hammer2_xop_setname_inum(&xop->head, inum);
        xop->meta.name_key = inum;
+       nip->meta.name_len = xop->meta.name_len;
+       nip->meta.name_key = xop->meta.name_key;
+       hammer2_inode_modify(nip);
 
        /*
-        * Create the inode media chains
+        * Create the inode media chains but leave them detached.  We are
+        * not in a flush transaction so we can't mess with media topology
+        * above normal inodes (i.e. the index of the inodes themselves).
+        *
+        * We've already set the INODE_CREATING flag.  The inode's media
+        * chains will be inserted onto the media topology on the next
+        * filesystem sync.
         */
-       hammer2_xop_start(&xop->head, &hammer2_inode_create_desc);
+       hammer2_xop_start(&xop->head, &hammer2_inode_create_det_desc);
 
        error = hammer2_xop_collect(&xop->head, 0);
 #if INODE_DEBUG
-       kprintf("CREATE INODE %*.*s\n",
-               (int)name_len, (int)name_len, name);
+       kprintf("create inode type %d error %d\n", nip->meta.type, error);
 #endif
 
        if (error) {
@@ -980,18 +1082,10 @@ hammer2_inode_create_normal(hammer2_inode_t *pip,
        }
 
        /*
-        * Set up the new inode if not a hardlink pointer.
-        *
-        * NOTE: *_get() integrates chain's lock into the inode lock.
-        *
-        * NOTE: Only one new inode can currently be created per
-        *       transaction.  If the need arises we can adjust
-        *       hammer2_trans_init() to allow more.
-        *
-        * NOTE: nipdata will have chain's blockset data.
+        * Associate the media chains created by the backend with the
+        * frontend inode.
         */
-       nip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
-       nip->comp_heuristic = 0;
+       hammer2_inode_repoint(nip, NULL, &xop->head.cluster);
 done:
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        hammer2_inode_unlock(dip);
@@ -1387,6 +1481,37 @@ hammer2_inode_chain_sync(hammer2_inode_t *ip)
        return error;
 }
 
+/*
+ * When an inode is flagged INODE_CREATING its chains have not actually
+ * been inserted into the on-media tree yet.
+ */
+int
+hammer2_inode_chain_ins(hammer2_inode_t *ip)
+{
+       int error;
+
+       error = 0;
+       if (ip->flags & HAMMER2_INODE_CREATING) {
+               hammer2_xop_create_t *xop;
+
+               atomic_clear_int(&ip->flags, HAMMER2_INODE_CREATING);
+               xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
+               xop->lhc = ip->meta.inum;
+               xop->flags = 0;
+               hammer2_xop_start(&xop->head, &hammer2_inode_create_ins_desc);
+               error = hammer2_xop_collect(&xop->head, 0);
+               hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
+               if (error == HAMMER2_ERROR_ENOENT)
+                       error = 0;
+               if (error) {
+                       kprintf("hammer2: backend unable to "
+                               "insert inode %p %ld\n", ip, ip->meta.inum);
+                       /* XXX return error somehow? */
+               }
+       }
+       return error;
+}
+
 /*
  * Flushes the inode's chain and its sub-topology to media.  Interlocks
  * HAMMER2_INODE_DIRTYDATA by clearing it prior to the flush.  Any strategy
@@ -1396,14 +1521,13 @@ hammer2_inode_chain_sync(hammer2_inode_t *ip)
  * inode must be locked.
  */
 int
-hammer2_inode_chain_flush(hammer2_inode_t *ip)
+hammer2_inode_chain_flush(hammer2_inode_t *ip, int flags)
 {
        hammer2_xop_fsync_t *xop;
        int error;
 
        atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYDATA);
-       xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING |
-                                   HAMMER2_XOP_INODE_STOP);
+       xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | flags);
        hammer2_xop_start(&xop->head, &hammer2_inode_flush_desc);
        error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_WAITALL);
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
index 8e78a9a..600c725 100644 (file)
@@ -658,7 +658,8 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
                hammer2_inode_ref(nip);
                hammer2_inode_unlock(nip);
                hammer2_inode_chain_sync(nip);
-               hammer2_inode_chain_flush(nip);
+               hammer2_inode_chain_flush(nip, HAMMER2_XOP_INODE_STOP |
+                                              HAMMER2_XOP_FSSYNC);
                KKASSERT(nip->refs == 1);
                hammer2_inode_drop(nip);
 
@@ -678,7 +679,8 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
                hammer2_chain_drop(nchain);
 
        }
-       hammer2_trans_done(hmp->spmp, 1);
+       hammer2_trans_done(hmp->spmp, HAMMER2_TRANS_ISFLUSH |
+                                     HAMMER2_TRANS_SIDEQ);
 
        return (error);
 }
@@ -775,7 +777,7 @@ hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data)
 #endif
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
 
-       hammer2_trans_done(spmp, 1);
+       hammer2_trans_done(spmp, HAMMER2_TRANS_SIDEQ);
 
        return (hammer2_error_to_errno(error));
 }
@@ -898,7 +900,8 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
                hammer2_inode_ref(nip);
                hammer2_inode_unlock(nip);
                hammer2_inode_chain_sync(nip);
-               hammer2_inode_chain_flush(nip);
+               hammer2_inode_chain_flush(nip, HAMMER2_XOP_INODE_STOP |
+                                              HAMMER2_XOP_FSSYNC);
                KKASSERT(nip->refs == 1);
                hammer2_inode_drop(nip);
 
@@ -919,7 +922,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
        hammer2_chain_drop(chain);
 
        hammer2_inode_unlock(ip);
-       hammer2_trans_done(pmp, 1);
+       hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH | HAMMER2_TRANS_SIDEQ);
 
        lockmgr(&hmp->bulklk, LK_RELEASE);
 
@@ -1007,7 +1010,7 @@ hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data)
                ip->meta.ncopies = ino->ip_data.meta.ncopies;
        }
        hammer2_inode_unlock(ip);
-       hammer2_trans_done(ip->pmp, 1);
+       hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);
 
        return (hammer2_error_to_errno(error));
 }
@@ -1112,7 +1115,8 @@ hammer2_ioctl_bulkfree_scan(hammer2_inode_t *ip, void *data)
                hammer2_chain_bulkdrop(vchain);
        } else {
                hammer2_chain_drop(vchain);
-               hammer2_trans_done(pmp, 1);
+               hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH |
+                                       HAMMER2_TRANS_SIDEQ);
        }
        error = hammer2_error_to_errno(error);
 
@@ -1170,7 +1174,7 @@ hammer2_ioctl_destroy(hammer2_inode_t *ip, void *data)
                error = hammer2_error_to_errno(error);
                hammer2_inode_unlock(ip);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
-               hammer2_trans_done(pmp, 1);
+               hammer2_trans_done(pmp, HAMMER2_TRANS_SIDEQ);
                }
                break;
        case HAMMER2_DELETE_INUM:
@@ -1193,7 +1197,7 @@ hammer2_ioctl_destroy(hammer2_inode_t *ip, void *data)
                error = hammer2_xop_collect(&xop->head, 0);
                error = hammer2_error_to_errno(error);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
-               hammer2_trans_done(pmp, 1);
+               hammer2_trans_done(pmp, HAMMER2_TRANS_SIDEQ);
                }
                break;
        default:
index 63af632..11f67d0 100644 (file)
@@ -664,7 +664,7 @@ hammer2_xop_strategy_write(hammer2_xop_t *arg, void *scratch, int clindex)
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        hammer2_trans_assert_strategy(ip->pmp);
        hammer2_lwinprog_drop(ip->pmp);
-       hammer2_trans_done(ip->pmp, 0);
+       hammer2_trans_done(ip->pmp, HAMMER2_TRANS_BUFCACHE);
 }
 
 /*
@@ -741,8 +741,7 @@ hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp,
                 */
                dedup_off = hammer2_dedup_lookup((*parentp)->hmp, datap,
                                                 pblksize);
-               *errorp |= hammer2_chain_create(parentp, &chain,
-                                               ip->pmp,
+               *errorp |= hammer2_chain_create(parentp, &chain, NULL, ip->pmp,
                                       HAMMER2_ENC_CHECK(ip->meta.check_algo) |
                                       HAMMER2_ENC_COMP(HAMMER2_COMP_NONE),
                                                lbase, HAMMER2_PBUFRADIX,
index e75d33d..090090e 100644 (file)
@@ -758,8 +758,8 @@ hammer2_sync_insert(hammer2_thread_t *thr,
        KKASSERT(chain == NULL);
 
        chain = NULL;
-       error = hammer2_chain_create(parentp, &chain,
-                                    thr->pmp, focus->bref.methods,
+       error = hammer2_chain_create(parentp, &chain, NULL, thr->pmp,
+                                    focus->bref.methods,
                                     focus->bref.key, focus->bref.keybits,
                                     focus->bref.type, focus->bytes,
                                     mtid, 0, 0);
index 76950e2..0284826 100644 (file)
@@ -1159,7 +1159,6 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                RB_INIT(&hmp->iotree);
                spin_init(&hmp->io_spin, "hm2mount_io");
                spin_init(&hmp->list_spin, "hm2mount_list");
-               TAILQ_INIT(&hmp->flushq);
 
                lockinit(&hmp->vollk, "h2vol", 0, 0);
                lockinit(&hmp->bulklk, "h2bulk", 0, 0);
@@ -2397,6 +2396,15 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
        return error;
 }
 
+/*
+ * Because frontend operations lock vnodes before we get a chance to
+ * lock the related inode, we can't just acquire a vnode lock without
+ * risking a deadlock.  The frontend may be holding a vnode lock while
+ * also blocked on our SYNCQ flag while trying to get the inode lock.
+ *
+ * To deal with this situation we can check the vnode lock situation
+ * after locking the inode and perform a work-around.
+ */
 int
 hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
 {
@@ -2405,36 +2413,35 @@ hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
        /*struct hammer2_sync_info info;*/
        hammer2_inode_t *iroot;
        hammer2_inode_t *ip;
+       hammer2_inode_t *ipdrop;
        struct vnode *vp;
-       int flags;
+       uint32_t pass2;
        int error;
+       int dorestart;
 
        mp = pmp->mp;
        iroot = pmp->iroot;
        KKASSERT(iroot);
        KKASSERT(iroot->pmp == pmp);
 
-       /*
-        * We can't acquire locks on existing vnodes while in a transaction
-        * without risking a deadlock.  This assumes that vfsync() can be
-        * called without the vnode locked (which it can in DragonFly).
-        * Otherwise we'd have to implement a multi-pass or flag the lock
-        * failures and retry.
-        *
-        * The reclamation code interlocks with the sync list's token
-        * (by removing the vnode from the scan list) before unlocking
-        * the inode, giving us time to ref the inode.
-        */
-       /*flags = VMSC_GETVP;*/
-       flags = 0;
-       if (waitfor & MNT_LAZY)
-               flags |= VMSC_ONEPASS;
-
        /*
         * Move all inodes on sideq to syncq.  This will clear sideq.
         * This should represent all flushable inodes.  These inodes
-        * will already have refs due to being on syncq or sideq.
+        * will already have refs due to being on syncq or sideq.  We
+        * must do this all at once to ensure that inode dependencies
+        * are part of the same flush.
+        *
+        * We should be able to do this asynchronously from frontend
+        * operations because we will be locking the inodes later on
+        * to actually flush them, and that will partition any frontend
+        * op using the same inode.  Either it has already locked the
+        * inode and we will block, or it has not yet locked the inode
+        * and it will block until we are finished flushing that inode.
         */
+       hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
+restart:
+       hammer2_trans_setflags(pmp, HAMMER2_TRANS_COPYQ);
+       dorestart = 0;
        hammer2_spin_ex(&pmp->list_spin);
        TAILQ_FOREACH(ip, &pmp->sideq, entry) {
                KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ);
@@ -2444,64 +2451,104 @@ hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
        TAILQ_CONCAT(&pmp->syncq, &pmp->sideq, entry);
        pmp->sideq_count = 0;
        hammer2_spin_unex(&pmp->list_spin);
+       hammer2_trans_clearflags(pmp, HAMMER2_TRANS_COPYQ |
+                                     HAMMER2_TRANS_WAITING);
 
        /*
-        * Flush transactions only interlock with other flush transactions.
-        * Any concurrent frontend operations will block when obtaining an
-        * exclusive inode lock on any inode on SYNCQ, and we will block here
-        * when we ourselves obtain the exclusive lock.
-        *
         * Now run through all inodes on syncq.
+        *
+        * Flush transactions only interlock with other flush transactions.
+        * Any conflicting frontend operations will block on the inode, but
+        * may hold a vnode lock while doing so.
         */
-       hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
-       ip = NULL;
-       for (;;) {
-               if (ip == NULL) {
-                       hammer2_spin_ex(&pmp->list_spin);
-                       ip = TAILQ_FIRST(&pmp->syncq);
-                       if (ip == NULL) {
-                               hammer2_spin_unex(&pmp->list_spin);
-                               break;
-                       }
-                       TAILQ_REMOVE(&pmp->syncq, ip, entry);
-                       atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ);
-                       hammer2_spin_unex(&pmp->list_spin);
-                       /* leave's ip with a ref from being on SYNCQ */
+       ipdrop = NULL;
+
+       hammer2_spin_ex(&pmp->list_spin);
+       while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) {
+               /*
+                * Remove the inode from the SYNCQ, transfer the syncq ref
+                * to us.  We must clear SYNCQ to allow any potential
+                * front-end deadlock to proceed.
+                */
+               pass2 = ip->flags;
+               cpu_ccfence();
+               if (atomic_cmpset_int(&ip->flags,
+                             pass2,
+                             pass2 & ~(HAMMER2_INODE_SYNCQ |
+                                       HAMMER2_INODE_SYNCQ_WAKEUP |
+                                       HAMMER2_INODE_SYNCQ_PASS2)) == 0) {
+                       continue;
+               }
+               if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP)
+                       wakeup(&ip->flags);
+               TAILQ_REMOVE(&pmp->syncq, ip, entry);
+               hammer2_spin_unex(&pmp->list_spin);
+               if (ipdrop) {
+                       hammer2_inode_drop(ipdrop);
+                       ipdrop = NULL;
                }
+               hammer2_mtx_ex(&ip->lock);
 
                /*
-                * We hold a ref on ip, SYNCQ flag has been cleared, and
-                * since we own the flush transaction it cannot get set
-                * again (though the ip can be put on SIDEQ again).
+                * We need the vp in order to vfsync() dirty buffers, so if
+                * one isn't attached we can skip it.
                 *
-                * Acquire the vnode and inode exclusively.  Be careful
-                * of order.
+                * Ordering the inode lock and then the vnode lock has the
+                * potential to deadlock.  If we had left SYNCQ set that could
+                * also deadlock us against the frontend even if we don't hold
+                * any locks, but the latter is not a problem now since we
+                * cleared it.  igetv will temporarily release the inode lock
+                * in a safe manner to work-around the deadlock.
+                *
+                * Unfortunately it is still possible to deadlock when the
+                * frontend obtains multiple inode locks, because all the
+                * related vnodes are already locked (nor can the vnode locks
+                * be released and reacquired without messing up RECLAIM and
+                * INACTIVE sequencing).
+                *
+                * The solution for now is to move the inode back onto SIDEQ
+                * and set dorestart, which will restart the flush after we
+                * exhaust the current SYNCQ.  Note that additional
+                * dependencies may build up, so we definitely need to move
+                * the whole SIDEQ back to SYNCQ when we restart.
                 */
-               if ((vp = ip->vp) != NULL) {
-                       vhold(vp);
-                       if (vget(vp, LK_EXCLUSIVE)) {
-                                vdrop(vp);
-                               hammer2_inode_drop(ip);
+               vp = ip->vp;
+               if (vp) {
+                       if (vget(vp, LK_EXCLUSIVE|LK_NOWAIT)) {
+                               /*
+                                * Failed, move to SIDEQ
+                                */
+                               vp = NULL;
+                               dorestart = 1;
+                               hammer2_spin_ex(&pmp->list_spin);
+                               if ((ip->flags & (HAMMER2_INODE_SYNCQ |
+                                                 HAMMER2_INODE_SIDEQ)) == 0) {
+                                       atomic_set_int(&ip->flags,
+                                                  HAMMER2_INODE_SIDEQ |
+                                                  HAMMER2_INODE_SYNCQ_PASS2);
+                                       TAILQ_INSERT_TAIL(&pmp->sideq, ip,
+                                                         entry);
+                                       hammer2_spin_unex(&pmp->list_spin);
+                                       hammer2_mtx_unlock(&ip->lock);
+                               } else {
+                                       hammer2_spin_unex(&pmp->list_spin);
+                                       hammer2_mtx_unlock(&ip->lock);
+                                       hammer2_inode_drop(ip);
+                               }
+                               if (pass2 & HAMMER2_INODE_SYNCQ_PASS2) {
+                                       tsleep(&dorestart, 0, "h2syndel", 2);
+                               }
+                               hammer2_spin_ex(&pmp->list_spin);
                                continue;
                        }
-                       vdrop(vp);
-                       hammer2_mtx_ex(&ip->lock);
-                       if (ip->vp != vp) {
-                               hammer2_mtx_unlock(&ip->lock);  /* unlock */
-                               vput(vp);
-                               continue;                       /* retry w/ip */
-                       }
                } else {
-                       hammer2_mtx_ex(&ip->lock);
-                       if (ip->vp != NULL) {
-                               hammer2_mtx_unlock(&ip->lock);  /* unlock */
-                               continue;                       /* retry w/ip */
-                       }
+                       vp = NULL;
                }
 
                /*
-                * Ok, we hold the inode and vnode exclusively locked,
-                * inside a flush transaction, and can now flush them.
+                * Ok we have the inode exclusively locked and if vp is
+                * not NULL that will also be exclusively locked.  Do the
+                * meat of the flush.
                 *
                 * vp token needed for v_rbdirty_tree check / vclrisdirty
                 * sequencing.  Though we hold the vnode exclusively so
@@ -2510,11 +2557,21 @@ hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
                if (vp) {
                        vfsync(vp, MNT_WAIT, 1, NULL, NULL);
                        bio_track_wait(&vp->v_track_write, 0, 0); /* XXX */
-                       lwkt_gettoken(&vp->v_token);
+               }
+
+               /*
+                * If the inode has not yet been inserted into the tree
+                * we must do so.  Then sync and flush it.  The flush should
+                * update the parent.
+                */
+               if (ip->flags & HAMMER2_INODE_CREATING) {
+                       hammer2_inode_chain_ins(ip);
                }
                hammer2_inode_chain_sync(ip);
-               hammer2_inode_chain_flush(ip);
+               hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
+                                             HAMMER2_XOP_FSSYNC);
                if (vp) {
+                       lwkt_gettoken(&vp->v_token);
                        if ((ip->flags & (HAMMER2_INODE_MODIFIED |
                                          HAMMER2_INODE_RESIZED |
                                          HAMMER2_INODE_DIRTYDATA)) == 0 &&
@@ -2526,8 +2583,40 @@ hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
                        vput(vp);
                }
                hammer2_inode_unlock(ip);       /* unlock+drop */
-               ip = NULL;                      /* next ip */
+               /* ip pointer invalid */
+
+               /*
+                * If the inode got dirtied after we dropped our locks,
+                * it will have already been moved back to the SIDEQ.
+                */
+               hammer2_spin_ex(&pmp->list_spin);
+       }
+       hammer2_spin_unex(&pmp->list_spin);
+       if (ipdrop) {
+               hammer2_inode_drop(ipdrop);
+               ipdrop = NULL;
+       }
+       if (dorestart)
+               goto restart;
+
+       /*
+        * We have to flush iroot last, even if it does not appear to be
+        * dirty, because all the inodes in the PFS are indexed under the
+        * iroot.  The normal flushing of iroot above would only occur if
+        * directory entries under the root were changed.
+        */
+       if ((ip = pmp->iroot) != NULL) {
+               hammer2_inode_ref(ip);
+               hammer2_mtx_ex(&ip->lock);
+               hammer2_inode_chain_sync(ip);
+               hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
+                                             HAMMER2_XOP_FSSYNC);
+               hammer2_inode_unlock(ip);       /* unlock+drop */
        }
+
+       /*
+        * device bioq sync
+        */
        hammer2_bioq_sync(pmp);
 
 #if 0
@@ -2582,88 +2671,11 @@ hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
        } else {
                error = 0;
        }
-       hammer2_trans_done(pmp, 0);
+       hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH);
 
        return (error);
 }
 
-#if 0
-/*
- * Sync passes.
- *
- * Note that we ignore the tranasction mtid we got above.  Instead,
- * each vfsync below will ultimately get its own via TRANS_BUFCACHE
- * transactions.
- *
- * WARNING! The frontend might be waiting on chnmem (limit_dirty_chains)
- * while holding a vnode locked.  When this situation occurs we cannot
- * safely test whether it is ok to clear the dirty bit on the vnode.
- * However, we can still flush the inode's topology.
- */
-static int
-hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
-{
-       struct hammer2_sync_info *info = data;
-       hammer2_inode_t *ip;
-       int error;
-
-       /*
-        * Degenerate cases.  Note that ip == NULL typically means the
-        * syncer vnode itself and we don't want to vclrisdirty() in that
-        * situation.
-        */
-       ip = VTOI(vp);
-       if (ip == NULL) {
-               return(0);
-       }
-       if (vp->v_type == VNON || vp->v_type == VBAD) {
-               vclrisdirty(vp);
-               return(0);
-       }
-
-       /*
-        * Synchronize the buffer cche and inode meta-data to the backing
-        * chain topology.
-        *
-        * vfsync is not necessarily synchronous, so it is best NOT to try
-        * to flush the backing topology to media at this point.
-        */
-       hammer2_inode_ref(ip);
-       if ((ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED)) ||
-           !RB_EMPTY(&vp->v_rbdirty_tree)) {
-               if (info->pass == 1)
-                       vfsync(vp, info->waitfor, 1, NULL, NULL);
-               else
-                       bio_track_wait(&vp->v_track_write, 0, 0);
-       }
-       if (info->pass == 2 && (vp->v_flag & VISDIRTY)) {
-               /*
-                * v_token is needed to interlock v_rbdirty_tree.
-                */
-               lwkt_gettoken(&vp->v_token);
-               hammer2_inode_lock(ip, 0);
-               hammer2_inode_chain_sync(ip);
-               hammer2_inode_chain_flush(ip);
-               if ((ip->flags & (HAMMER2_INODE_MODIFIED |
-                                 HAMMER2_INODE_RESIZED |
-                                 HAMMER2_INODE_DIRTYDATA)) == 0 &&
-                   RB_EMPTY(&vp->v_rbdirty_tree) &&
-                   !bio_track_active(&vp->v_track_write)) {
-                       vclrisdirty(vp);
-               }
-               hammer2_inode_unlock(ip);
-               lwkt_reltoken(&vp->v_token);
-       }
-       hammer2_inode_drop(ip);
-#if 1
-       error = 0;
-       if (error)
-               info->error = error;
-#endif
-       return(0);
-}
-#endif
-
 static
 int
 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
@@ -2894,6 +2906,9 @@ hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe)
  * We do not want sysads to feel that they have to torpedo kern.maxvnodes
  * to solve this problem, so we implement vfs.hammer2.limit_dirty_inodes
  * (per-mount-basis) and default it to something reasonable.
+ *
+ * XXX we cannot safely block here because we might be holding locks that
+ * the syncer needs.
  */
 static void
 hammer2_pfs_moderate(hammer2_inode_t *ip, int always_moderate)
@@ -2902,7 +2917,8 @@ hammer2_pfs_moderate(hammer2_inode_t *ip, int always_moderate)
        struct mount *mp = pmp->mp;
 
        if (mp && vn_syncer_count(mp) > hammer2_limit_dirty_inodes) {
-               vn_syncer_one(mp);
+               speedup_syncer(mp);
+               /*vn_syncer_one(mp);*/
        }
 }
 
@@ -2924,6 +2940,8 @@ hammer2_pfs_memory_wait(hammer2_inode_t *ip, int always_moderate)
        static int zzticks;
 #endif
 
+       return; /* XXX */
+
        /*
         * Moderate the number of dirty inodes
         */
index c0afe80..645e470 100644 (file)
@@ -180,7 +180,9 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
        if ((ip->flags & (HAMMER2_INODE_ISUNLINKED |
                          HAMMER2_INODE_MODIFIED |
                          HAMMER2_INODE_RESIZED |
-                         HAMMER2_INODE_DIRTYDATA)) &&
+                         HAMMER2_INODE_DIRTYDATA |
+                         HAMMER2_INODE_CREATING |
+                         HAMMER2_INODE_DELETING)) &&
            (ip->flags & HAMMER2_INODE_ISDELETED) == 0) {
                hammer2_spin_ex(&pmp->list_spin);
                if ((ip->flags & (HAMMER2_INODE_SYNCQ |
@@ -253,11 +255,11 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
        /*
         * Flush dirty chains related to the inode.
         *
-        * NOTE! XXX We do not currently flush to the volume root, ultimately
-        *       we will want to have a shortcut for the flushed inode stored
-        *       in the volume root for recovery purposes.
+        * NOTE! We are not in a flush transaction, so we should not use the
+        *       PARENTONFLUSH flag.  The inode remains on the sideq so the
+        *       filesystem syncer can synchronize it to the volume root.
         */
-       error2 = hammer2_inode_chain_flush(ip);
+       error2 = hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
        if (error2)
                error1 = error2;
 
@@ -517,7 +519,7 @@ done:
         * Cleanup.
         */
        hammer2_inode_unlock(ip);
-       hammer2_trans_done(ip->pmp, 1);
+       hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);
        hammer2_knote(ip->vp, kflags);
 
        return (error);
@@ -829,7 +831,11 @@ hammer2_vop_write(struct vop_write_args *ap)
                hammer2_trans_init(ip->pmp, 0);
        }
        error = hammer2_write_file(ip, uio, ioflag, seqcount);
-       hammer2_trans_done(ip->pmp, 1);
+       if (uio->uio_segflg == UIO_NOCOPY)
+               hammer2_trans_done(ip->pmp, HAMMER2_TRANS_BUFCACHE |
+                                           HAMMER2_TRANS_SIDEQ);
+       else
+               hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);
 
        return (error);
 }
@@ -1430,7 +1436,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
                hammer2_inode_unlock(dip);
        }
 
-       hammer2_trans_done(dip->pmp, 1);
+       hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
 
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
@@ -1525,8 +1531,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         * Can return NULL and error == EXDEV if the common parent
         * crosses a directory with the xlink flag set.
         */
-       hammer2_inode_lock(tdip, 0);
-       hammer2_inode_lock(ip, 0);
+       hammer2_inode_lock4(tdip, ip, NULL, NULL);
 
        /*
         * Create the directory entry and bump nlinks.
@@ -1553,7 +1558,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
        hammer2_inode_unlock(ip);
        hammer2_inode_unlock(tdip);
 
-       hammer2_trans_done(ip->pmp, 1);
+       hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);
        hammer2_knote(ap->a_vp, NOTE_LINK);
        hammer2_knote(ap->a_dvp, NOTE_WRITE);
 
@@ -1631,7 +1636,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap)
                hammer2_inode_unlock(dip);
        }
 
-       hammer2_trans_done(dip->pmp, 1);
+       hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
 
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
@@ -1703,7 +1708,7 @@ hammer2_vop_nmknod(struct vop_nmknod_args *ap)
                hammer2_inode_unlock(dip);
        }
 
-       hammer2_trans_done(dip->pmp, 1);
+       hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
 
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
@@ -1761,7 +1766,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
                        nip = NULL;
                }
                *ap->a_vpp = NULL;
-               hammer2_trans_done(dip->pmp, 1);
+               hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
                return error;
        }
        *ap->a_vpp = hammer2_igetv(nip, &error);
@@ -1807,7 +1812,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
                hammer2_inode_unlock(dip);
        }
 
-       hammer2_trans_done(dip->pmp, 1);
+       hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
 
        /*
         * Finalize namecache
@@ -1903,7 +1908,7 @@ hammer2_vop_nremove(struct vop_nremove_args *ap)
                hammer2_inode_unlock(dip);
        }
 
-       hammer2_trans_done(dip->pmp, 1);
+       hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
        if (error == 0) {
                cache_unlink(ap->a_nch);
                hammer2_knote(ap->a_dvp, NOTE_WRITE);
@@ -1980,7 +1985,7 @@ hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
                hammer2_inode_unlock(dip);
        }
 
-       hammer2_trans_done(dip->pmp, 1);
+       hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
        if (error == 0) {
                cache_unlink(ap->a_nch);
                hammer2_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
@@ -2058,23 +2063,21 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
         * test.  (tip) can be NULL.
         */
        error = 0;
-       if (fdip <= tdip) {
-               hammer2_inode_lock(fdip, 0);
-               hammer2_inode_lock(tdip, 0);
-       } else {
-               hammer2_inode_lock(tdip, 0);
-               hammer2_inode_lock(fdip, 0);
-       }
-       if (tip) {
-               if (ip <= tip) {
-                       hammer2_inode_lock(ip, 0);
-                       hammer2_inode_lock(tip, 0);
-               } else {
-                       hammer2_inode_lock(tip, 0);
-                       hammer2_inode_lock(ip, 0);
+       {
+               hammer2_inode_t *ip1 = fdip;
+               hammer2_inode_t *ip2 = tdip;
+               hammer2_inode_t *ip3 = ip;
+               hammer2_inode_t *ip4 = tip;     /* may be NULL */
+
+               if (fdip > tdip) {
+                       ip1 = tdip;
+                       ip2 = fdip;
                }
-       } else {
-               hammer2_inode_lock(ip, 0);
+               if (tip && ip > tip) {
+                       ip3 = tip;
+                       ip4 = ip;
+               }
+               hammer2_inode_lock4(ip1, ip2, ip3, ip4);
        }
 
        /*
@@ -2195,7 +2198,7 @@ done2:
        hammer2_inode_unlock(tdip);
        hammer2_inode_unlock(fdip);
        hammer2_inode_drop(ip);
-       hammer2_trans_done(tdip->pmp, 1);
+       hammer2_trans_done(tdip->pmp, HAMMER2_TRANS_SIDEQ);
 
        /*
         * Issue the namecache update after unlocking all the internal
index 1406906..d8fe3bb 100644 (file)
@@ -289,7 +289,7 @@ hammer2_xop_nresolve(hammer2_xop_t *arg, void *scratch, int clindex)
        }
 
        /*
-        * If the entry is a hardlink pointer, resolve it.
+        * Locate the target inode for a directory entry
         */
        if (chain && chain->error == 0) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_DIRENT) {
@@ -544,10 +544,15 @@ hammer2_xop_nrename(hammer2_xop_t *arg, void *scratch, int clindex)
                        parent = NULL;
                        goto done;
                }
-               parent = hammer2_chain_getparent(chain, HAMMER2_RESOLVE_ALWAYS);
-               if (parent == NULL) {
-                       error = HAMMER2_ERROR_EIO;
-                       goto done;
+               if (ip->flags & HAMMER2_INODE_CREATING) {
+                       parent = NULL;
+               } else {
+                       parent = hammer2_chain_getparent(chain,
+                                                   HAMMER2_RESOLVE_ALWAYS);
+                       if (parent == NULL) {
+                               error = HAMMER2_ERROR_EIO;
+                               goto done;
+                       }
                }
        } else {
                /*
@@ -756,8 +761,8 @@ hammer2_xop_nrename(hammer2_xop_t *arg, void *scratch, int clindex)
                                           xop->lhc, xop->lhc,
                                           &error, 0);
                KKASSERT(tmp == NULL);
-               error = hammer2_chain_create(&parent, &chain,
-                                            pmp, HAMMER2_METH_DEFAULT,
+               error = hammer2_chain_create(&parent, &chain, NULL, pmp,
+                                            HAMMER2_METH_DEFAULT,
                                             xop->lhc, 0,
                                             HAMMER2_BREF_TYPE_INODE,
                                             HAMMER2_INODE_BYTES,
@@ -1035,8 +1040,8 @@ hammer2_xop_inode_mkdirent(hammer2_xop_t *arg, void *scratch, int clindex)
        else
                data_len = HAMMER2_ALLOC_MIN;
 
-       error = hammer2_chain_create(&parent, &chain,
-                                    xop->head.ip1->pmp, HAMMER2_METH_DEFAULT,
+       error = hammer2_chain_create(&parent, &chain, NULL, xop->head.ip1->pmp,
+                                    HAMMER2_METH_DEFAULT,
                                     xop->lhc, 0,
                                     HAMMER2_BREF_TYPE_DIRENT,
                                     data_len,
@@ -1108,8 +1113,8 @@ hammer2_xop_inode_create(hammer2_xop_t *arg, void *scratch, int clindex)
                goto fail;
        }
 
-       error = hammer2_chain_create(&parent, &chain,
-                                    xop->head.ip1->pmp, HAMMER2_METH_DEFAULT,
+       error = hammer2_chain_create(&parent, &chain, NULL, xop->head.ip1->pmp,
+                                    HAMMER2_METH_DEFAULT,
                                     xop->lhc, 0,
                                     HAMMER2_BREF_TYPE_INODE,
                                     HAMMER2_INODE_BYTES,
@@ -1140,6 +1145,166 @@ fail:
        }
 }
 
+/*
+ * Create inode as above but leave it detached from the hierarchy.
+ */
+void
+hammer2_xop_inode_create_det(hammer2_xop_t *arg, void *scratch, int clindex)
+{
+       hammer2_xop_create_t *xop = &arg->xop_create;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       hammer2_chain_t *null_parent;
+       hammer2_key_t key_next;
+       hammer2_inode_t *pip;
+       hammer2_inode_t *iroot;
+       int error;
+
+       if (hammer2_debug & 0x0001)
+               kprintf("inode_create_det lhc %016jx clindex %d\n",
+                       xop->lhc, clindex);
+
+       pip = xop->head.ip1;
+       iroot = pip->pmp->iroot;
+
+       parent = hammer2_inode_chain(iroot, clindex, HAMMER2_RESOLVE_ALWAYS);
+
+       if (parent == NULL) {
+               error = HAMMER2_ERROR_EIO;
+               chain = NULL;
+               goto fail;
+       }
+       chain = hammer2_chain_lookup(&parent, &key_next,
+                                    xop->lhc, xop->lhc,
+                                    &error, 0);
+       if (chain) {            /* key lhc already exists under iroot */
+               error = HAMMER2_ERROR_EEXIST;
+               goto fail;
+       }
+
+       /*
+        * Create as a detached chain with no parent.  We must specify
+        * the comp/check methods explicitly; they are taken from pip.
+        */
+       null_parent = NULL;
+       error = hammer2_chain_create(&null_parent, &chain,
+                                    parent->hmp, pip->pmp,
+                                    HAMMER2_ENC_COMP(pip->meta.comp_algo) +
+                                    HAMMER2_ENC_CHECK(pip->meta.check_algo),
+                                    xop->lhc, 0,
+                                    HAMMER2_BREF_TYPE_INODE,
+                                    HAMMER2_INODE_BYTES,
+                                    xop->head.mtid, 0, xop->flags);
+       if (error == 0) {
+               error = hammer2_chain_modify(chain, xop->head.mtid, 0, 0);
+               if (error == 0) {
+                       chain->data->ipdata.meta = xop->meta;
+                       if (xop->head.name1) {
+                               bcopy(xop->head.name1,
+                                     chain->data->ipdata.filename,
+                                     xop->head.name1_len);
+                               chain->data->ipdata.meta.name_len =
+                                       xop->head.name1_len;
+                       }
+                       chain->data->ipdata.meta.name_key = xop->lhc;
+               }
+       }
+fail:
+       if (parent) {
+               hammer2_chain_unlock(parent);
+               hammer2_chain_drop(parent);
+       }
+       hammer2_xop_feed(&xop->head, chain, clindex, error); /* chain may be NULL */
+       if (chain) {
+               hammer2_chain_unlock(chain);
+               hammer2_chain_drop(chain);
+       }
+}
+
+/*
+ * Take a detached inode chain and insert it into the topology under iroot.
+ */
+void
+hammer2_xop_inode_create_ins(hammer2_xop_t *arg, void *scratch, int clindex)
+{
+       hammer2_xop_create_t *xop = &arg->xop_create;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       hammer2_key_t key_next;
+       int error;
+
+       if (hammer2_debug & 0x0001)
+               kprintf("inode_create_ins lhc %016jx clindex %d\n",
+                       xop->lhc, clindex);
+
+       /*
+        * (parent) will be the insertion point for inode under iroot
+        */
+       parent = hammer2_inode_chain(xop->head.ip1->pmp->iroot, clindex,
+                                    HAMMER2_RESOLVE_ALWAYS);
+       if (parent == NULL) {
+               error = HAMMER2_ERROR_EIO;
+               chain = NULL;
+               goto fail;
+       }
+       chain = hammer2_chain_lookup(&parent, &key_next,
+                                    xop->lhc, xop->lhc,
+                                    &error, 0);
+       if (chain) {            /* key lhc already exists under iroot */
+               error = HAMMER2_ERROR_EEXIST;
+               goto fail;
+       }
+
+       /*
+        * (chain) is the detached inode that is being inserted
+        */
+       chain = hammer2_inode_chain(xop->head.ip1, clindex,
+                                    HAMMER2_RESOLVE_ALWAYS);
+       if (chain == NULL) {
+               error = HAMMER2_ERROR_EIO;
+               chain = NULL;           /* redundant, already NULL */
+               goto fail;
+       }
+
+       /*
+        * This create call will insert the non-NULL chain into parent.
+        * Most of the auxiliary fields are ignored since the chain already
+        * exists.
+        */
+       error = hammer2_chain_create(&parent, &chain, NULL, xop->head.ip1->pmp,
+                                    HAMMER2_METH_DEFAULT,
+                                    xop->lhc, 0,
+                                    HAMMER2_BREF_TYPE_INODE,
+                                    HAMMER2_INODE_BYTES,
+                                    xop->head.mtid, 0, xop->flags);
+#if 0
+       if (error == 0) {
+               error = hammer2_chain_modify(chain, xop->head.mtid, 0, 0);
+               if (error == 0) {
+                       chain->data->ipdata.meta = xop->meta;
+                       if (xop->head.name1) {
+                               bcopy(xop->head.name1,
+                                     chain->data->ipdata.filename,
+                                     xop->head.name1_len);
+                               chain->data->ipdata.meta.name_len =
+                                       xop->head.name1_len;
+                       }
+                       chain->data->ipdata.meta.name_key = xop->lhc;
+               }
+       }
+#endif
+fail:
+       if (parent) {
+               hammer2_chain_unlock(parent);
+               hammer2_chain_drop(parent);
+       }
+       hammer2_xop_feed(&xop->head, chain, clindex, error); /* chain may be NULL */
+       if (chain) {
+               hammer2_chain_unlock(chain);
+               hammer2_chain_drop(chain);
+       }
+}
+
 /*
  * Inode delete helper (backend, threaded)
  *
@@ -1167,10 +1332,22 @@ hammer2_xop_inode_destroy(hammer2_xop_t *arg, void *scratch, int clindex)
                error = HAMMER2_ERROR_EIO;
                goto done;
        }
-       parent = hammer2_chain_getparent(chain, HAMMER2_RESOLVE_ALWAYS);
-       if (parent == NULL) {
-               error = HAMMER2_ERROR_EIO;
-               goto done;
+
+       if (ip->flags & HAMMER2_INODE_CREATING) {
+               /*
+                * Inode's chains are not linked into the media topology
+                * because it is a new inode (which is now being destroyed).
+                */
+               parent = NULL;
+       } else {
+               /*
+                * Inode's chains are linked into the media topology
+                */
+               parent = hammer2_chain_getparent(chain, HAMMER2_RESOLVE_ALWAYS);
+               if (parent == NULL) {
+                       error = HAMMER2_ERROR_EIO;
+                       goto done;
+               }
        }
        KKASSERT(chain->parent == parent);
 
@@ -1298,8 +1475,8 @@ hammer2_xop_inode_connect(hammer2_xop_t *arg, void *scratch, int clindex)
        /*
         * Reconnect the chain to the new parent directory
         */
-       error = hammer2_chain_create(&parent, &chain,
-                                    pmp, HAMMER2_METH_DEFAULT,
+       error = hammer2_chain_create(&parent, &chain, NULL, pmp,
+                                    HAMMER2_METH_DEFAULT,
                                     xop->lhc, 0,
                                     HAMMER2_BREF_TYPE_INODE,
                                     HAMMER2_INODE_BYTES,