hammer2 - Cleanup various races, better flush
authorMatthew Dillon <dillon@apollo.backplane.com>
Thu, 13 Dec 2012 08:08:05 +0000 (00:08 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Thu, 13 Dec 2012 08:08:05 +0000 (00:08 -0800)
* Cleanup various topological scan races

* Temporarily release the parent lock when diving a child.

* Start work on a chain movement (disconnect from parent) interlock,
  which we need for stability during flushes, renames, and hardlink
  operations.  This will likely be rewritten.

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_ccms.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_vnops.c

index 724ad85..ddfe68f 100644 (file)
@@ -124,6 +124,7 @@ struct hammer2_chain {
        hammer2_media_data_t *data;     /* modified copy of data (rw) */
        u_int           bytes;          /* physical size of data */
        int             index;          /* index in parent */
+       u_int           movelock;
        u_int           refs;
        u_int           flags;
 };
@@ -156,6 +157,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_MODIFIED_AUX     0x00000800      /* hmp->vchain only */
 #define HAMMER2_CHAIN_MODIFY_TID       0x00001000      /* mod updates field */
 #define HAMMER2_CHAIN_MOUNTED          0x00002000      /* PFS is mounted */
+#define HAMMER2_CHAIN_MOVELOCK_WAITING 0x00004000      /* movelock stalled */
 
 /*
  * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
@@ -163,6 +165,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_LOOKUP_NOLOCK          0x00000001      /* ref only */
 #define HAMMER2_LOOKUP_NODATA          0x00000002      /* data left NULL */
 #define HAMMER2_LOOKUP_SHARED          0x00000100
+#define HAMMER2_LOOKUP_MAYDELETE       0x00000200
 
 /*
  * Flags passed to hammer2_chain_modify() and hammer2_chain_resize()
@@ -187,6 +190,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_RESOLVE_MASK           0x0F
 
 #define HAMMER2_RESOLVE_SHARED         0x10
+#define HAMMER2_RESOLVE_MAYDELETE      0x20
 
 /*
  * Cluster different types of storage together for allocations
@@ -244,7 +248,6 @@ struct hammer2_inode {
        hammer2_chain_t         chain;
        struct hammer2_inode_data ip_data;
        struct lockf            advlock;
-       u_int                   depth;          /* directory depth */
        hammer2_off_t           delta_dcount;   /* adjust data_count */
        hammer2_off_t           delta_icount;   /* adjust inode_count */
 };
@@ -421,6 +424,8 @@ int hammer2_inode_duplicate(hammer2_inode_t *dip,
                        const uint8_t *name, size_t name_len);
 int hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
                        const uint8_t *name, size_t name_len);
+hammer2_inode_t *hammer2_inode_common_parent(hammer2_mount_t *hmp,
+                       hammer2_inode_t *fdip, hammer2_inode_t *tdip);
 
 int hammer2_unlink_file(hammer2_inode_t *dip,
                        const uint8_t *name, size_t name_len,
@@ -441,6 +446,8 @@ void hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 void hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 void hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 int hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how);
+int hammer2_chain_lock_pair(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                               hammer2_chain_t *chain, int how);
 void hammer2_chain_moved(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 void hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                                int flags);
index 5da3799..4e6eb07 100644 (file)
@@ -182,8 +182,15 @@ struct ccms_lock {
  * 64 byte area might be represented as (0,63).  The offsets are UNSIGNED
  * entities.
  *
- * count - negative value indicates active exclusive lock, positive value
+ * High level CST locks must be obtained top-down.
+ *
+ * count - Negative value indicates active exclusive lock, positive value
  *        indicates active shared lock.
+ *
+ * spin  - Structural spinlock, typically just one is held at a time.
+ *        However, to complement the top-down nature of the higher level
+ *        lock we allow the spin lock to be held recursively in a bottom-up
+ *        fashion for race-to-root flags updates and lastdrop iterations.
  */
 struct ccms_cst {
        struct spinlock spin;           /* thread spinlock */
index 673c0e7..e6d73e7 100644 (file)
@@ -53,6 +53,12 @@ static int hammer2_indirect_optimize;        /* XXX SYSCTL */
 static hammer2_chain_t *hammer2_chain_create_indirect(
                        hammer2_mount_t *hmp, hammer2_chain_t *parent,
                        hammer2_key_t key, int keybits);
+static void hammer2_chain_movelock(hammer2_mount_t *hmp,
+                       hammer2_chain_t *chain);
+static void hammer2_chain_moveunlock(hammer2_mount_t *hmp,
+                       hammer2_chain_t *chain);
+static void hammer2_chain_movewait(hammer2_mount_t *hmp,
+                       hammer2_chain_t *chain);
 
 /*
  * We use a red-black tree to guarantee safe lookups under shared locks.
@@ -73,7 +79,9 @@ hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
  *
  * SUBMODIFIED is not set on the chain passed in.
  *
- * XXX rename of parent can create a SMP race
+ * The chain->cst.spin lock can be held to stabilize the chain->parent
+ * pointer.  The first parent is stabilized by virtue of chain being
+ * fully locked.
  */
 static void
 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
@@ -81,9 +89,18 @@ hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        hammer2_chain_t *parent;
 
        parent = chain->parent;
-       while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
-               atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
-               parent = parent->parent;
+       if (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
+               spin_lock(&parent->cst.spin);
+               for (;;) {
+                       atomic_set_int(&parent->flags,
+                                      HAMMER2_CHAIN_SUBMODIFIED);
+                       if ((chain = parent->parent) == NULL)
+                               break;
+                       spin_lock(&chain->cst.spin);    /* upward interlock */
+                       spin_unlock(&parent->cst.spin);
+                       parent = chain;
+               }
+               spin_unlock(&parent->cst.spin);
        }
 }
 
@@ -169,12 +186,15 @@ hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        hammer2_chain_t *parent;
        hammer2_chain_t *child;
 
+       /*
+        * NOTE: All movelock holders hold a ref so it shouldn't be possible
+        *       for movelock to be non-zero here.
+        */
        KKASSERT(chain->refs == 0);
        KKASSERT((chain->flags &
                  (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);
+       KKASSERT(chain->movelock == 0);
 
-       parent = chain->parent;
-       chain->parent = NULL;
        if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
                ip = chain->u.ip;
        else
@@ -192,12 +212,20 @@ hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        /*
         * If the DELETED flag is not set the chain must be removed from
         * its parent's tree.
+        *
+        * WARNING! chain->cst.spin must be held when chain->parent is
+        *          modified, even though we own the full blown lock,
+        *          to deal with setsubmod and rename races.
         */
        if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
+               spin_lock(&chain->cst.spin);    /* shouldn't be needed */
+               parent = chain->parent;
                RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
                if (ip)
                        ip->pip = NULL;
+               chain->parent = NULL;
+               spin_unlock(&chain->cst.spin);
        }
 
        /*
@@ -303,16 +331,14 @@ hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                KKASSERT(refs > 0);
                if (refs == 1) {
                        /*
-                        * (1) lastdrop successfully drops the chain and
-                        *     returns the parent, we recursively drop the
-                        *     parent.
+                        * (1) lastdrop successfully drops the chain to 0
+                        *     refs and may may not have destroyed it.
+                        *     lastdrop will return the parent so we can
+                        *     recursively drop the implied ref from the
+                        *     1->0 transition.
                         *
                         * (2) lastdrop fails to transition refs from 1 to 0
                         *     and returns the same chain, we retry.
-                        *
-                        * (3) lastdrop fails to drop the chain and returns
-                        *     NULL, leaving the ref intact for a deferred
-                        *     drop later on.
                         */
                        chain = hammer2_chain_lastdrop(hmp, chain);
                } else {
@@ -329,12 +355,12 @@ hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 }
 
 /*
- * On the last drop we have to stabilize chain->parent, which we can do
- * by acquiring the chain->cst.spin lock.  If we get a full-blown lock
- * it messes up the chain_unlock() code's ccms_thread_unlock_zero() call.
+ * Handle SMP races during the last drop.  We must obtain a lock on
+ * chain->parent to stabilize the last pointer reference to chain
+ * (if applicable).  This reference does not have a parallel ref count,
+ * that is idle chains in the topology can have a ref count of 0.
  *
- * Once the spinlock has been obtained we can drop the refs and become the
- * owner of the implied ref on the parent, allowing us to return the parent.
+ * The 1->0 transition implies a ref on the parent.
  */
 static
 hammer2_chain_t *
@@ -343,55 +369,71 @@ hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        hammer2_chain_t *parent;
 
        /*
-        * gain lock, drop refs, return chain to retry if we were unable
-        * to drop the refs from 1 to 0.
+        * Stablize chain->parent with the chain cst's spinlock.
+        * (parent can be NULL here).
         */
        spin_lock(&chain->cst.spin);
-       if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
-               spin_unlock(&chain->cst.spin);
-               return (chain);
-       }
+       parent = chain->parent;
 
        /*
-        * Refs is 0 and we own the implied ref on the parent.  The
-        * chain can still be accessed at this point but any cycling
-        * of its refs will simply build-up more implied refs on the
-        * parent.
+        * CST spin locks are allowed to be held recursively bottom-up
+        * (whereas full CST thread locks can only be held recursively
+        * top-down).
         *
-        * Thus the parent pointer is valid.
+        * This makes things fairly easy.  We still must not block while
+        * obtaining the CST lock on the parent.  If this fails we have to
+        * unwind.
         */
-       parent = chain->parent;
-       spin_unlock(&chain->cst.spin);
+       if (parent) {
+               if (ccms_thread_lock_nonblock(&parent->cst,
+                                             CCMS_STATE_EXCLUSIVE)) {
+                       /* parent cst lock attempt failed */
+                       if (atomic_cmpset_int(&chain->refs, 1, 0)) {
+                               spin_unlock(&chain->cst.spin);  /* success */
+                               return(parent);
+                       } else {
+                               spin_unlock(&chain->cst.spin);  /* failure */
+                               return(chain);
+                       }
+               }
+       }
 
        /*
-        * Attempt to acquire an exclusive lock on the parent.  If this
-        * fails we just leave chain alone but still return the parent
-        * for the drop recursion.
+        * With the parent now held we control the last pointer reference
+        * to chain ONLY IF this is the 1->0 drop.  If we fail to transition
+        * from 1->0 we unwind and retry at chain.
         */
-       if (parent &&
-           ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE)) {
-               return (parent);
+       spin_unlock(&chain->cst.spin);
+       if (!atomic_cmpset_int(&chain->refs, 1, 0)) {
+               if (parent)
+                       ccms_thread_unlock(&parent->cst);
+               return(chain);
        }
 
        /*
-        * With an exclusive lock on the parent in-hand if chain->refs is
-        * still 0 then its impossible for anyone new to access it (or any
-        * of its children), and it can be deallocated.
+        * The flusher is allowe to set movelock on a child and then release
+        * the parent's lock, and ultimately also release the child's lock
+        * while still holding it referenced.  The reference should prevent
+        * this case from being hit.
         */
-       if (chain->refs == 0) {
-               ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
-               hammer2_chain_dealloc(hmp, chain);
-       }
+       KKASSERT(chain->movelock == 0);
 
        /*
-        * drop recursion, return parent so the caller can eat the implied
-        * ref we own on it.  We have to use hammer2_chain_unlock() (which
-        * also does a drop so we also need a ref on parent).
+        * Ok, we succeeded.  We now own the implied ref on the parent
+        * associated with the 1->0 transition of the child.  It should not
+        * be possible for ANYTHING to access the child now, as we own the
+        * lock on the parent, so we should be able to safely lock the
+        * child and destroy it.
         */
-       if (parent) {
-               hammer2_chain_ref(hmp, parent);
-               hammer2_chain_unlock(hmp, parent);
-       }
+       ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
+       hammer2_chain_dealloc(hmp, chain);
+
+       /*
+        * We want to return parent with its implied ref to the caller
+        * to recurse and drop the parent.
+        */
+       if (parent)
+               ccms_thread_unlock(&parent->cst);
        return (parent);
 }
 
@@ -422,6 +464,10 @@ hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
  * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
  *                        it will be locked exclusive.
  *
+ * HAMMER2_RESOLVE_MAYDELETE - The caller may attempt to delete the element
+ *                            being locked.  We block as long as the
+ *                            parent's movelock is non-zero.
+ *
  * NOTE: Embedded elements (volume header, inodes) are always resolved
  *      regardless.
  *
@@ -450,10 +496,19 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
         * Ref and lock the element.  Recursive locks are allowed.
         */
        hammer2_chain_ref(hmp, chain);
-       if (how & HAMMER2_RESOLVE_SHARED)
-               ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
-       else
-               ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
+       for (;;) {
+               if (how & HAMMER2_RESOLVE_SHARED)
+                       ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
+               else
+                       ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
+               if ((how & HAMMER2_RESOLVE_MAYDELETE) == 0 ||
+                   chain->parent == NULL ||
+                   chain->parent->movelock == 0) {
+                       break;
+               }
+               ccms_thread_unlock(&chain->cst);
+               hammer2_chain_movewait(hmp, chain);
+       }
 
        /*
         * If we already have a valid data pointer no further action is
@@ -602,6 +657,27 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
        return (0);
 }
 
+/*
+ * Similar to the normal chain_lock but handles HAMMER2_RESOLVE_MAYDELETE
+ * in a more intelligent fashion.
+ */
+int
+hammer2_chain_lock_pair(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                       hammer2_chain_t *chain, int how)
+{
+       int error;
+
+       error = hammer2_chain_lock(hmp, parent,
+                                  how & ~HAMMER2_RESOLVE_MAYDELETE);
+       if (error == 0) {
+               error = hammer2_chain_lock(hmp, chain, how);
+               if (error) {
+                       hammer2_chain_unlock(hmp, parent);
+               }
+       }
+       return error;
+}
+
 /*
  * Unlock and deref a chain element.
  *
@@ -724,6 +800,55 @@ hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        hammer2_chain_drop(hmp, chain);
 }
 
+/*
+ * Called on a locked chain element.  Allows the caller to temporarily
+ * unlock the element (caller must be sure to hold an extra ref on it
+ * to prevent destruction), thus allowing other accessors to lock it,
+ * but disallows deletions.
+ */
+static void
+hammer2_chain_movelock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       atomic_add_int(&chain->movelock, 1);
+}
+
+static void
+hammer2_chain_moveunlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       u_int omovelock;
+       u_int nmovelock;
+
+       for (;;) {
+               omovelock = chain->movelock;
+               cpu_ccfence();
+               nmovelock = (omovelock - 1) & 0x7FFFFFFF;
+               if (atomic_cmpset_int(&chain->movelock, omovelock, nmovelock)) {
+                       if (omovelock & 0x80000000)
+                               wakeup(&chain->movelock);
+                       break;
+               }
+       }
+}
+
+static void
+hammer2_chain_movewait(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       u_int omovelock;
+       u_int nmovelock;
+
+       for (;;) {
+               omovelock = chain->movelock;
+               cpu_ccfence();
+               if (omovelock == 0)
+                       break;
+               nmovelock = omovelock | 0x80000000;
+               tsleep_interlock(&chain->movelock, 0);
+               if (atomic_cmpset_int(&chain->movelock, omovelock, nmovelock)) {
+                       tsleep(&chain->movelock, PINTERLOCKED, "movelk", 0);
+               }
+       }
+}
+
 /*
  * Resize the chain's physical storage allocation.  Chains can be resized
  * smaller without reallocating the storage.  Resizing larger will reallocate
@@ -1093,6 +1218,8 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                how = HAMMER2_RESOLVE_MAYBE;
        if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
                how |= HAMMER2_RESOLVE_SHARED;
+       if (flags & HAMMER2_LOOKUP_MAYDELETE)
+               how |= HAMMER2_RESOLVE_MAYDELETE;
 
        /*
         * First see if we have a (possibly modified) chain element cached
@@ -1200,7 +1327,6 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        ip->pip = parent->u.ip;
                        ip->pmp = parent->u.ip->pmp;
-                       ip->depth = parent->u.ip->depth + 1;
                        ccms_cst_init(&ip->topo_cst, &ip->chain);
                }
        }
@@ -1268,6 +1394,10 @@ hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
                how_maybe |= HAMMER2_RESOLVE_SHARED;
                how_always |= HAMMER2_RESOLVE_SHARED;
        }
+       if (flags & HAMMER2_LOOKUP_MAYDELETE) {
+               how_maybe |= HAMMER2_RESOLVE_MAYDELETE;
+               how_always |= HAMMER2_RESOLVE_MAYDELETE;
+       }
 
        /*
         * Recurse (*parentp) upward if necessary until the parent completely
@@ -1427,6 +1557,8 @@ hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
 
        if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
                how_maybe |= HAMMER2_RESOLVE_SHARED;
+       if (flags & HAMMER2_LOOKUP_MAYDELETE)
+               how_maybe |= HAMMER2_RESOLVE_MAYDELETE;
 
        parent = *parentp;
 
@@ -1792,7 +1924,6 @@ again:
                if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        ip->pip = scan->u.ip;
                        ip->pmp = scan->u.ip->pmp;
-                       ip->depth = scan->u.ip->depth + 1;
                        ip->pip->delta_icount += ip->ip_data.inode_count;
                        ip->pip->delta_dcount += ip->ip_data.data_count;
                        ++ip->pip->delta_icount;
@@ -2139,15 +2270,21 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                 * We must still set SUBMODIFIED in the parent but we do
                 * that after the loop.
                 *
+                * WARNING! chain->cst.spin must be held when chain->parent is
+                *          modified, even though we own the full blown lock,
+                *          to deal with setsubmod and rename races.
+                *
                 * XXX we really need a lock here but we don't need the
                 *     data.  NODATA feature needed.
                 */
                chain = hammer2_chain_get(hmp, parent, i,
                                          HAMMER2_LOOKUP_NODATA);
+               spin_lock(&chain->cst.spin);
                RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
                if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain))
                        panic("hammer2_chain_create_indirect: collision");
                chain->parent = ichain;
+               spin_unlock(&chain->cst.spin);
                if (base)
                        bzero(&base[i], sizeof(base[i]));
                atomic_add_int(&parent->refs, -1);
@@ -2232,11 +2369,14 @@ hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
  * referenced.  (*parentp) will be modified in a manner similar to a lookup
  * or iteration when indirect blocks are also deleted as a side effect.
  *
+ * The caller must ensure that the chain is locked with the MAYDELETE
+ * flag to interlock chain->movelock.
+ *
  * XXX This currently does not adhere to the MOVED flag protocol in that
  *     the removal is immediately indicated in the parent's blockref[]
  *     array.
  *
- * Must be called with an exclusively locked parent.
+ * Must be called with an exclusively locked parent and chain.
  */
 void
 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
@@ -2246,9 +2386,17 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        hammer2_inode_t *ip;
        int count;
 
+       /*
+        * NOTE: Caller is responsible for using MAYDELETE flags when
+        *       acquiring chain elements that it desires to delete.
+        *       This flag should interlock the movelock flag.  Parent
+        *       chain must remain locked (the flusher can set movelock
+        *       on children otherwise).
+        */
        if (chain->parent != parent)
                panic("hammer2_chain_delete: parent mismatch");
        KKASSERT(ccms_thread_lock_owned(&parent->cst));
+       KKASSERT(chain->movelock == 0);
 
        /*
         * Mark the parent modified so our base[] pointer remains valid
@@ -2287,16 +2435,22 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        /*
         * Disconnect the bref in the parent, remove the chain, and
         * disconnect in-memory fields from the parent.
+        *
+        * WARNING! chain->cst.spin must be held when chain->parent is
+        *          modified, even though we own the full blown lock,
+        *          to deal with setsubmod and rename races.
         */
        KKASSERT(chain->index >= 0 && chain->index < count);
        if (base)
                bzero(&base[chain->index], sizeof(*base));
 
+       spin_lock(&chain->cst.spin);
        RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
        atomic_add_int(&parent->refs, -1);      /* for red-black entry */
        chain->index = -1;
        chain->parent = NULL;
+       spin_unlock(&chain->cst.spin);
 
        /*
         * Cumulative adjustments must be propagated to the parent inode
@@ -2319,9 +2473,10 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                        ip->delta_icount = 0;
                        ip->delta_dcount = 0;
                        --ip->pip->delta_icount;
+                       spin_lock(&chain->cst.spin); /* XXX */
                        ip->pip = NULL;
+                       spin_unlock(&chain->cst.spin);
                }
-               chain->u.ip->depth = 0;
        }
 
        /*
@@ -2419,7 +2574,7 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
         */
        if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                hammer2_chain_t *child;
-               hammer2_chain_t *next;
+               hammer2_chain_t *saved;
                hammer2_blockref_t *base;
                int count;
 
@@ -2430,22 +2585,28 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                 * synchronizing block updates which occurred.
                 *
                 * We don't want to set our chain to MODIFIED gratuitously.
+                *
+                * We need an extra ref on chain because we are going to
+                * release its lock temporarily in our child loop.
                 */
                /* XXX SUBMODIFIED not interlocked, can race */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
+               hammer2_chain_ref(hmp, chain);
 
                /*
                 * Flush the children and update the blockrefs in the chain.
                 * Be careful of ripouts during the loop.
                 */
-               next = RB_MIN(hammer2_chain_tree, &chain->rbhead);
-               if (next)
-                       hammer2_chain_ref(hmp, next);
-               while ((child = next) != NULL) {
-                       next = RB_NEXT(hammer2_chain_tree,
-                                      &chain->rbhead, child);
-                       if (next)
-                               hammer2_chain_ref(hmp, next);
+               saved = NULL;
+               RB_FOREACH(child, hammer2_chain_tree, &chain->rbhead) {
+                       KKASSERT(child->parent == chain);
+
+                       if (saved) {
+                               hammer2_chain_moveunlock(hmp, saved);
+                               hammer2_chain_drop(hmp, saved);
+                               saved = NULL;
+                       }
+
                        /*
                         * We only recurse if SUBMODIFIED (internal node)
                         * or MODIFIED (internal node or leaf) is set.
@@ -2456,16 +2617,20 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                        if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
                                             HAMMER2_CHAIN_MODIFIED |
                                            HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
-                               hammer2_chain_drop(hmp, child);
                                continue;
                        }
+                       saved = child;
+                       hammer2_chain_ref(hmp, child);
+                       hammer2_chain_movelock(hmp, child);
+                       hammer2_chain_unlock(hmp, chain);
                        hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
-                       hammer2_chain_drop(hmp, child);
-                       if (child->parent != chain ||
-                           (child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                       KKASSERT(child->parent == chain);
+                       if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
                                             HAMMER2_CHAIN_MODIFIED |
                                            HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
                                hammer2_chain_unlock(hmp, child);
+                               hammer2_chain_lock(hmp, chain,
+                                                  HAMMER2_RESOLVE_MAYBE);
                                continue;
                        }
 
@@ -2483,27 +2648,24 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                        hammer2_chain_flush_pass1(hmp, child, info);
                        --info->depth;
                        hammer2_chain_unlock(hmp, child);
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       KKASSERT(child->parent == chain);
+               }
+               if (saved) {
+                       hammer2_chain_moveunlock(hmp, saved);
+                       hammer2_chain_drop(hmp, saved);
+                       /*saved = NULL; not needed */
                }
 
                /*
                 * Now synchronize any block updates.
                 */
-               next = RB_MIN(hammer2_chain_tree, &chain->rbhead);
-               if (next)
-                       hammer2_chain_ref(hmp, next);
-               while ((child = next) != NULL) {
-                       next = RB_NEXT(hammer2_chain_tree,
-                                      &chain->rbhead, child);
-                       if (next)
-                               hammer2_chain_ref(hmp, next);
-                       if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
-                               hammer2_chain_drop(hmp, child);
+               RB_FOREACH(child, hammer2_chain_tree, &chain->rbhead) {
+                       if ((child->flags & HAMMER2_CHAIN_MOVED) == 0)
                                continue;
-                       }
                        hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
-                       hammer2_chain_drop(hmp, child);
-                       if (child->parent != chain ||
-                           (child->flags & HAMMER2_CHAIN_MOVED) == 0) {
+                       KKASSERT(child->parent == chain);
+                       if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
                                hammer2_chain_unlock(hmp, child);
                                continue;
                        }
@@ -2554,6 +2716,7 @@ hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                        hammer2_chain_drop(hmp, child); /* MOVED flag */
                        hammer2_chain_unlock(hmp, child);
                }
+               hammer2_chain_drop(hmp, chain);
        }
 
        /*
@@ -2842,6 +3005,8 @@ hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
  * If modify_tid is 0 (usual case), a new modify_tid is allocated and
  * applied to the flush.  The depth-limit handling code is the only
  * code which passes a non-zero modify_tid to hammer2_chain_flush().
+ *
+ * chain is locked on call and will remain locked on return.
  */
 void
 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
index e02d432..5299a3c 100644 (file)
@@ -447,7 +447,7 @@ hammer2_inode_duplicate(hammer2_inode_t *dip, hammer2_inode_t *oip,
  * If (oip) is already connected we create a OBJTYPE_HARDLINK entry which
  * points to (oip)'s inode number.  (oip) is expected to be the terminus of
  * the hardlink sitting as a hidden file in a common parent directory
- * in this situation (thus the lock order is correct).
+ * in this situation.
  */
 int
 hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
@@ -461,6 +461,26 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
        int error;
        int hlink;
 
+       /*
+        * (oip) is the terminus of the hardlink sitting in the common
+        * parent directory.  This means that if oip->pip != dip then
+        * the already locked oip is ABOVE dip.
+        *
+        * But if the common parent directory IS dip, then we would have
+        * a lock-order reversal and must rearrange the lock ordering.
+        * For now the caller deals with this for us by locking dip in
+        * that case (and our lock here winds up just being recursive)
+        */
+       parent = &dip->chain;
+       if (oip->pip == dip) {
+               hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
+               hammer2_chain_lock(hmp, &oip->chain, HAMMER2_RESOLVE_ALWAYS);
+       } else {
+               hammer2_chain_lock(hmp, &oip->chain, HAMMER2_RESOLVE_ALWAYS);
+               hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
+       }
+
+
        lhc = hammer2_dirhash(name, name_len);
        hlink = (oip->chain.parent != NULL);
 
@@ -475,9 +495,6 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
         * entry in.  At the same time check for key collisions
         * and iterate until we don't get one.
         */
-       parent = &dip->chain;
-       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
-
        error = 0;
        while (error == 0) {
                chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
@@ -519,6 +536,7 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
         */
        if (error) {
                KKASSERT(chain == NULL);
+               hammer2_chain_unlock(hmp, &oip->chain);
                return (error);
        }
 
@@ -581,7 +599,7 @@ hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
                }
                oip->ip_data.nlinks = 1;
        }
-
+       hammer2_chain_unlock(hmp, &oip->chain);
        return (0);
 }
 
@@ -620,7 +638,7 @@ hammer2_unlink_file(hammer2_inode_t *dip,
        hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
        chain = hammer2_chain_lookup(hmp, &parent,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
-                                    0);
+                                    HAMMER2_LOOKUP_MAYDELETE);
        while (chain) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                    chain->u.ip &&
@@ -630,7 +648,7 @@ hammer2_unlink_file(hammer2_inode_t *dip,
                }
                chain = hammer2_chain_next(hmp, &parent, chain,
                                           lhc, lhc + HAMMER2_DIRHASH_LOMASK,
-                                          0);
+                                          HAMMER2_LOOKUP_MAYDELETE);
        }
 
        /*
@@ -705,8 +723,9 @@ hammer2_unlink_file(hammer2_inode_t *dip,
                 * pointer entry.
                 */
                parent = oip->chain.parent;
-               hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
-               hammer2_chain_lock(hmp, &oip->chain, HAMMER2_RESOLVE_ALWAYS);
+               hammer2_chain_lock_pair(hmp, parent, &oip->chain,
+                                       HAMMER2_RESOLVE_ALWAYS |
+                                       HAMMER2_RESOLVE_MAYDELETE);
                hammer2_chain_delete(hmp, parent, &oip->chain,
                                    (retain_ip == oip));
                hammer2_chain_unlock(hmp, &oip->chain);
@@ -721,9 +740,9 @@ hammer2_unlink_file(hammer2_inode_t *dip,
                        dparent = chain->parent;
                        hammer2_chain_ref(hmp, chain);
                        hammer2_chain_unlock(hmp, chain);
-                       hammer2_chain_lock(hmp, dparent,
-                                          HAMMER2_RESOLVE_ALWAYS);
-                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
+                       hammer2_chain_lock_pair(hmp, dparent, chain,
+                                          HAMMER2_RESOLVE_ALWAYS |
+                                          HAMMER2_RESOLVE_MAYDELETE);
                        hammer2_chain_drop(hmp, chain);
                        hammer2_chain_modify(hmp, chain, 0);
                        --ip->ip_data.nlinks;
@@ -807,6 +826,7 @@ hammer2_hardlink_consolidate(hammer2_inode_t **ipp, hammer2_inode_t *tdip)
        hammer2_inode_t *oip = *ipp;
        hammer2_inode_t *nip = NULL;
        hammer2_inode_t *fdip;
+       hammer2_inode_t *cdip;
        hammer2_chain_t *parent;
        int error;
 
@@ -817,30 +837,14 @@ hammer2_hardlink_consolidate(hammer2_inode_t **ipp, hammer2_inode_t *tdip)
        if (hammer2_hardlink_enable == 0)
                return (ENOTSUP);
 
-       /*
-        * Find the common parent directory
-        */
        fdip = oip->pip;
-       while (fdip->depth > tdip->depth) {
-               fdip = fdip->pip;
-               KKASSERT(fdip != NULL);
-       }
-       while (tdip->depth > fdip->depth) {
-               tdip = tdip->pip;
-               KKASSERT(tdip != NULL);
-       }
-       while (fdip != tdip) {
-               fdip = fdip->pip;
-               tdip = tdip->pip;
-               KKASSERT(fdip != NULL);
-               KKASSERT(tdip != NULL);
-       }
+       cdip = hammer2_inode_common_parent(hmp, fdip, tdip);
 
        /*
         * Nothing to do (except bump the link count) if the hardlink has
         * already been consolidated in the correct place.
         */
-       if (oip->pip == fdip &&
+       if (cdip == fdip &&
            (oip->ip_data.name_key & HAMMER2_DIRHASH_VISIBLE) == 0) {
                kprintf("hardlink already consolidated correctly\n");
                nip = oip;
@@ -848,6 +852,7 @@ hammer2_hardlink_consolidate(hammer2_inode_t **ipp, hammer2_inode_t *tdip)
                hammer2_chain_modify(hmp, &nip->chain, 0);
                ++nip->ip_data.nlinks;
                hammer2_inode_unlock_ex(nip);
+               hammer2_inode_drop(cdip);
                return (0);
        }
 
@@ -859,13 +864,13 @@ hammer2_hardlink_consolidate(hammer2_inode_t **ipp, hammer2_inode_t *tdip)
         * under oip to the new hardlink target inode, retiring all chains
         * related to oip before returning.  XXX vp->ip races.
         */
-       error = hammer2_inode_duplicate(fdip, oip, &nip, NULL, 0);
+       error = hammer2_inode_duplicate(cdip, oip, &nip, NULL, 0);
        if (error == 0) {
                /*
                 * Bump nlinks on duplicated hidden inode.
                 */
                kprintf("hardlink consolidation success in parent dir %s\n",
-                       fdip->ip_data.filename);
+                       cdip->ip_data.filename);
                hammer2_inode_lock_nlinks(nip);
                hammer2_inode_unlock_nlinks(oip);
                hammer2_chain_modify(hmp, &nip->chain, 0);
@@ -933,6 +938,7 @@ hammer2_hardlink_consolidate(hammer2_inode_t **ipp, hammer2_inode_t *tdip)
        } else {
                KKASSERT(nip == NULL);
        }
+       hammer2_inode_drop(cdip);
 
        return (error);
 }
@@ -987,7 +993,7 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
                hammer2_chain_unlock(hmp, parent);
                if (chain)
                        break;
-               pip = pip->pip;
+               pip = pip->pip; /* XXX SMP RACE */
        }
        *chainp = chain;
        if (chain) {
@@ -998,3 +1004,45 @@ hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
                return (EIO);
        }
 }
+
+/*
+ * Find the directory common to both fdip and tdip, hold and return
+ * its inode.
+ */
+hammer2_inode_t *
+hammer2_inode_common_parent(hammer2_mount_t *hmp,
+                           hammer2_inode_t *fdip, hammer2_inode_t *tdip)
+{
+       hammer2_inode_t *scan1;
+       hammer2_inode_t *scan2;
+
+       /*
+        * We used to have a depth field but it complicated matters too
+        * much for directory renames.  So now its ugly.  Check for
+        * simple cases before giving up and doing it the expensive way.
+        *
+        * XXX need a bottom-up topology stability lock
+        */
+       if (fdip == tdip || fdip == tdip->pip) {
+               hammer2_inode_ref(fdip);
+               return(fdip);
+       }
+       if (fdip->pip == tdip) {
+               hammer2_inode_ref(tdip);
+               return(tdip);
+       }
+       for (scan1 = fdip; scan1->pmp == fdip->pmp; scan1 = scan1->pip) {
+               scan2 = tdip;
+               while (scan2->pmp == tdip->pmp) {
+                       if (scan1 == scan2) {
+                               hammer2_inode_ref(scan1);
+                               return(scan1);
+                       }
+                       scan2 = scan2->pip;
+               }
+       }
+       panic("hammer2_inode_common_parent: no common parent %p %p\n",
+             fdip, tdip);
+       /* NOT REACHED */
+       return(NULL);
+}
index f51dc65..377032d 100644 (file)
@@ -1148,7 +1148,8 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
        chain = hammer2_chain_lookup(hmp, &parent,
                                     lbase, (hammer2_key_t)-1,
-                                    HAMMER2_LOOKUP_NODATA);
+                                    HAMMER2_LOOKUP_NODATA |
+                                    HAMMER2_LOOKUP_MAYDELETE);
        while (chain) {
                /*
                 * Degenerate embedded data case, nothing to loop on.
@@ -1168,7 +1169,8 @@ hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
                /* XXX check parent if empty indirect block & delete */
                chain = hammer2_chain_next(hmp, &parent, chain,
                                           lbase, (hammer2_key_t)-1,
-                                          HAMMER2_LOOKUP_NODATA);
+                                          HAMMER2_LOOKUP_NODATA |
+                                          HAMMER2_LOOKUP_MAYDELETE);
        }
        hammer2_chain_unlock(hmp, parent);
 }
@@ -1392,6 +1394,8 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap)
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
+               } else {
+                       cache_setvp(ap->a_nch, NULL);
                }
                hammer2_chain_unlock(hmp, chain);
        } else {
@@ -1663,9 +1667,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap)
         *
         * We must reconnect the vp.
         */
-       hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
        error = hammer2_inode_connect(dip, ip, name, name_len);
-       hammer2_chain_unlock(hmp, &ip->chain);
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
                cache_setvp(ap->a_nch, ap->a_vp);
@@ -1957,10 +1959,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap)
         *          deadlocks we want to unlock before issuing a cache_*()
         *          op (that might have to lock a vnode).
         */
-       hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
        error = hammer2_inode_connect(tdip, ip, tname, tname_len);
-       hammer2_chain_unlock(hmp, &ip->chain);
-
        if (error == 0) {
                cache_rename(ap->a_fnch, ap->a_tnch);
        }