hammer2 - cleanup data load, unlink optimization
author     Matthew Dillon <dillon@apollo.backplane.com>
Tue, 26 May 2015 04:42:00 +0000 (21:42 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
Tue, 26 May 2015 04:42:00 +0000 (21:42 -0700)
* Clean up data loading on chain lock.  Use chain flags to interlock data
  loading with either a shared or exclusive lock.

* We no longer upgrade a shared lock to exclusive in order to load data.
  This prevents the deadlock that could previously occur when the caller
  held more than one shared lock.
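
  As a rough illustration of the flag-based interlock introduced here (the
  real implementation is hammer2_chain_load_data() in the hammer2_chain.c
  diff below), the following is a minimal user-space sketch of the same
  pattern.  C11 atomics and a pthread condvar stand in for
  atomic_cmpset_int() and tsleep()/wakeup(); struct chain, its fields, and
  chain_load_data() are illustrative names, not the actual hammer2 API.

    /*
     * Sketch: IOINPROG/IOSIGNAL style interlock.  One thread performs the
     * data load while other lock holders (shared or exclusive) wait for a
     * wakeup instead of upgrading their lock.
     */
    #include <pthread.h>
    #include <stdatomic.h>

    #define CHAIN_IOINPROG  0x00100000u     /* a thread is loading data */
    #define CHAIN_IOSIGNAL  0x00200000u     /* waiters want a wakeup */

    struct chain {
            _Atomic unsigned int flags;
            void *data;                     /* NULL until loaded */
            pthread_mutex_t wait_mtx;       /* backs the condvar only */
            pthread_cond_t  wait_cv;
    };

    void
    chain_load_data(struct chain *chain, void *(*do_io)(void))
    {
            unsigned int oflags, nflags;

            if (chain->data)                /* already resident */
                    return;

            /*
             * Acquire IOINPROG.  If another thread owns it, request a
             * signal and block; retry the CAS loop after every wakeup.
             */
            for (;;) {
                    oflags = atomic_load(&chain->flags);
                    if (oflags & CHAIN_IOINPROG) {
                            nflags = oflags | CHAIN_IOSIGNAL;
                            pthread_mutex_lock(&chain->wait_mtx);
                            if (atomic_compare_exchange_weak(&chain->flags,
                                                             &oflags, nflags)) {
                                    pthread_cond_wait(&chain->wait_cv,
                                                      &chain->wait_mtx);
                            }
                            pthread_mutex_unlock(&chain->wait_mtx);
                    } else {
                            nflags = oflags | CHAIN_IOINPROG;
                            if (atomic_compare_exchange_weak(&chain->flags,
                                                             &oflags, nflags))
                                    break;
                    }
            }

            /* We own IOINPROG; re-check in case we raced another load. */
            if (chain->data == NULL)
                    chain->data = do_io();

            /*
             * Clear IOINPROG and IOSIGNAL together; wake waiters only if
             * somebody actually asked for a signal.
             */
            for (;;) {
                    oflags = atomic_load(&chain->flags);
                    nflags = oflags & ~(CHAIN_IOINPROG | CHAIN_IOSIGNAL);
                    if (atomic_compare_exchange_weak(&chain->flags,
                                                     &oflags, nflags)) {
                            if (oflags & CHAIN_IOSIGNAL) {
                                    pthread_mutex_lock(&chain->wait_mtx);
                                    pthread_cond_broadcast(&chain->wait_cv);
                                    pthread_mutex_unlock(&chain->wait_mtx);
                            }
                            break;
                    }
            }
    }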

* Clean up the chain_core (chain->core) structure.  Remove flags and
  move the main lock from the core to the main chain structure proper.

* Attempt to avoid I/O when unlinking files by not updating the inode
  on the final 1->0 transition of nlinks.  This greatly reduces the amount
  of write I/O occurring during a rm -rf and improves performance by
  roughly 3x.
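
  A condensed, stand-alone model of the new last-link decision (the real
  code is hammer2_unlink_file() in the hammer2_inode.c diff below).  The
  struct inode and unlink_adjust_nlinks() used here are simplified
  stand-ins for the hammer2 cluster API; only the control flow is the
  point.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct inode {
            int64_t nlinks;
            bool    dirty;          /* set when inode media must be written */
            bool    deleted;
    };

    /*
     * Apply a link-count adjustment on unlink.  nlinks_adj is negative when
     * decrementing.  The inode is only dirtied (costing write I/O) when
     * links remain or an unlinked file is still open; the common final
     * unlink of a closed file just deletes the chain without touching the
     * inode media.
     */
    static void
    unlink_adjust_nlinks(struct inode *ip, int64_t nlinks_adj, bool isopen)
    {
            bool last_link = (ip->nlinks + nlinks_adj == 0);

            if (last_link) {
                    if (isopen) {
                            ip->nlinks += nlinks_adj;
                            ip->dirty = true;   /* must sync: still open */
                            /* real code also moves it to a hidden dir */
                    } else {
                            ip->deleted = true; /* no inode write needed */
                    }
            } else {
                    ip->nlinks += nlinks_adj;
                    ip->dirty = true;           /* links remain, sync */
            }
    }

    int
    main(void)
    {
            struct inode ip = { .nlinks = 1 };

            unlink_adjust_nlinks(&ip, -1, false);
            printf("dirty=%d deleted=%d\n", ip.dirty, ip.deleted); /* 0 1 */
            return 0;
    }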

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
index 7c2ad88..93e30ff 100644
@@ -242,12 +242,13 @@ TAILQ_HEAD(h2_iocb_list, hammer2_iocb);
 #define CHAIN_CORE_DELETE_BMAP_ENTRIES \
        (HAMMER2_PBUFSIZE / sizeof(hammer2_blockref_t) / sizeof(uint32_t))
 
+/*
+ * Core topology for chain (embedded in chain).  Protected by a spinlock.
+ */
 struct hammer2_chain_core {
-       hammer2_mtx_t   lock;
        hammer2_spin_t  spin;
        struct hammer2_chain_tree rbtree; /* sub-chains */
        int             live_zero;      /* blockref array opt */
-       u_int           flags;
        u_int           live_count;     /* live (not deleted) chains in tree */
        u_int           chain_count;    /* live + deleted chains under core */
        int             generation;     /* generation number (inserts only) */
@@ -255,9 +256,6 @@ struct hammer2_chain_core {
 
 typedef struct hammer2_chain_core hammer2_chain_core_t;
 
-#define HAMMER2_CORE_UNUSED0001                0x0001
-#define HAMMER2_CORE_COUNTEDBREFS      0x0002
-
 RB_HEAD(hammer2_io_tree, hammer2_io);
 
 /*
@@ -319,6 +317,7 @@ typedef struct hammer2_io hammer2_io_t;
  * Primary chain structure keeps track of the topology in-memory.
  */
 struct hammer2_chain {
+       hammer2_mtx_t           lock;
        hammer2_chain_core_t    core;
        RB_ENTRY(hammer2_chain) rbnode;         /* live chain(s) */
        hammer2_blockref_t      bref;
@@ -381,15 +380,15 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_FICTITIOUS       0x00000400      /* unsuitable for I/O */
 #define HAMMER2_CHAIN_VOLUMESYNC       0x00000800      /* needs volume sync */
 #define HAMMER2_CHAIN_DELAYED          0x00001000      /* delayed flush */
-#define HAMMER2_CHAIN_UNUSED00002000   0x00002000
+#define HAMMER2_CHAIN_COUNTEDBREFS     0x00002000      /* block table stats */
 #define HAMMER2_CHAIN_ONRBTREE         0x00004000      /* on parent RB tree */
 #define HAMMER2_CHAIN_UNUSED00008000   0x00008000
 #define HAMMER2_CHAIN_EMBEDDED         0x00010000      /* embedded data */
 #define HAMMER2_CHAIN_RELEASE          0x00020000      /* don't keep around */
 #define HAMMER2_CHAIN_BMAPPED          0x00040000      /* present in blkmap */
 #define HAMMER2_CHAIN_BMAPUPD          0x00080000      /* +needs updating */
-#define HAMMER2_CHAIN_UNUSED00100000   0x00100000
-#define HAMMER2_CHAIN_UNUSED00200000   0x00200000
+#define HAMMER2_CHAIN_IOINPROG         0x00100000      /* I/O interlock */
+#define HAMMER2_CHAIN_IOSIGNAL         0x00200000      /* I/O interlock */
 #define HAMMER2_CHAIN_PFSBOUNDARY      0x00400000      /* super->pfs inode */
 
 #define HAMMER2_CHAIN_FLUSH_MASK       (HAMMER2_CHAIN_MODIFIED |       \
@@ -682,8 +681,8 @@ struct hammer2_inode {
        u_int                   flags;
        u_int                   refs;           /* +vpref, +flushref */
        uint8_t                 comp_heuristic;
-       hammer2_off_t           size;
-       uint64_t                mtime;
+       hammer2_off_t           size;           /* cached file size */
+       uint64_t                mtime;          /* cached mtime */
 };
 
 typedef struct hammer2_inode hammer2_inode_t;
@@ -1144,6 +1143,7 @@ void hammer2_chain_core_init(hammer2_chain_t *chain);
 void hammer2_chain_ref(hammer2_chain_t *chain);
 void hammer2_chain_drop(hammer2_chain_t *chain);
 void hammer2_chain_lock(hammer2_chain_t *chain, int how);
+void hammer2_chain_load_data(hammer2_chain_t *chain);
 const hammer2_media_data_t *hammer2_chain_rdata(hammer2_chain_t *chain);
 hammer2_media_data_t *hammer2_chain_wdata(hammer2_chain_t *chain);
 
diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c
index 97c811e..8d16305 100644
@@ -231,14 +231,12 @@ hammer2_chain_alloc(hammer2_dev_t *hmp, hammer2_pfs_t *pmp,
 void
 hammer2_chain_core_init(hammer2_chain_t *chain)
 {
-       hammer2_chain_core_t *core = &chain->core;
-
        /*
         * Fresh core under nchain (no multi-homing of ochain's
         * sub-tree).
         */
-       RB_INIT(&core->rbtree); /* live chains */
-       hammer2_mtx_init(&core->lock, "h2chain");
+       RB_INIT(&chain->core.rbtree);   /* live chains */
+       hammer2_mtx_init(&chain->lock, "h2chain");
 }
 
 /*
@@ -554,35 +552,30 @@ hammer2_chain_drop_data(hammer2_chain_t *chain, int lastdrop)
  *      a logical file buffer.  However, if ALWAYS is specified the
  *      device buffer will be instantiated anyway.
  *
- * WARNING! If data must be fetched a shared lock will temporarily be
- *         upgraded to exclusive.  However, a deadlock can occur if
- *         the caller owns more than one shared lock.
+ * WARNING! This function blocks on I/O if data needs to be fetched.  This
+ *         blocking can run concurrently with other compatible lock holders
+ *         that do not need the data returned.  The lock is not upgraded to
+ *         exclusive during a data fetch; a separate bit is used to
+ *         interlock the I/O.  However, an exclusive lock holder can still
+ *         count on being interlocked against an I/O fetch managed by a
+ *         shared lock holder.
  */
 void
 hammer2_chain_lock(hammer2_chain_t *chain, int how)
 {
-       hammer2_dev_t *hmp;
-       hammer2_blockref_t *bref;
-       hammer2_mtx_state_t ostate;
-       char *bdata;
-       int error;
-
        /*
         * Ref and lock the element.  Recursive locks are allowed.
         */
        KKASSERT(chain->refs > 0);
        atomic_add_int(&chain->lockcnt, 1);
 
-       hmp = chain->hmp;
-       KKASSERT(hmp != NULL);
-
        /*
         * Get the appropriate lock.
         */
        if (how & HAMMER2_RESOLVE_SHARED)
-               hammer2_mtx_sh(&chain->core.lock);
+               hammer2_mtx_sh(&chain->lock);
        else
-               hammer2_mtx_ex(&chain->core.lock);
+               hammer2_mtx_ex(&chain->lock);
 
        /*
         * If we already have a valid data pointer no further action is
@@ -614,16 +607,69 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
        }
 
        /*
-        * Upgrade to an exclusive lock so we can safely manipulate the
-        * buffer cache.  If another thread got to it before us we
-        * can just return.
+        * Caller requires data
+        */
+       hammer2_chain_load_data(chain);
+}
+
+/*
+ * Issue I/O and install chain->data.  Caller must hold a chain lock, lock
+ * may be of any type.
+ *
+ * Once chain->data is set it cannot be disposed of until all locks are
+ * released.
+ */
+void
+hammer2_chain_load_data(hammer2_chain_t *chain)
+{
+       hammer2_blockref_t *bref;
+       hammer2_dev_t *hmp;
+       char *bdata;
+       int error;
+
+       /*
+        * Degenerate case, data already present.
         */
-       ostate = hammer2_mtx_upgrade(&chain->core.lock);
-       if (chain->data) {
-               hammer2_mtx_downgrade(&chain->core.lock, ostate);
+       if (chain->data)
                return;
+
+       hmp = chain->hmp;
+       KKASSERT(hmp != NULL);
+
+       /*
+        * Gain the IOINPROG bit, interlocked block.
+        */
+       for (;;) {
+               u_int oflags;
+               u_int nflags;
+
+               oflags = chain->flags;
+               cpu_ccfence();
+               if (oflags & HAMMER2_CHAIN_IOINPROG) {
+                       nflags = oflags | HAMMER2_CHAIN_IOSIGNAL;
+                       tsleep_interlock(&chain->flags, 0);
+                       if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
+                               tsleep(&chain->flags, PINTERLOCKED,
+                                       "h2iocw", 0);
+                       }
+                       /* retry */
+               } else {
+                       nflags = oflags | HAMMER2_CHAIN_IOINPROG;
+                       if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
+                               break;
+                       }
+                       /* retry */
+               }
        }
 
+       /*
+        * We own CHAIN_IOINPROG
+        *
+        * Degenerate case if we raced another load.
+        */
+       if (chain->data)
+               goto done;
+
        /*
         * We must resolve to a device buffer, either by issuing I/O or
         * by creating a zero-fill element.  We do not mark the buffer
@@ -656,8 +702,7 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
                kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
                        (intmax_t)bref->data_off, error);
                hammer2_io_bqrelse(&chain->dio);
-               hammer2_mtx_downgrade(&chain->core.lock, ostate);
-               return;
+               goto done;
        }
        chain->error = 0;
 
@@ -702,6 +747,9 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
         * The buffer is not retained when copying to an embedded data
         * structure in order to avoid potential deadlocks or recursions
         * on the same physical buffer.
+        *
+        * WARNING! Other threads can start using the data the instant we
+        *          set chain->data non-NULL.
         */
        switch (bref->type) {
        case HAMMER2_BREF_TYPE_VOLUME:
@@ -723,7 +771,25 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
                chain->data = (void *)bdata;
                break;
        }
-       hammer2_mtx_downgrade(&chain->core.lock, ostate);
+
+       /*
+        * Release HAMMER2_CHAIN_IOINPROG and signal waiters if requested.
+        */
+done:
+       for (;;) {
+               u_int oflags;
+               u_int nflags;
+
+               oflags = chain->flags;
+               nflags = oflags & ~(HAMMER2_CHAIN_IOINPROG |
+                                   HAMMER2_CHAIN_IOSIGNAL);
+               KKASSERT(oflags & HAMMER2_CHAIN_IOINPROG);
+               if (atomic_cmpset_int(&chain->flags, oflags, nflags)) {
+                       if (oflags & HAMMER2_CHAIN_IOSIGNAL)
+                               wakeup(&chain->flags);
+                       break;
+               }
+       }
 }
 
 /*
@@ -752,7 +818,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
                if (lockcnt > 1) {
                        if (atomic_cmpset_int(&chain->lockcnt,
                                              lockcnt, lockcnt - 1)) {
-                               hammer2_mtx_unlock(&chain->core.lock);
+                               hammer2_mtx_unlock(&chain->lock);
                                return;
                        }
                } else {
@@ -774,9 +840,9 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
         * all that will happen is that the chain will be reloaded after we
         * unload it.
         */
-       ostate = hammer2_mtx_upgrade(&chain->core.lock);
+       ostate = hammer2_mtx_upgrade(&chain->lock);
        if (chain->lockcnt) {
-               hammer2_mtx_unlock(&chain->core.lock);
+               hammer2_mtx_unlock(&chain->lock);
                return;
        }
 
@@ -789,7 +855,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
        if (chain->dio == NULL) {
                if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
                        hammer2_chain_drop_data(chain, 0);
-               hammer2_mtx_unlock(&chain->core.lock);
+               hammer2_mtx_unlock(&chain->lock);
                return;
        }
 
@@ -863,7 +929,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
        } else {
                hammer2_io_bqrelse(&chain->dio);
        }
-       hammer2_mtx_unlock(&chain->core.lock);
+       hammer2_mtx_unlock(&chain->lock);
 }
 
 /*
@@ -885,7 +951,7 @@ hammer2_chain_countbrefs(hammer2_chain_t *chain,
                         hammer2_blockref_t *base, int count)
 {
        hammer2_spin_ex(&chain->core.spin);
-        if ((chain->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0) {
+        if ((chain->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0) {
                if (base) {
                        while (--count >= 0) {
                                if (base[count].type)
@@ -902,7 +968,7 @@ hammer2_chain_countbrefs(hammer2_chain_t *chain,
                        chain->core.live_zero = 0;
                }
                /* else do not modify live_count */
-               atomic_set_int(&chain->core.flags, HAMMER2_CORE_COUNTEDBREFS);
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_COUNTEDBREFS);
        }
        hammer2_spin_unex(&chain->core.spin);
 }
@@ -1682,7 +1748,7 @@ again:
         * We need to hold the spinlock to access the block array and RB tree
         * and to interlock chain creation.
         */
-       if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
+       if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(parent, base, count);
 
        /*
@@ -2022,7 +2088,7 @@ again:
         * We need to hold the spinlock to access the block array and RB tree
         * and to interlock chain creation.
         */
-       if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
+       if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(parent, base, count);
 
        next_key = 0;
@@ -2156,7 +2222,7 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
         * Topology may be crossing a PFS boundary.
         */
        parent = *parentp;
-       KKASSERT(hammer2_mtx_owned(&parent->core.lock));
+       KKASSERT(hammer2_mtx_owned(&parent->lock));
        KKASSERT(parent->error == 0);
        hmp = parent->hmp;
        chain = *chainp;
@@ -2182,7 +2248,7 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                 * to 1 by chain_alloc() for us, but lockcnt is not).
                 */
                chain->lockcnt = 1;
-               hammer2_mtx_ex(&chain->core.lock);
+               hammer2_mtx_ex(&chain->lock);
                allocated = 1;
 
                /*
@@ -2301,7 +2367,7 @@ again:
        /*
         * Make sure we've counted the brefs
         */
-       if ((parent->core.flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
+       if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(parent, base, count);
 
        KKASSERT(parent->core.live_count >= 0 &&
@@ -2487,7 +2553,7 @@ hammer2_chain_rename(hammer2_trans_t *trans, hammer2_blockref_t *bref,
         * Having both chains locked is extremely important for atomicy.
         */
        if (parentp && (parent = *parentp) != NULL) {
-               KKASSERT(hammer2_mtx_owned(&parent->core.lock));
+               KKASSERT(hammer2_mtx_owned(&parent->lock));
                KKASSERT(parent->refs > 0);
                KKASSERT(parent->error == 0);
 
@@ -2734,7 +2800,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         */
        hmp = parent->hmp;
        *errorp = 0;
-       KKASSERT(hammer2_mtx_owned(&parent->core.lock));
+       KKASSERT(hammer2_mtx_owned(&parent->lock));
 
        /*hammer2_chain_modify(trans, &parent, HAMMER2_MODIFY_OPTDATA);*/
        if (parent->flags & HAMMER2_CHAIN_INITIAL) {
@@ -3328,7 +3394,7 @@ void
 hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
                     hammer2_chain_t *chain, int flags)
 {
-       KKASSERT(hammer2_mtx_owned(&chain->core.lock));
+       KKASSERT(hammer2_mtx_owned(&chain->lock));
 
        /*
         * Nothing to do if already marked.
@@ -3388,7 +3454,7 @@ hammer2_base_find(hammer2_chain_t *parent,
         * Require the live chain's already have their core's counted
         * so we can optimize operations.
         */
-        KKASSERT(parent->core.flags & HAMMER2_CORE_COUNTEDBREFS);
+        KKASSERT(parent->flags & HAMMER2_CHAIN_COUNTEDBREFS);
 
        /*
         * Degenerate case
diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c
index abb8fd1..4dd6958 100644
@@ -1273,6 +1273,7 @@ hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
        hammer2_key_t key_dummy;
        hammer2_key_t key_next;
        hammer2_key_t lhc;
+       int last_link;
        int error;
        int hlink;
        uint8_t type;
@@ -1418,7 +1419,9 @@ again:
         * or directory in (cparent, cluster).
         *
         * Delete the target when nlinks reaches 0 with special handling
-        * if (isopen) is set.
+        * that avoids I/O (i.e. avoids actually updating the inode) on the
+        * 1->0 transition when possible.  This optimization makes rm -rf
+        * very fast.
         *
         * NOTE! In DragonFly the vnops function calls cache_unlink() after
         *       calling us here to clean out the namecache association,
@@ -1431,16 +1434,15 @@ again:
         *       will bump nlinks.
         */
        KKASSERT(cluster != NULL);
-       hammer2_cluster_modify(trans, cluster, 0);
-       wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
-       ripdata = wipdata;
-       wipdata->nlinks += nlinks;
-       if ((int64_t)wipdata->nlinks < 0) {     /* XXX debugging */
-               wipdata->nlinks = 0;
-       }
-       hammer2_cluster_modsync(cluster);
 
-       if (wipdata->nlinks == 0) {
+       /*
+        * Note: nlinks is negative when decrementing, positive when
+        *       incrementing.
+        */
+       ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
+       last_link = (ripdata->nlinks + nlinks == 0);
+
+       if (last_link) {
                /*
                 * Target nlinks has reached 0, file now unlinked (but may
                 * still be open).
@@ -1457,6 +1459,19 @@ again:
                */
                hammer2_cluster_set_chainflags(cluster, HAMMER2_CHAIN_UNLINKED);
                if (nch && cache_isopen(nch)) {
+                       /*
+                        * If an unlinked file is still open we must update
+                        * the inode's link count.
+                        */
+                       hammer2_cluster_modify(trans, cluster, 0);
+                       wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
+                       ripdata = wipdata;
+                       wipdata->nlinks += nlinks;
+                       /* XXX debugging */
+                       if ((int64_t)wipdata->nlinks < 0) {
+                               wipdata->nlinks = 0;
+                       }
+                       hammer2_cluster_modsync(cluster);
                        hammer2_inode_move_to_hidden(trans, &cparent, &cluster,
                                                     wipdata->inum);
                } else {
@@ -1476,13 +1491,26 @@ again:
                 * a nlinks adjustment of 0, and only wishes to remove file
                 * in order to be able to reconnect it under a different name.
                 *
-                * In this situation we do a non-permanent deletion of the
+                * In this situation we do a temporary deletion of the
                 * chain in order to allow the file to be reconnected in
                 * a different location.
                 */
                KKASSERT(nlinks == 0);
                hammer2_cluster_delete(trans, cparent, cluster, 0);
+       } else {
+               /*
+                * Links remain, must update the inode link count.
+                */
+               hammer2_cluster_modify(trans, cluster, 0);
+               wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
+               ripdata = wipdata;
+               wipdata->nlinks += nlinks;
+               if ((int64_t)wipdata->nlinks < 0) {     /* XXX debugging */
+                       wipdata->nlinks = 0;
+               }
+               hammer2_cluster_modsync(cluster);
        }
+
        error = 0;
 done:
        if (cparent) {
diff --git a/sys/vfs/hammer2/hammer2_subr.c b/sys/vfs/hammer2/hammer2_subr.c
index f7a48b9..212145c 100644
 void
 hammer2_dev_exlock(hammer2_dev_t *hmp)
 {
-       hammer2_mtx_ex(&hmp->vchain.core.lock);
+       hammer2_mtx_ex(&hmp->vchain.lock);
 }
 
 void
 hammer2_dev_shlock(hammer2_dev_t *hmp)
 {
-       hammer2_mtx_sh(&hmp->vchain.core.lock);
+       hammer2_mtx_sh(&hmp->vchain.lock);
 }
 
 void
 hammer2_dev_unlock(hammer2_dev_t *hmp)
 {
-       hammer2_mtx_unlock(&hmp->vchain.core.lock);
+       hammer2_mtx_unlock(&hmp->vchain.lock);
 }
 
 /*
diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c
index 5bff3aa..f2e6c35 100644
@@ -3064,17 +3064,13 @@ hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
                chain->bref.key, chain->bref.keybits,
                chain->bref.mirror_tid);
 
-       kprintf("%*.*s      [%08x] (%s) refs=%d\n",
+       kprintf("%*.*s      [%08x] (%s) refs=%d",
                tab, tab, "",
                chain->flags,
                ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
                chain->refs);
 
-       kprintf("%*.*s      core [%08x]",
-               tab, tab, "",
-               chain->core.flags);
-
        parent = chain->parent;
        if (parent)
                kprintf("\n%*.*s      p=%p [pflags %08x prefs %d",
diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c
index 51bbcda..662b120 100644
@@ -2437,8 +2437,13 @@ hammer2_run_unlinkq(hammer2_trans_t *trans, hammer2_pfs_t *pmp)
                        kprintf("hammer2: unlink on reclaim: %s refs=%d\n",
                                ripdata->filename, ip->refs);
                }
-               KKASSERT(ripdata->nlinks == 0);
 
+               /*
+                * NOTE: Due to optimizations to avoid I/O on the inode for
+                *       the last unlink, ripdata->nlinks is not necessarily
+                *       0 here.
+                */
+               /* KKASSERT(ripdata->nlinks == 0); (see NOTE) */
                cparent = hammer2_cluster_parent(cluster);
                hammer2_cluster_delete(trans, cparent, cluster,
                                       HAMMER2_DELETE_PERMANENT);