HAMMER VFS - Add code to reduce frontend vs flusher locking conflicts
author     Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 22 Jul 2011 15:55:06 +0000 (08:55 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 22 Jul 2011 15:55:06 +0000 (08:55 -0700)
* Implement a pulse-width-modulated time-domain multiplexer on B-Tree
  cursor operations which separates frontend operations from backend
  flusher operations.

  The larger the number of inodes undergoing reclamation, the greater the
  pulse width given over to locking operations initiated by the flusher.
  Frontend operations (typically reads) are given smaller and smaller
  slot widths as the flusher gets more overloaded.

  The advantage of this mechanism is that we are not simply imposing a
  delay on the reader; we are imposing a variable-length time slot during
  which the reader is able to grab a B-Tree cursor.  This allows linear
  and partially cached operations to 'burst' many operations within the
  slot, letting them run at nearly full speed without imposing an
  artificial performance limit on linear I/O (see the sketch following
  these notes).

* This solves a major deadlock/stall issue that prevented the flusher
  from flushing sufficient I/O to keep up with the inode backlog that
  e.g. blogbench creates, causing excessive locking stalls throughout
  the HAMMER filesystem, particularly when many file inodes are being
  cycled through.

* Prior to this change, running blogbench --iterations=150 would drop
  write performance to unacceptable levels, and read operations
  (particularly operations which cycle through inodes, such as 'find'
  and 'ls') would stall for unacceptably long periods, often resulting
  in diagnostic cache_lock messages on the console showing namecache
  blockages in excess of 100 seconds.

  With the change, namecache blockages are significantly reduced in both
  frequency and duration, 'find' and 'ls' operations are able to run
  concurrently with a heavy multi-file write load, and blogbench shows
  improved write performance while still giving reasonable priority to
  read operations (which is what we want).

* Rename a few of the sysctls to normalize internal vs external variable
  names.
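
  A small stand-alone sketch of the slot arithmetic described in the notes
  above.  It mirrors the demarcation computed by the new code in
  hammer_init_cursor() (see the hammer_cursor.c hunks below); the hz and
  reclaim-limit values used here are arbitrary stand-ins for illustration
  and are not taken from the commit.

	#include <stdio.h>

	int
	main(void)
	{
		unsigned int hz = 100;		/* ticks per second (stand-in) */
		unsigned int tticks = hz / 5;	/* hammer_tdmux_ticks default */
		unsigned int limit = 10000;	/* stand-in for hammer_limit_reclaims */
		unsigned int reclaims;

		for (reclaims = 0; reclaims <= limit; reclaims += limit / 4) {
			/*
			 * xticks is the demarcation within each tticks-long
			 * period: ticks [0, xticks) are reserved for the
			 * flusher, ticks [xticks, tticks) are available to
			 * frontend cursor operations.
			 */
			unsigned int xticks = reclaims * tticks / limit;

			printf("reclaims %5u/%u: flusher %2u ticks, "
			       "frontend %2u ticks\n",
			       reclaims, limit, xticks, tticks - xticks);
		}
		return(0);
	}

  In the kernel code a frontend cursor whose position within the period,
  (u_int)ticks % tticks, falls below xticks simply tsleep()s until the
  demarcation passes; cursors that land in the open part of the period
  proceed immediately, which is what allows bursts of operations to run
  at nearly full speed within the slot.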

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_vfsops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 07a066e..ed43d8a 100644
@@ -370,6 +370,7 @@ struct hammer_inode {
        int                     flags;
        int                     error;          /* flush error */
        int                     cursor_ip_refs; /* sanity */
+       int                     cursor_exclreq_count;
        int                     rsv_recs;
        struct vnode            *vp;
        hammer_pseudofs_inmem_t pfsm;
@@ -452,6 +453,7 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_SDIRTY    0x01000000 /* in-memory ino_data.size is dirty*/
 #define HAMMER_INODE_REDO      0x02000000 /* REDO logging active */
 #define HAMMER_INODE_RDIRTY    0x04000000 /* REDO records active in fifo */
+#define HAMMER_INODE_SLAVEFLUSH        0x08000000 /* being flushed by slave */
 
 #define HAMMER_INODE_MODMASK   (HAMMER_INODE_DDIRTY|HAMMER_INODE_SDIRTY|   \
                                 HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS|     \
@@ -736,6 +738,7 @@ struct hammer_node {
        TAILQ_HEAD(, hammer_cursor) cursor_list;  /* deadlock recovery */
        struct hammer_node_cache_list cache_list; /* passive caches */
        int                     flags;
+       int                     cursor_exclreq_count;
 };
 
 #define HAMMER_NODE_DELETED    0x0001
@@ -894,9 +897,9 @@ struct hammer_mount {
 
        int     volume_to_remove; /* volume that is currently being removed */
 
-       int     inode_reclaims; /* inodes pending reclaim by flusher */
        int     count_inodes;   /* total number of inodes */
        int     count_iqueued;  /* inodes queued to flusher */
+       int     count_reclaims; /* inodes pending reclaim by flusher */
 
        struct hammer_flusher flusher;
 
@@ -1005,10 +1008,11 @@ extern int hammer_debug_recover_faults;
 extern int hammer_debug_critical;
 extern int hammer_cluster_enable;
 extern int hammer_live_dedup;
+extern int hammer_tdmux_ticks;
 extern int hammer_count_fsyncs;
 extern int hammer_count_inodes;
 extern int hammer_count_iqueued;
-extern int hammer_count_reclaiming;
+extern int hammer_count_reclaims;
 extern int hammer_count_records;
 extern int hammer_count_record_datas;
 extern int hammer_count_volumes;
@@ -1044,7 +1048,7 @@ extern int hammer_limit_dirtybufspace;
 extern int hammer_limit_running_io;
 extern int hammer_limit_recs;
 extern int hammer_limit_inode_recs;
-extern int hammer_limit_reclaim;
+extern int hammer_limit_reclaims;
 extern int hammer_live_dedup_cache_size;
 extern int hammer_limit_redo;
 extern int hammer_bio_count;
@@ -1487,6 +1491,7 @@ void hammer_flusher_destroy(hammer_mount_t hmp);
 void hammer_flusher_sync(hammer_mount_t hmp);
 int  hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t flg);
 int  hammer_flusher_async_one(hammer_mount_t hmp);
+int hammer_flusher_running(hammer_mount_t hmp);
 void hammer_flusher_wait(hammer_mount_t hmp, int seq);
 void hammer_flusher_wait_next(hammer_mount_t hmp);
 int  hammer_flusher_meta_limit(hammer_mount_t hmp);
diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c
index a88f5e1..42b9200 100644
@@ -51,22 +51,84 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
 {
        hammer_volume_t volume;
        hammer_node_t node;
+       hammer_mount_t hmp;
+       u_int tticks;
        int error;
 
        bzero(cursor, sizeof(*cursor));
 
        cursor->trans = trans;
+       hmp = trans->hmp;
+
+       /*
+        * As the number of inodes queued to the flusher increases we use
+        * time-domain multiplexing to control read vs flush performance.
+        * We have to do it here, before acquiring any ip or node locks,
+        * to avoid deadlocking or excessively delaying the flusher.
+        *
+        * The full time period is hammer_tdmux_ticks, typically 1/5 of
+        * a second.
+        *
+        * inode allocation begins to get restrained at 2/4 the limit
+        * via the "hmrrcm" mechanism in hammer_inode.  We want to begin
+        * limiting read activity before that to try to avoid processes
+        * stalling out in "hmrrcm".
+        */
+       tticks = hammer_tdmux_ticks;
+       if (trans->type != HAMMER_TRANS_FLS && tticks &&
+           hmp->count_reclaims > hammer_limit_reclaims / tticks &&
+           hmp->count_reclaims > hammer_autoflush * 2 &&
+           hammer_flusher_running(hmp)) {
+               u_int rticks;
+               u_int xticks;
+               u_int dummy;
+
+               /*
+                * 0 ... xticks ... tticks
+                *
+                * rticks is the calculated position, xticks is the demarc
+                * where values below xticks are reserved for the flusher
+                * and values >= to xticks may be used by the frontend.
+                *
+                * At least one tick is always made available for the
+                * frontend.
+                */
+               rticks = (u_int)ticks % tticks;
+               xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims;
+
+               /*
+                * Ensure rticks and xticks are stable
+                */
+               cpu_ccfence();
+               if (rticks < xticks) {
+                       if (hammer_debug_general & 0x0004)
+                               kprintf("rt %3u, xt %3u, tt %3u\n",
+                                       rticks, xticks, tticks);
+                       tsleep(&dummy, 0, "htdmux", xticks - rticks);
+               }
+       }
 
        /*
         * If the cursor operation is on behalf of an inode, lock
         * the inode.
+        *
+        * When acquiring a shared lock on an inode on which the backend
+        * flusher deadlocked, wait up to hammer_tdmux_ticks (1 second)
+        * for the deadlock to clear.
         */
        if ((cursor->ip = ip) != NULL) {
                ++ip->cursor_ip_refs;
-               if (trans->type == HAMMER_TRANS_FLS)
+               if (trans->type == HAMMER_TRANS_FLS) {
                        hammer_lock_ex(&ip->lock);
-               else
+               } else {
+#if 0
+                       if (ip->cursor_exclreq_count) {
+                               tsleep(&ip->cursor_exclreq_count, 0,
+                                      "hstag1", hammer_tdmux_ticks);
+                       }
+#endif
                        hammer_lock_sh(&ip->lock);
+               }
        }
 
        /*
@@ -94,7 +156,7 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
         * the one from the root of the filesystem.
         */
        while (node == NULL) {
-               volume = hammer_get_root_volume(trans->hmp, &error);
+               volume = hammer_get_root_volume(hmp, &error);
                if (error)
                        break;
                node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
@@ -102,6 +164,18 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
                hammer_rel_volume(volume, 0);
                if (error)
                        break;
+               /*
+                * When the frontend acquires the root b-tree node while the
+                * backend is deadlocked on it, wait up to hammer_tdmux_ticks
+                * (1 second) for the deadlock to clear.
+                */
+#if 0
+               if (node->cursor_exclreq_count &&
+                   cursor->trans->type != HAMMER_TRANS_FLS) {
+                       tsleep(&node->cursor_exclreq_count, 0,
+                              "hstag3", hammer_tdmux_ticks);
+               }
+#endif
                hammer_lock_sh(&node->lock);
 
                /*
@@ -184,10 +258,28 @@ hammer_done_cursor(hammer_cursor_t cursor)
        /*
         * If we deadlocked this node will be referenced.  Do a quick
         * lock/unlock to wait for the deadlock condition to clear.
+        *
+        * Maintain exclreq_count / wakeup as necessary to notify new
+        * entrants into ip.  We continue to hold the fs_token so our
+        * EDEADLK retry loop should get its chance before another thread
+        * steals the lock.
         */
        if (cursor->deadlk_node) {
+#if 0
+               if (ip && cursor->trans->type == HAMMER_TRANS_FLS)
+                       ++ip->cursor_exclreq_count;
+               ++cursor->deadlk_node->cursor_exclreq_count;
+#endif
                hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
                hammer_unlock(&cursor->deadlk_node->lock);
+#if 0
+               if (--cursor->deadlk_node->cursor_exclreq_count == 0)
+                       wakeup(&cursor->deadlk_node->cursor_exclreq_count);
+               if (ip && cursor->trans->type == HAMMER_TRANS_FLS) {
+                       if (--ip->cursor_exclreq_count == 0)
+                               wakeup(&ip->cursor_exclreq_count);
+               }
+#endif
                hammer_rel_node(cursor->deadlk_node);
                cursor->deadlk_node = NULL;
        }
@@ -211,6 +303,9 @@ hammer_done_cursor(hammer_cursor_t cursor)
  * The lock must already be either held shared or already held exclusively
  * by us.
  *
+ * We upgrade the parent first as it is the most likely to collide first
+ * with the downward traversal that the frontend typically does.
+ *
  * If we fail to upgrade the lock and cursor->deadlk_node is NULL, 
  * we add another reference to the node that failed and set
  * cursor->deadlk_node so hammer_done_cursor() can block on it.
@@ -220,6 +315,23 @@ hammer_cursor_upgrade(hammer_cursor_t cursor)
 {
        int error;
 
+       if (cursor->parent) {
+               error = hammer_lock_upgrade(&cursor->parent->lock, 1);
+               if (error && cursor->deadlk_node == NULL) {
+                       cursor->deadlk_node = cursor->parent;
+                       hammer_ref_node(cursor->deadlk_node);
+               }
+       } else {
+               error = 0;
+       }
+       if (error == 0) {
+               error = hammer_lock_upgrade(&cursor->node->lock, 1);
+               if (error && cursor->deadlk_node == NULL) {
+                       cursor->deadlk_node = cursor->node;
+                       hammer_ref_node(cursor->deadlk_node);
+               }
+       }
+#if 0
        error = hammer_lock_upgrade(&cursor->node->lock, 1);
        if (error && cursor->deadlk_node == NULL) {
                cursor->deadlk_node = cursor->node;
@@ -231,6 +343,7 @@ hammer_cursor_upgrade(hammer_cursor_t cursor)
                        hammer_ref_node(cursor->deadlk_node);
                }
        }
+#endif
        return(error);
 }
 
@@ -555,7 +668,27 @@ hammer_cursor_down(hammer_cursor_t cursor)
                      (elm->base.btype ? elm->base.btype : '?'));
                break;
        }
+
+       /*
+        * If no error occurred we can lock the new child node.  If the
+        * node is deadlock flagged wait up to hammer_tdmux_ticks (1 second)
+        * for the deadlock to clear.  Otherwise a large number of concurrent
+        * readers can continuously stall the flusher.
+        *
+        * We specifically do this in the cursor_down() code in order to
+        * deal with frontend top-down searches smashing against bottom-up
+        * flusher-based mirror updates.  These collisions typically occur
+        * above the inode in the B-Tree and are not covered by the
+        * ip->cursor_exclreq_count logic.
+        */
        if (error == 0) {
+#if 0
+               if (node->cursor_exclreq_count &&
+                   cursor->trans->type != HAMMER_TRANS_FLS) {
+                       tsleep(&node->cursor_exclreq_count, 0,
+                              "hstag2", hammer_tdmux_ticks);
+               }
+#endif
                hammer_lock_sh(&node->lock);
                KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
                cursor->node = node;
@@ -669,17 +802,40 @@ hammer_lock_cursor(hammer_cursor_t cursor)
 int
 hammer_recover_cursor(hammer_cursor_t cursor)
 {
+       hammer_transaction_t trans;
+       hammer_inode_t ip;
        int error;
 
        hammer_unlock_cursor(cursor);
-       KKASSERT(cursor->trans->sync_lock_refs > 0);
+
+       ip = cursor->ip;
+       trans = cursor->trans;
+       KKASSERT(trans->sync_lock_refs > 0);
 
        /*
-        * Wait for the deadlock to clear
+        * Wait for the deadlock to clear.
+        *
+        * Maintain exclreq_count / wakeup as necessary to notify new
+        * entrants into ip.  We continue to hold the fs_token so our
+        * EDEADLK retry loop should get its chance before another thread
+        * steals the lock.
         */
        if (cursor->deadlk_node) {
+#if 0
+               if (ip && trans->type == HAMMER_TRANS_FLS)
+                       ++ip->cursor_exclreq_count;
+               ++cursor->deadlk_node->cursor_exclreq_count;
+#endif
                hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
                hammer_unlock(&cursor->deadlk_node->lock);
+#if 0
+               if (--cursor->deadlk_node->cursor_exclreq_count == 0)
+                       wakeup(&cursor->deadlk_node->cursor_exclreq_count);
+               if (ip && trans->type == HAMMER_TRANS_FLS) {
+                       if (--ip->cursor_exclreq_count == 0)
+                               wakeup(&ip->cursor_exclreq_count);
+               }
+#endif
                hammer_rel_node(cursor->deadlk_node);
                cursor->deadlk_node = NULL;
        }
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index 6e5da43..b2f4cdb 100644
 static void hammer_flusher_master_thread(void *arg);
 static void hammer_flusher_slave_thread(void *arg);
 static int hammer_flusher_flush(hammer_mount_t hmp, int *nomorep);
-static void hammer_flusher_flush_inode(hammer_inode_t ip,
-                                       hammer_transaction_t trans);
+static int hammer_flusher_flush_inode(hammer_inode_t ip, void *data);
 
 RB_GENERATE(hammer_fls_rb_tree, hammer_inode, rb_flsnode,
               hammer_ino_rb_compare);
 
 /*
- * Inodes are sorted and assigned to slave threads in groups of 128.
- * We want a flush group size large enough such that the slave threads
- * are not likely to interfere with each other when accessing the B-Tree,
- * but not so large that we lose concurrency.
- */
-#define HAMMER_FLUSH_GROUP_SIZE 128
-
-/*
  * Support structures for the flusher threads.
  */
 struct hammer_flusher_info {
@@ -69,7 +60,7 @@ struct hammer_flusher_info {
        int             runstate;
        int             count;
        hammer_flush_group_t flg;
-       hammer_inode_t  work_array[HAMMER_FLUSH_GROUP_SIZE];
+       struct hammer_transaction trans;        /* per-slave transaction */
 };
 
 typedef struct hammer_flusher_info *hammer_flusher_info_t;
@@ -179,6 +170,20 @@ hammer_flusher_wait(hammer_mount_t hmp, int seq)
                tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
 }
 
+/*
+ * Returns non-zero if the flusher is currently running.  Used for
+ * time-domain multiplexing of frontend operations in order to avoid
+ * starving the backend flusher.
+ */
+int
+hammer_flusher_running(hammer_mount_t hmp)
+{
+       int seq = hmp->flusher.next - 1;
+       if ((int)(seq - hmp->flusher.done) > 0)
+               return(1);
+       return (0);
+}
+
 void
 hammer_flusher_wait_next(hammer_mount_t hmp)
 {
@@ -308,9 +313,6 @@ hammer_flusher_flush(hammer_mount_t hmp, int *nomorep)
        hammer_flusher_info_t info;
        hammer_flush_group_t flg;
        hammer_reserve_t resv;
-       hammer_inode_t ip;
-       hammer_inode_t next_ip;
-       int slave_index;
        int count;
        int seq;
 
@@ -382,56 +384,18 @@ hammer_flusher_flush(hammer_mount_t hmp, int *nomorep)
                KKASSERT(hmp->next_flush_group != flg);
 
                /*
-                * Iterate the inodes in the flg's flush_tree and assign
-                * them to slaves.
+                * Place the flg in the flusher structure and start the
+                * slaves running.  The slaves will compete for inodes
+                * to flush.
+                *
+                * Make a per-thread copy of the transaction.
                 */
-               slave_index = 0;
-               info = TAILQ_FIRST(&hmp->flusher.ready_list);
-               next_ip = RB_FIRST(hammer_fls_rb_tree, &flg->flush_tree);
-
-               while ((ip = next_ip) != NULL) {
-                       next_ip = RB_NEXT(hammer_fls_rb_tree,
-                                         &flg->flush_tree, ip);
-
-                       if (++hmp->check_yield > hammer_yield_check) {
-                               hmp->check_yield = 0;
-                               lwkt_yield();
-                       }
-
-                       /*
-                        * Add ip to the slave's work array.  The slave is
-                        * not currently running.
-                        */
-                       info->work_array[info->count++] = ip;
-                       if (info->count != HAMMER_FLUSH_GROUP_SIZE)
-                               continue;
-
-                       /*
-                        * Get the slave running
-                        */
+               while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
                        TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
-                       TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
                        info->flg = flg;
                        info->runstate = 1;
-                       wakeup(&info->runstate);
-
-                       /*
-                        * Get a new slave.  We may have to wait for one to
-                        * finish running.
-                        */
-                       while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) == NULL) {
-                               tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);
-                       }
-               }
-
-               /*
-                * Run the current slave if necessary
-                */
-               if (info->count) {
-                       TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
+                       info->trans = hmp->flusher.trans;
                        TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
-                       info->flg = flg;
-                       info->runstate = 1;
                        wakeup(&info->runstate);
                }
 
@@ -497,8 +461,6 @@ hammer_flusher_slave_thread(void *arg)
        hammer_flush_group_t flg;
        hammer_flusher_info_t info;
        hammer_mount_t hmp;
-       hammer_inode_t ip;
-       int i;
 
        info = arg;
        hmp = info->hmp;
@@ -511,13 +473,12 @@ hammer_flusher_slave_thread(void *arg)
                        break;
                flg = info->flg;
 
-               for (i = 0; i < info->count; ++i) {
-                       ip = info->work_array[i];
-                       hammer_flusher_flush_inode(ip, &hmp->flusher.trans);
-                       ++hammer_stats_inode_flushes;
-               }
+               RB_SCAN(hammer_fls_rb_tree, &flg->flush_tree, NULL,
+                       hammer_flusher_flush_inode, info);
+
                info->count = 0;
                info->runstate = 0;
+               info->flg = NULL;
                TAILQ_REMOVE(&hmp->flusher.run_list, info, entry);
                TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
                wakeup(&hmp->flusher.ready_list);
@@ -563,12 +524,27 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
  * error other then EWOULDBLOCK will force the mount to be read-only.
  */
 static
-void
-hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
+int
+hammer_flusher_flush_inode(hammer_inode_t ip, void *data)
 {
-       hammer_mount_t hmp = ip->hmp;
+       hammer_flusher_info_t info = data;
+       hammer_mount_t hmp = info->hmp;
+       hammer_transaction_t trans = &info->trans;
        int error;
 
+       /*
+        * Several slaves are operating on the same flush group concurrently.
+        * The SLAVEFLUSH flag prevents them from tripping over each other.
+        *
+        * NOTE: It is possible for an EWOULDBLOCK'd ip returned by one slave
+        *       to be resynced by another, but normally such inodes are not
+        *       revisited until the master loop gets to them.
+        */
+       if (ip->flags & HAMMER_INODE_SLAVEFLUSH)
+               return(0);
+       ip->flags |= HAMMER_INODE_SLAVEFLUSH;
+       ++hammer_stats_inode_flushes;
+
        hammer_flusher_clean_loose_ios(hmp);
        error = hammer_sync_inode(trans, ip);
 
@@ -584,6 +560,8 @@ hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
                        error = 0;
        }
        hammer_flush_inode_done(ip, error);
+       /* ip invalid */
+
        while (hmp->flusher.finalize_want)
                tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
        if (hammer_flusher_undo_exhausted(trans, 1)) {
@@ -592,6 +570,7 @@ hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
        } else if (hammer_flusher_meta_limit(trans->hmp)) {
                hammer_flusher_finalize(trans, 0);
        }
+       return (0);
 }
 
 /*
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index 41a9b01..25adbe9 100644
@@ -237,8 +237,8 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap)
                ip->vp = NULL;
 
                if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
-                       ++hammer_count_reclaiming;
-                       ++hmp->inode_reclaims;
+                       ++hammer_count_reclaims;
+                       ++hmp->count_reclaims;
                        ip->flags |= HAMMER_INODE_RECLAIM;
                }
                hammer_unlock(&ip->lock);
@@ -2182,7 +2182,7 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
         * inode reclaim wait pipeline continues to work.
         */
        if (flg->total_count >= hammer_autoflush ||
-           flg->total_count >= hammer_limit_reclaim / 4) {
+           flg->total_count >= hammer_limit_reclaims / 4) {
                if (hmp->fill_flush_group == flg)
                        hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
                hammer_flusher_async(hmp, flg);
@@ -2622,6 +2622,8 @@ hammer_flush_inode_done(hammer_inode_t ip, int error)
                --hmp->rsv_inodes;
        }
 
+       ip->flags &= ~HAMMER_INODE_SLAVEFLUSH;
+
        if (dorel)
                hammer_rel_inode(ip, 0);
 }
@@ -3244,8 +3246,8 @@ hammer_inode_wakereclaims(hammer_inode_t ip)
        if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
                return;
 
-       --hammer_count_reclaiming;
-       --hmp->inode_reclaims;
+       --hammer_count_reclaims;
+       --hmp->count_reclaims;
        ip->flags &= ~HAMMER_INODE_RECLAIM;
 
        if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
@@ -3263,11 +3265,11 @@ hammer_inode_wakereclaims(hammer_inode_t ip)
  * if a new inode is created or an inode is loaded from media.
  *
  * When we block we don't care *which* inode has finished reclaiming,
- * as lone as one does.
+ * as long as one does.
  *
- * The reclaim pipeline is primary governed by the auto-flush which is
- * 1/4 hammer_limit_reclaim.  We don't want to block if the count is
- * less than 1/2 hammer_limit_reclaim.  From 1/2 to full count is
+ * The reclaim pipeline is primarily governed by the auto-flush which is
+ * 1/4 hammer_limit_reclaims.  We don't want to block if the count is
+ * less than 1/2 hammer_limit_reclaims.  From 1/2 to full count is
  * dynamically governed.
  */
 void
@@ -3279,7 +3281,7 @@ hammer_inode_waitreclaims(hammer_transaction_t trans)
 
        /*
         * Track inode load, delay if the number of reclaiming inodes is
-        * between 2/4 and 4/4 hammer_limit_reclaim, depending.
+        * between 2/4 and 4/4 hammer_limit_reclaims, depending.
         */
        if (curthread->td_proc) {
                struct hammer_inostats *stats;
@@ -3287,17 +3289,17 @@ hammer_inode_waitreclaims(hammer_transaction_t trans)
                stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
                ++stats->count;
 
-               if (stats->count > hammer_limit_reclaim / 2)
-                       stats->count = hammer_limit_reclaim / 2;
-               lower_limit = hammer_limit_reclaim - stats->count;
+               if (stats->count > hammer_limit_reclaims / 2)
+                       stats->count = hammer_limit_reclaims / 2;
+               lower_limit = hammer_limit_reclaims - stats->count;
                if (hammer_debug_general & 0x10000) {
                        kprintf("pid %5d limit %d\n",
                                (int)curthread->td_proc->p_pid, lower_limit);
                }
        } else {
-               lower_limit = hammer_limit_reclaim * 3 / 4;
+               lower_limit = hammer_limit_reclaims * 3 / 4;
        }
-       if (hmp->inode_reclaims >= lower_limit) {
+       if (hmp->count_reclaims >= lower_limit) {
                reclaim.count = 1;
                TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
                tsleep(&reclaim, 0, "hmrrcm", hz);
@@ -3384,13 +3386,13 @@ hammer_inode_waithard(hammer_mount_t hmp)
         * Hysteresis.
         */
        if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
-               if (hmp->inode_reclaims < hammer_limit_reclaim / 2 &&
+               if (hmp->count_reclaims < hammer_limit_reclaims / 2 &&
                    hmp->count_iqueued < hmp->count_inodes / 20) {
                        hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
                        return;
                }
        } else {
-               if (hmp->inode_reclaims < hammer_limit_reclaim ||
+               if (hmp->count_reclaims < hammer_limit_reclaims ||
                    hmp->count_iqueued < hmp->count_inodes / 10) {
                        return;
                }
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index 3d5d319..4dfb470 100644
@@ -62,10 +62,11 @@ int hammer_debug_recover_faults;
 int hammer_debug_critical;             /* non-zero enter debugger on error */
 int hammer_cluster_enable = 1;         /* enable read clustering by default */
 int hammer_live_dedup = 0;
+int hammer_tdmux_ticks;
 int hammer_count_fsyncs;
 int hammer_count_inodes;
 int hammer_count_iqueued;
-int hammer_count_reclaiming;
+int hammer_count_reclaims;
 int hammer_count_records;
 int hammer_count_record_datas;
 int hammer_count_volumes;
@@ -103,7 +104,7 @@ int hammer_limit_dirtybufspace;             /* per-mount */
 int hammer_limit_running_io;           /* per-mount */
 int hammer_limit_recs;                 /* as a whole XXX */
 int hammer_limit_inode_recs = 2048;    /* per inode */
-int hammer_limit_reclaim;
+int hammer_limit_reclaims;
 int hammer_live_dedup_cache_size = DEDUP_CACHE_SIZE;
 int hammer_limit_redo = 4096 * 1024;   /* per inode */
 int hammer_autoflush = 500;            /* auto flush (typ on reclaim) */
@@ -160,6 +161,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, cluster_enable, CTLFLAG_RW,
  */
 SYSCTL_INT(_vfs_hammer, OID_AUTO, live_dedup, CTLFLAG_RW,
           &hammer_live_dedup, 0, "Enable live dedup");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, tdmux_ticks, CTLFLAG_RW,
+          &hammer_tdmux_ticks, 0, "Hammer tdmux ticks");
 
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
           &hammer_limit_dirtybufspace, 0, "");
@@ -169,8 +172,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_recs, CTLFLAG_RW,
           &hammer_limit_recs, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_inode_recs, CTLFLAG_RW,
           &hammer_limit_inode_recs, 0, "");
-SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_reclaim, CTLFLAG_RW,
-          &hammer_limit_reclaim, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_reclaims, CTLFLAG_RW,
+          &hammer_limit_reclaims, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, live_dedup_cache_size, CTLFLAG_RW,
           &hammer_live_dedup_cache_size, 0,
           "Number of cache entries");
@@ -183,8 +186,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, count_inodes, CTLFLAG_RD,
           &hammer_count_inodes, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_iqueued, CTLFLAG_RD,
           &hammer_count_iqueued, 0, "");
-SYSCTL_INT(_vfs_hammer, OID_AUTO, count_reclaiming, CTLFLAG_RD,
-          &hammer_count_reclaiming, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, count_reclaims, CTLFLAG_RD,
+          &hammer_count_reclaims, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_records, CTLFLAG_RD,
           &hammer_count_records, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_record_datas, CTLFLAG_RD,
@@ -332,6 +335,17 @@ hammer_vfs_init(struct vfsconf *conf)
 {
        int n;
 
+       /*
+        * Wait up to this long for an exclusive deadlock to clear
+        * before acquiring a new shared lock on the ip.  The deadlock
+        * may have occurred on a b-tree node related to the ip.
+        */
+       if (hammer_tdmux_ticks == 0)
+               hammer_tdmux_ticks = hz / 5;
+
+       /*
+        * Autosize
+        */
        if (hammer_limit_recs == 0) {
                hammer_limit_recs = nbuf * 25;
                n = kmalloc_limit(M_HAMMER) / 512;
@@ -354,6 +368,7 @@ hammer_vfs_init(struct vfsconf *conf)
         */
        if (hammer_limit_running_io == 0)
                hammer_limit_running_io = hammer_limit_dirtybufspace;
+
        if (hammer_limit_running_io > 10 * 1024 * 1024)
                hammer_limit_running_io = 10 * 1024 * 1024;
 
@@ -362,8 +377,8 @@ hammer_vfs_init(struct vfsconf *conf)
         * This limits the number of inodes in this state to prevent a
         * memory pool blowout.
         */
-       if (hammer_limit_reclaim == 0)
-               hammer_limit_reclaim = desiredvnodes / 10;
+       if (hammer_limit_reclaims == 0)
+               hammer_limit_reclaims = desiredvnodes / 10;
 
        return(0);
 }
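
  Usage note for the renamed sysctls: the external names exported by the
  SYSCTL_INT() declarations above are vfs.hammer.tdmux_ticks,
  vfs.hammer.count_reclaims and vfs.hammer.limit_reclaims.  A minimal
  sketch of reading them from userland with sysctlbyname(3); the helper
  below is illustrative only and assumes nothing beyond those names.

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	/* Read and print a single integer-valued sysctl by name */
	static void
	show(const char *name)
	{
		int value;
		size_t len = sizeof(value);

		if (sysctlbyname(name, &value, &len, NULL, 0) == 0)
			printf("%-27s %d\n", name, value);
		else
			printf("%-27s (not available)\n", name);
	}

	int
	main(void)
	{
		show("vfs.hammer.tdmux_ticks");
		show("vfs.hammer.count_reclaims");
		show("vfs.hammer.limit_reclaims");
		return(0);
	}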