HAMMER VFS - The backend flusher now sorts inodes
author    Matthew Dillon <dillon@apollo.backplane.com>
          Mon, 2 Nov 2009 17:11:46 +0000 (09:11 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
          Mon, 2 Nov 2009 17:11:46 +0000 (09:11 -0800)
* Change the tailq of inodes in a flush group to a red-black tree.
  The flusher now processes inodes in sorted order and breaks them up
  into larger sets (128 inodes, up from 64) for concurrent flushing.
  The flusher threads are thus more likely to concurrently process
  inodes which are fairly far apart in the B-Tree; see the sketches
  below.

  This greatly reduces lock interference between flusher threads.  However,
  B-Tree deadlocks are still an issue between inodes undergoing flushes
  and front-end access operations.  This can be observed by noting periods
  of low dev-write activity in 'hammer iostats 1' output during a blogbench
  test.  The hammer-S* kernel threads will likely be in a 'hmrdlk' state
  at the same time.
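
  For reference, "sorted order" means B-Tree key order.  Below is a
  minimal user-space sketch of the comparison with stand-in types,
  assuming hammer_ino_rb_compare() keys on localization, then object
  id, then as-of TID (a simplification, not the kernel code):

	#include <stdint.h>

	/* Stand-ins for the hammer_inode key fields (sketch only). */
	struct ino_key {
		uint32_t obj_localization;	/* localization / PFS */
		int64_t  obj_id;		/* object identifier */
		uint64_t obj_asof;		/* snapshot TID or 0 */
	};

	/*
	 * B-Tree key order: localization first, then object id,
	 * then as-of TID.
	 */
	static int
	ino_cmp(const struct ino_key *a, const struct ino_key *b)
	{
		if (a->obj_localization != b->obj_localization)
			return ((a->obj_localization <
				 b->obj_localization) ? -1 : 1);
		if (a->obj_id != b->obj_id)
			return ((a->obj_id < b->obj_id) ? -1 : 1);
		if (a->obj_asof != b->obj_asof)
			return ((a->obj_asof < b->obj_asof) ? -1 : 1);
		return (0);
	}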
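
  And a toy illustration of how the sorted inodes break up into sets:
  each slave receives a contiguous run of HAMMER_FLUSH_GROUP_SIZE
  keys.  The kernel hands runs to whichever slave is next on a ready
  list; a simple modulo stands in for that here (hypothetical, for
  illustration only):

	#include <stdio.h>

	#define FLUSH_GROUP_SIZE 128	/* mirrors HAMMER_FLUSH_GROUP_SIZE */

	int
	main(void)
	{
		int nslaves = 4;	/* hypothetical slave count */
		int i;

		/*
		 * Contiguous runs of sorted keys go to different
		 * slaves, so concurrent slaves tend to work in
		 * distant parts of the B-Tree.
		 */
		for (i = 0; i < 1024; i += FLUSH_GROUP_SIZE) {
			printf("keys [%4d..%4d] -> slave %d\n", i,
			       i + FLUSH_GROUP_SIZE - 1,
			       (i / FLUSH_GROUP_SIZE) % nslaves);
		}
		return (0);
	}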

* Add the sysctl vfs.hammer.limit_reclaim to set the maximum number
  of inodes with no vnode association (default 4000).

  NOTE: This is intended for debugging only; setting the value too
  high will blow out the kmalloc pool.
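
  The tunable can be adjusted at runtime, e.g. via
  'sysctl vfs.hammer.limit_reclaim=8000', or programmatically; a
  minimal sketch using sysctlbyname(3), with 8000 as a purely
  hypothetical value:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int lim = 8000;		/* hypothetical; default is 4000 */

		/* Raise vfs.hammer.limit_reclaim (needs root). */
		if (sysctlbyname("vfs.hammer.limit_reclaim",
				 NULL, NULL, &lim, sizeof(lim)) < 0) {
			perror("sysctlbyname");
			return (1);
		}
		return (0);
	}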

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_vfsops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 35393ec..51a7378 100644
@@ -243,10 +243,15 @@ TAILQ_HEAD(hammer_node_cache_list, hammer_node_cache);
  * Without this a 'sync' could end up flushing 50,000 inodes in a single
  * transaction.
  */
+struct hammer_fls_rb_tree;
+RB_HEAD(hammer_fls_rb_tree, hammer_inode);
+RB_PROTOTYPE(hammer_fls_rb_tree, hammer_inode, rb_flsnode,
+             hammer_ino_rb_compare);
+
 struct hammer_flush_group {
        TAILQ_ENTRY(hammer_flush_group) flush_entry;
-       TAILQ_HEAD(, hammer_inode)      flush_list;
-       int                             unused01;       /* inode load */
+       struct hammer_fls_rb_tree       flush_tree;
+       int                             unused01;
        int                             total_count;    /* record load */
        int                             running;        /* group is running */
        int                             closed;
@@ -291,7 +296,7 @@ struct hammer_inode {
        RB_ENTRY(hammer_inode)  rb_node;
        hammer_inode_state_t    flush_state;
        hammer_flush_group_t    flush_group;
-       TAILQ_ENTRY(hammer_inode) flush_entry;
+       RB_ENTRY(hammer_inode)  rb_flsnode;     /* when on flush list */
        struct hammer_record_list target_list;  /* target of dependant recs */
        int64_t                 obj_id;         /* (key) object identifier */
        hammer_tid_t            obj_asof;       /* (key) snapshot or 0 */
@@ -365,8 +370,6 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_INODE_MODMASK_NOXDIRTY \
                                (HAMMER_INODE_MODMASK & ~HAMMER_INODE_XDIRTY)
 
-#define HAMMER_FLUSH_GROUP_SIZE        64
-
 #define HAMMER_FLUSH_SIGNAL    0x0001
 #define HAMMER_FLUSH_RECURSION 0x0002
 
@@ -381,8 +384,7 @@ struct hammer_reclaim {
        int     count;
 };
 
-#define HAMMER_RECLAIM_FLUSH   2000
-#define HAMMER_RECLAIM_WAIT    4000
+#define HAMMER_RECLAIM_WAIT    4000    /* default vfs.hammer.limit_reclaim */
 
 /*
  * Structure used to represent an unsynchronized record in-memory.  These
@@ -868,6 +870,7 @@ extern int hammer_count_io_locked;
 extern int hammer_limit_dirtybufspace;
 extern int hammer_limit_recs;
 extern int hammer_limit_inode_recs;
+extern int hammer_limit_reclaim;
 extern int hammer_bio_count;
 extern int hammer_verify_zone;
 extern int hammer_verify_data;
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index a728015..5991f3d 100644
@@ -48,6 +48,17 @@ static void hammer_flusher_flush(hammer_mount_t hmp);
 static void hammer_flusher_flush_inode(hammer_inode_t ip,
                                        hammer_transaction_t trans);
 
+RB_GENERATE(hammer_fls_rb_tree, hammer_inode, rb_flsnode,
+              hammer_ino_rb_compare);
+
+/*
+ * Inodes are sorted and assigned to slave threads in groups of 128.
+ * We want a flush group size large enough such that the slave threads
+ * are not likely to interfere with each other when accessing the B-Tree,
+ * but not so large that we lose concurrency.
+ */
+#define HAMMER_FLUSH_GROUP_SIZE 128
+
 /*
  * Support structures for the flusher threads.
  */
@@ -309,15 +320,16 @@ hammer_flusher_flush(hammer_mount_t hmp)
                        hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
 
                /*
-                * Iterate the inodes in the flg's flush_list and assign
+                * Iterate the inodes in the flg's flush_tree and assign
                 * them to slaves.
                 */
                slave_index = 0;
                info = TAILQ_FIRST(&hmp->flusher.ready_list);
-               next_ip = TAILQ_FIRST(&flg->flush_list);
+               next_ip = RB_FIRST(hammer_fls_rb_tree, &flg->flush_tree);
 
                while ((ip = next_ip) != NULL) {
-                       next_ip = TAILQ_NEXT(ip, flush_entry);
+                       next_ip = RB_NEXT(hammer_fls_rb_tree,
+                                         &flg->flush_tree, ip);
 
                        if (++hmp->check_yield > hammer_yield_check) {
                                hmp->check_yield = 0;
@@ -379,8 +391,7 @@ hammer_flusher_flush(hammer_mount_t hmp)
                 * Loop up on the same flg.  If the flg is done clean it up
                 * and break out.  We only flush one flg.
                 */
-               if (TAILQ_FIRST(&flg->flush_list) == NULL) {
-                       KKASSERT(TAILQ_EMPTY(&flg->flush_list));
+               if (RB_EMPTY(&flg->flush_tree)) {
                        KKASSERT(flg->refs == 0);
                        TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
                        kfree(flg, hmp->m_misc);
@@ -413,7 +424,7 @@ hammer_flusher_flush(hammer_mount_t hmp)
 
 
 /*
- * The slave flusher thread pulls work off the master flush_list until no
+ * The slave flusher thread pulls work off the master flush list until no
  * work is left.
  */
 static void
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index ef413f3..877a526 100644
@@ -1488,7 +1488,7 @@ hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
         */
        switch(ip->flush_state) {
        case HAMMER_FST_FLUSH:
-               TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
+               RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
                --ip->flush_group->refs;
                ip->flush_group = NULL;
                /* fall through */
@@ -1599,7 +1599,7 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
        if (flg == NULL) {
                flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
                hmp->next_flush_group = flg;
-               TAILQ_INIT(&flg->flush_list);
+               RB_INIT(&flg->flush_tree);
                TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
        }
 
@@ -2036,7 +2036,7 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
         * The flusher list inherits our inode and reference.
         */
        KKASSERT(flg->running == 0);
-       TAILQ_INSERT_TAIL(&flg->flush_list, ip, flush_entry);
+       RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
        if (--ip->hmp->flusher.group_lock == 0)
                wakeup(&ip->hmp->flusher.group_lock);
 
@@ -2354,7 +2354,7 @@ hammer_flush_inode_done(hammer_inode_t ip, int error)
                /*
                 * Remove from the flush_group
                 */
-               TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
+               RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
                ip->flush_group = NULL;
 
                /*
@@ -3009,7 +3009,7 @@ hammer_inode_wakereclaims(hammer_inode_t ip)
                        TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
                        wakeup(reclaim);
                }
-               if (hmp->inode_reclaims > HAMMER_RECLAIM_WAIT / 2)
+               if (hmp->inode_reclaims > hammer_limit_reclaim / 2)
                        break;
        }
 }
@@ -3027,7 +3027,7 @@ hammer_inode_waitreclaims(hammer_mount_t hmp)
 {
        struct hammer_reclaim reclaim;
 
-       if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT)
+       if (hmp->inode_reclaims < hammer_limit_reclaim)
                return;
        reclaim.count = 1;
        TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
@@ -3061,13 +3061,13 @@ hammer_inode_waithard(hammer_mount_t hmp)
         * Hysteresis.
         */
        if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
-               if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 &&
+               if (hmp->inode_reclaims < hammer_limit_reclaim / 2 &&
                    hmp->count_iqueued < hmp->count_inodes / 20) {
                        hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
                        return;
                }
        } else {
-               if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT ||
+               if (hmp->inode_reclaims < hammer_limit_reclaim ||
                    hmp->count_iqueued < hmp->count_inodes / 10) {
                        return;
                }
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index 89a3832..23be0ac 100644
@@ -98,6 +98,7 @@ int hammer_count_io_locked;
 int hammer_limit_dirtybufspace;                /* per-mount */
 int hammer_limit_recs;                 /* as a whole XXX */
 int hammer_limit_inode_recs = 1024;    /* per inode */
+int hammer_limit_reclaim = HAMMER_RECLAIM_WAIT;
 int hammer_autoflush = 2000;           /* auto flush */
 int hammer_bio_count;
 int hammer_verify_zone;
@@ -140,6 +141,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_recs, CTLFLAG_RW,
           &hammer_limit_recs, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_inode_recs, CTLFLAG_RW,
           &hammer_limit_inode_recs, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_reclaim, CTLFLAG_RW,
+          &hammer_limit_reclaim, 0, "");
 
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_fsyncs, CTLFLAG_RD,
           &hammer_count_fsyncs, 0, "");
@@ -377,7 +380,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                kmalloc_create(&hmp->m_inodes, "HAMMER-inodes");
 
                maxinodes = desiredvnodes + desiredvnodes / 5 +
-                           HAMMER_RECLAIM_WAIT;
+                           hammer_limit_reclaim * 2;
                kmalloc_raise_limit(hmp->m_inodes,
                                    maxinodes * sizeof(struct hammer_inode));
 
@@ -746,7 +749,7 @@ hammer_free_hmp(struct mount *mp)
        KKASSERT(RB_EMPTY(&hmp->rb_inos_root));
        while ((flg = TAILQ_FIRST(&hmp->flush_group_list)) != NULL) {
                TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
-               KKASSERT(TAILQ_EMPTY(&flg->flush_list));
+               KKASSERT(RB_EMPTY(&flg->flush_tree));
                if (flg->refs) {
                        kprintf("HAMMER: Warning, flush_group %p was "
                                "not empty on umount!\n", flg);