HAMMER performance and kernel memory issues.
author     Matthew Dillon <dillon@apollo.backplane.com>
           Sun, 28 Dec 2008 23:29:48 +0000 (15:29 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Sun, 28 Dec 2008 23:29:48 +0000 (15:29 -0800)
* The flusher could stall out due to deadlocks under certain heavy load
  situations (blogbench -i1000 -o), causing the number of hammer inodes
  to increase slowly until the kernel runs out of memory.

  Fix this by detecting the condition and instead stalling the offending
  threads for a short while.  Any user thread that blocked while
  performing B-Tree node I/O is a candidate (see the condensed routine
  below).
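
  In essence: a non-flusher transaction that performed B-Tree node I/O
  sets HAMMER_TRANSF_DIDIO, and hammer_done_transaction() then calls
  the new hammer_inode_waithard().  The routine is shown here condensed
  from the patch below (assertions elided):

	void
	hammer_inode_waithard(hammer_mount_t hmp)
	{
		/*
		 * Hysteresis: once in recovery mode, stay there until
		 * the backlog drops well below the trigger points.
		 */
		if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
			if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 &&
			    hmp->count_iqueued < hmp->count_inodes / 20) {
				hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
				return;
			}
		} else {
			if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT ||
			    hmp->count_iqueued < hmp->count_inodes / 10)
				return;
			hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
		}

		/* Stall the caller for exactly one flush cycle. */
		hammer_flusher_wait_next(hmp);
	}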

* Improve the stability of the inode reclaim heuristic which slows
  down the creation of new inodes when too many disconnected hammer
  inodes are present.
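
  The wait is now a bounded tsleep whose length scales linearly with
  the reclaim backlog, with a pipelined early wakeup (reclaim.count
  starts at 2, so two reclaims must retire before the waiter is woken
  ahead of the timer).  As a rough worked example -- the constants here
  are illustrative assumptions, not values taken from this patch --
  with HAMMER_RECLAIM_WAIT = 4000 and hz = 100:

	delay = (inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
		(HAMMER_RECLAIM_WAIT * 3) + 1;
	/* inode_reclaims =  4000 -> delay =   1 tick (minimum)  */
	/* inode_reclaims =  8000 -> delay =  34 ticks (~0.34s)  */
	/* inode_reclaims = 16000 -> delay = 101 ticks (~1s)     */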

* Remove vfs.hammer.limit_iqueued - this sysctl is no longer used.

Reported-by: YONETANI Tomokazu <qhwt+dfly@les.ath.cx>
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_transaction.c
sys/vfs/hammer/hammer_vfsops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index d1055c3..b0cfc01 100644
@@ -116,6 +116,7 @@ struct hammer_transaction {
 typedef struct hammer_transaction *hammer_transaction_t;
 
 #define HAMMER_TRANSF_NEWINODE 0x0001
+#define HAMMER_TRANSF_DIDIO    0x0002
 
 /*
  * HAMMER locks
@@ -352,11 +353,12 @@ typedef struct hammer_inode *hammer_inode_t;
 /*
  * Used by the inode reclaim code to pipeline reclaims and avoid
  * blowing out kernel memory or letting the flusher get too far
- * behind.
+ * behind.  The reclaim wakes up when count reaches 0 or the
+ * timer expires.
  */
 struct hammer_reclaim {
        TAILQ_ENTRY(hammer_reclaim) entry;
-       int     okydoky;
+       int     count;
 };
 
 #define HAMMER_RECLAIM_FLUSH   2000
@@ -753,6 +755,7 @@ struct hammer_mount {
 typedef struct hammer_mount    *hammer_mount_t;
 
 #define HAMMER_MOUNT_CRITICAL_ERROR    0x0001
+#define HAMMER_MOUNT_FLUSH_RECOVERY    0x0002
 
 struct hammer_sync_info {
        int error;
@@ -821,7 +824,6 @@ extern int hammer_count_io_running_read;
 extern int hammer_count_io_running_write;
 extern int hammer_count_io_locked;
 extern int hammer_limit_dirtybufspace;
-extern int hammer_limit_iqueued;
 extern int hammer_limit_recs;
 extern int hammer_bio_count;
 extern int hammer_verify_zone;
@@ -846,6 +848,7 @@ void        hammer_scan_inode_snapshots(hammer_mount_t hmp,
 void   hammer_put_inode(struct hammer_inode *ip);
 void   hammer_put_inode_ref(struct hammer_inode *ip);
 void   hammer_inode_waitreclaims(hammer_mount_t hmp);
+void   hammer_inode_waithard(hammer_mount_t hmp);
 
 int    hammer_unload_volume(hammer_volume_t volume, void *data __unused);
 int    hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused);
@@ -963,7 +966,8 @@ int hammer_btree_lock_children(hammer_cursor_t cursor,
 void   hammer_btree_unlock_children(hammer_cursor_t cursor,
                        struct hammer_node_locklist **locklistp);
 int    hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node);
-hammer_node_t hammer_btree_get_parent(hammer_node_t node, int *parent_indexp,
+hammer_node_t hammer_btree_get_parent(hammer_transaction_t trans,
+                       hammer_node_t node, int *parent_indexp,
                        int *errorp, int try_exclusive);
 
 void   hammer_print_btree_node(hammer_node_ondisk_t ondisk);
@@ -999,8 +1003,8 @@ void               hammer_rel_buffer(hammer_buffer_t buffer, int flush);
 
 int            hammer_vfs_export(struct mount *mp, int op,
                        const struct export_args *export);
-hammer_node_t  hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset,
-                       int isnew, int *errorp);
+hammer_node_t  hammer_get_node(hammer_transaction_t trans,
+                       hammer_off_t node_offset, int isnew, int *errorp);
 void           hammer_ref_node(hammer_node_t node);
 hammer_node_t  hammer_ref_node_safe(struct hammer_mount *hmp,
                        hammer_node_cache_t cache, int *errorp);
@@ -1169,6 +1173,7 @@ void hammer_flusher_sync(hammer_mount_t hmp);
 int  hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t flg);
 int  hammer_flusher_async_one(hammer_mount_t hmp);
 void hammer_flusher_wait(hammer_mount_t hmp, int seq);
+void hammer_flusher_wait_next(hammer_mount_t hmp);
 int  hammer_flusher_meta_limit(hammer_mount_t hmp);
 int  hammer_flusher_meta_halflimit(hammer_mount_t hmp);
 int  hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter);
diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c
index f3543fa..9c8a14b 100644
@@ -2331,8 +2331,8 @@ hammer_btree_mirror_propagate(hammer_cursor_t cursor, hammer_tid_t mirror_tid)
 }
 
 hammer_node_t
-hammer_btree_get_parent(hammer_node_t node, int *parent_indexp, int *errorp,
-                       int try_exclusive)
+hammer_btree_get_parent(hammer_transaction_t trans, hammer_node_t node,
+                       int *parent_indexp, int *errorp, int try_exclusive)
 {
        hammer_node_t parent;
        hammer_btree_elm_t elm;
@@ -2341,7 +2341,7 @@ hammer_btree_get_parent(hammer_node_t node, int *parent_indexp, int *errorp,
        /*
         * Get the node
         */
-       parent = hammer_get_node(node->hmp, node->ondisk->parent, 0, errorp);
+       parent = hammer_get_node(trans, node->ondisk->parent, 0, errorp);
        if (*errorp) {
                KKASSERT(parent == NULL);
                return(NULL);
@@ -2406,7 +2406,7 @@ btree_set_parent(hammer_transaction_t trans, hammer_node_t node,
        switch(elm->base.btype) {
        case HAMMER_BTREE_TYPE_INTERNAL:
        case HAMMER_BTREE_TYPE_LEAF:
-               child = hammer_get_node(node->hmp, elm->internal.subtree_offset,
+               child = hammer_get_node(trans, elm->internal.subtree_offset,
                                        0, &error);
                if (error == 0) {
                        hammer_modify_node_field(trans, child, parent);
@@ -2461,7 +2461,7 @@ hammer_btree_lock_children(hammer_cursor_t cursor,
                    elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) {
                        continue;
                }
-               child = hammer_get_node(hmp,
+               child = hammer_get_node(cursor->trans,
                                        elm->internal.subtree_offset,
                                        0, &error);
                if (child)
@@ -2479,7 +2479,7 @@ hammer_btree_lock_children(hammer_cursor_t cursor,
                case HAMMER_BTREE_TYPE_INTERNAL:
                case HAMMER_BTREE_TYPE_LEAF:
                        KKASSERT(elm->internal.subtree_offset != 0);
-                       child = hammer_get_node(hmp,
+                       child = hammer_get_node(cursor->trans,
                                                elm->internal.subtree_offset,
                                                0, &error);
                        break;
diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c
index 72b1240..039b238 100644
@@ -94,8 +94,7 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
                volume = hammer_get_root_volume(trans->hmp, &error);
                if (error)
                        break;
-               node = hammer_get_node(trans->hmp,
-                                      volume->ondisk->vol0_btree_root,
+               node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
                                       0, &error);
                hammer_rel_volume(volume, 0);
                if (error)
@@ -313,7 +312,8 @@ hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive)
 
        if (cursor->node->ondisk->parent) {
                node = cursor->node;
-               parent = hammer_btree_get_parent(node, &parent_index,
+               parent = hammer_btree_get_parent(cursor->trans, node,
+                                                &parent_index,
                                                 &error, try_exclusive);
                if (error == 0) {
                        elm = &parent->ondisk->elms[parent_index];
@@ -453,7 +453,7 @@ hammer_cursor_down(hammer_cursor_t cursor)
                KKASSERT(elm->internal.subtree_offset != 0);
                cursor->left_bound = &elm[0].internal.base;
                cursor->right_bound = &elm[1].internal.base;
-               node = hammer_get_node(cursor->trans->hmp,
+               node = hammer_get_node(cursor->trans,
                                       elm->internal.subtree_offset, 0, &error);
                if (error == 0) {
                        KASSERT(elm->base.btype == node->ondisk->type, ("BTYPE MISMATCH %c %c NODE %p\n", elm->base.btype, node->ondisk->type, node));
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index b167c51..d50997c 100644
@@ -137,6 +137,15 @@ hammer_flusher_wait(hammer_mount_t hmp, int seq)
 }
 
 void
+hammer_flusher_wait_next(hammer_mount_t hmp)
+{
+       int seq;
+
+       seq = hammer_flusher_async_one(hmp);
+       hammer_flusher_wait(hmp, seq);
+}
+
+void
 hammer_flusher_create(hammer_mount_t hmp)
 {
        hammer_flusher_info_t info;
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index 18abae7..e8df385 100644
@@ -51,7 +51,7 @@ static int    hammer_setup_parent_inodes(hammer_inode_t ip,
                                        hammer_flush_group_t flg);
 static int     hammer_setup_parent_inodes_helper(hammer_record_t record,
                                        hammer_flush_group_t flg);
-static void    hammer_inode_wakereclaims(hammer_inode_t ip);
+static void    hammer_inode_wakereclaims(hammer_inode_t ip, int dowake);
 
 #ifdef DEBUG_TRUNCATE
 extern struct hammer_inode *HammerTruncIp;
@@ -262,7 +262,7 @@ hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
                        obj_type = ip->ino_data.obj_type;
                        vp->v_type = hammer_get_vnode_type(obj_type);
 
-                       hammer_inode_wakereclaims(ip);
+                       hammer_inode_wakereclaims(ip, 0);
 
                        switch(ip->ino_data.obj_type) {
                        case HAMMER_OBJTYPE_CDEV:
@@ -358,6 +358,13 @@ hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
        /*
         * Determine if we already have an inode cached.  If we do then
         * we are golden.
+        *
+        * If we find an inode with no vnode we have to mark the
+        * transaction such that hammer_inode_waitreclaims() is
+        * called later on to avoid building up an infinite number
+        * of inodes.  Otherwise we can continue to add new inodes
+        * faster than they can be disposed of, even with the tsleep
+        * delay.
         */
        iinfo.obj_id = obj_id;
        iinfo.obj_asof = asof;
@@ -365,6 +372,10 @@ hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
 loop:
        ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
        if (ip) {
+#if 0
+               if (ip->vp == NULL)
+                       trans->flags |= HAMMER_TRANSF_NEWINODE;
+#endif
                hammer_ref(&ip->lock);
                *errorp = 0;
                return(ip);
@@ -510,6 +521,7 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
        ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
        ++hammer_count_inodes;
        ++hmp->count_inodes;
+       trans->flags |= HAMMER_TRANSF_NEWINODE;
 
        if (pfsm) {
                KKASSERT(pfsm->localization != 0);
@@ -672,7 +684,7 @@ hammer_free_inode(hammer_inode_t ip)
        KKASSERT(ip->lock.refs == 1);
        hammer_uncache_node(&ip->cache[0]);
        hammer_uncache_node(&ip->cache[1]);
-       hammer_inode_wakereclaims(ip);
+       hammer_inode_wakereclaims(ip, 1);
        if (ip->objid_cache)
                hammer_clear_objid(ip);
        --hammer_count_inodes;
@@ -2714,12 +2726,12 @@ hammer_test_inode(hammer_inode_t ip)
  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
  * reassociated with a vp or just before it gets freed.
  *
- * Wakeup one thread blocked waiting on reclaims to complete.  Note that
- * the inode the thread is waiting on behalf of is a different inode then
- * the inode we are called with.  This is to create a pipeline.
+ * Pipeline wakeups to threads blocked due to an excessive number of
+ * detached inodes.  The reclaim count generates a bit of negative
+ * feedback.
  */
 static void
-hammer_inode_wakereclaims(hammer_inode_t ip)
+hammer_inode_wakereclaims(hammer_inode_t ip, int dowake)
 {
        struct hammer_reclaim *reclaim;
        hammer_mount_t hmp = ip->hmp;
@@ -2731,10 +2743,12 @@ hammer_inode_wakereclaims(hammer_inode_t ip)
        --hmp->inode_reclaims;
        ip->flags &= ~HAMMER_INODE_RECLAIM;
 
-       if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
-               TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
-               reclaim->okydoky = 1;
-               wakeup(reclaim);
+       if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT || dowake) {
+               reclaim = TAILQ_FIRST(&hmp->reclaim_list);
+               if (reclaim && reclaim->count > 0 && --reclaim->count == 0) {
+                       TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
+                       wakeup(reclaim);
+               }
        }
 }
 
@@ -2752,21 +2766,55 @@ hammer_inode_waitreclaims(hammer_mount_t hmp)
        struct hammer_reclaim reclaim;
        int delay;
 
-       if (hmp->inode_reclaims > HAMMER_RECLAIM_WAIT) {
-               reclaim.okydoky = 0;
-               TAILQ_INSERT_TAIL(&hmp->reclaim_list,
-                                 &reclaim, entry);
-       } else {
-               reclaim.okydoky = 1;
+       if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT)
+               return;
+       delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
+               (HAMMER_RECLAIM_WAIT * 3) + 1;
+       if (delay > 0) {
+               reclaim.count = 2;
+               TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
+               tsleep(&reclaim, 0, "hmrrcm", delay);
+               if (reclaim.count > 0)
+                       TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
        }
+}
 
-       if (reclaim.okydoky == 0) {
-               delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
-                       (HAMMER_RECLAIM_WAIT * 5);
-               if (delay >= 0)
-                       tsleep(&reclaim, 0, "hmrrcm", delay + 1);
-               if (reclaim.okydoky == 0)
-                       TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
+/*
+ * When a larger than normal backlog of inodes is sitting in the flusher,
+ * enforce a general slowdown to let it catch up.  This routine is only
+ * called on completion of a non-flusher-related transaction which
+ * performed B-Tree node I/O.
+ *
+ * It is possible for the flusher to stall in a continuous load.
+ * blogbench -i1000 -o seems to do a good job generating this sort of load.
+ * If the flusher is unable to catch up, the inode count can bloat until
+ * we run out of kvm.
+ *
+ * This is a bit of a hack.
+ */
+void
+hammer_inode_waithard(hammer_mount_t hmp)
+{
+       /*
+        * Hysteresis.
+        */
+       if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
+               if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 &&
+                   hmp->count_iqueued < hmp->count_inodes / 20) {
+                       hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
+                       return;
+               }
+       } else {
+               if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT ||
+                   hmp->count_iqueued < hmp->count_inodes / 10) {
+                       return;
+               }
+               hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
        }
+
+       /*
+        * Block for one flush cycle.
+        */
+       hammer_flusher_wait_next(hmp);
 }
 
diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c
index 9bf7872..656b7b9 100644
@@ -1048,9 +1048,10 @@ hammer_bnew_ext(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
  * additional references, if necessary.
  */
 hammer_node_t
-hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset,
+hammer_get_node(hammer_transaction_t trans, hammer_off_t node_offset,
                int isnew, int *errorp)
 {
+       hammer_mount_t hmp = trans->hmp;
        hammer_node_t node;
 
        KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);
@@ -1074,10 +1075,12 @@ again:
                }
        }
        hammer_ref(&node->lock);
-       if (node->ondisk)
+       if (node->ondisk) {
                *errorp = 0;
-       else
+       } else {
                *errorp = hammer_load_node(node, isnew);
+               trans->flags |= HAMMER_TRANSF_DIDIO;
+       }
        if (*errorp) {
                hammer_rel_node(node);
                node = NULL;
@@ -1364,7 +1367,7 @@ hammer_alloc_btree(hammer_transaction_t trans, int *errorp)
                                            sizeof(struct hammer_node_ondisk),
                                            errorp);
        if (*errorp == 0) {
-               node = hammer_get_node(trans->hmp, node_offset, 1, errorp);
+               node = hammer_get_node(trans, node_offset, 1, errorp);
                hammer_modify_node_noundo(trans, node);
                bzero(node->ondisk, sizeof(*node->ondisk));
                hammer_modify_node_done(node);
diff --git a/sys/vfs/hammer/hammer_transaction.c b/sys/vfs/hammer/hammer_transaction.c
index 01ff37c..07a81f3 100644
@@ -118,6 +118,7 @@ hammer_start_transaction_fls(struct hammer_transaction *trans,
 void
 hammer_done_transaction(struct hammer_transaction *trans)
 {
+       hammer_mount_t hmp = trans->hmp;
        int expected_lock_refs;
 
        hammer_rel_volume(trans->rootvol, 0);
@@ -125,8 +126,12 @@ hammer_done_transaction(struct hammer_transaction *trans)
        expected_lock_refs = (trans->type == HAMMER_TRANS_FLS) ? 1 : 0;
        KKASSERT(trans->sync_lock_refs == expected_lock_refs);
        trans->sync_lock_refs = 0;
-       if (trans->flags & HAMMER_TRANSF_NEWINODE)
-               hammer_inode_waitreclaims(trans->hmp);
+       if (trans->type != HAMMER_TRANS_FLS) {
+               if (trans->flags & HAMMER_TRANSF_NEWINODE)
+                       hammer_inode_waitreclaims(hmp);
+               else if (trans->flags & HAMMER_TRANSF_DIDIO)
+                       hammer_inode_waithard(hmp);
+       }
 }
 
 /*
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index 46f9ca8..599e8e8 100644
@@ -93,7 +93,6 @@ int hammer_count_io_running_write;
 int hammer_count_io_locked;
 int hammer_limit_dirtybufspace;                /* per-mount */
 int hammer_limit_recs;                 /* as a whole XXX */
-int hammer_limit_iqueued;              /* per-mount */
 int hammer_autoflush = 2000;           /* auto flush */
 int hammer_bio_count;
 int hammer_verify_zone;
@@ -128,8 +127,6 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_dirtybufspace, CTLFLAG_RW,
           &hammer_limit_dirtybufspace, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_recs, CTLFLAG_RW,
           &hammer_limit_recs, 0, "");
-SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_iqueued, CTLFLAG_RW,
-          &hammer_limit_iqueued, 0, "");
 
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_fsyncs, CTLFLAG_RD,
           &hammer_count_fsyncs, 0, "");
@@ -272,8 +269,6 @@ hammer_vfs_init(struct vfsconf *conf)
                if (hammer_limit_dirtybufspace < 100)
                        hammer_limit_dirtybufspace = 100;
        }
-       if (hammer_limit_iqueued == 0)
-               hammer_limit_iqueued = desiredvnodes / 5;
        return(0);
 }