HAMMER VFS - frontload kmalloc()s when rebalancing
author      Matthew Dillon <dillon@apollo.backplane.com>
            Sat, 20 Mar 2010 20:01:38 +0000 (13:01 -0700)
committer   Matthew Dillon <dillon@apollo.backplane.com>
            Sat, 20 Mar 2010 20:01:38 +0000 (13:01 -0700)
* The rebalancing code must allocate upwards of 16MB of memory to hold
  copies of B-Tree nodes (~64*64*4K).  This is enough to blow out the
  emergency memory reserve used by the pageout daemon and deadlock the
  system in low memory situations.

* Refactor the allocations.  Allocate all the memory up-front so no
  major allocations occur while nodes in the B-Tree are held locked.
  The resulting calling sequence is sketched below.

* There are probably other cases where this may become a problem.  With
  UFS it wasn't an issue because flushing a file was fairly unsophisticated.
  But with HAMMER certain aspects of the flush require B-Tree lookups and
  can't be dumbed down to a simple raw disk write.

  The rebalancing code was the most egregious abuser of kernel memory,
  though, and that should now be fixed.
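
  The new calling sequence, mirroring hammer_ioc_rebalance() and
  rebalance_node() in the diff below (a sketch; error handling and
  the cursor scan are elided):

      struct hammer_node_lock lcache;

      /*
       * Preallocate all lock items and their 4K node-copy buffers
       * before any B-Tree locks are taken.  Depth 2 covers up to
       * 64*64 = 4096 nodes, ~16MB of copy space.
       */
      hammer_btree_lcache_init(trans->hmp, &lcache, 2);

      /* per candidate subtree: lock children, rebalance via copies */
      error = hammer_btree_lock_children(cursor, 2, &lockroot, &lcache);
      ...
      hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, &lcache);

      /* free the preallocated items once the scan completes */
      hammer_btree_lcache_free(trans->hmp, &lcache);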

Reported-by: Francois Tigeot <ftigeot@wolfpond.org>
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_rebalance.c
sys/vfs/hammer/hammer_reblock.c

index 6a437a0..3040abc 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -724,6 +724,7 @@ struct hammer_node_lock {
 typedef struct hammer_node_lock *hammer_node_lock_t;
 
 #define HAMMER_NODE_LOCK_UPDATED       0x0001
+#define HAMMER_NODE_LOCK_LCACHE        0x0002
 
 /*
  * Common I/O management structure - embedded in in-memory structures
@@ -1142,14 +1143,19 @@ int     hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid);
 int    btree_set_parent(hammer_transaction_t trans, hammer_node_t node,
                         hammer_btree_elm_t elm);
 void   hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node);
+void   hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache,
+                       int depth);
+void   hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache);
 int    hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
-                       hammer_node_lock_t parent);
+                       hammer_node_lock_t parent,
+                       hammer_node_lock_t lcache);
 void   hammer_btree_lock_copy(hammer_cursor_t cursor,
                        hammer_node_lock_t parent);
 int    hammer_btree_sync_copy(hammer_cursor_t cursor,
                        hammer_node_lock_t parent);
-void   hammer_btree_unlock_children(hammer_cursor_t cursor,
-                       hammer_node_lock_t parent);
+void   hammer_btree_unlock_children(hammer_mount_t hmp,
+                       hammer_node_lock_t parent,
+                       hammer_node_lock_t lcache);
 int    hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node);
 hammer_node_t hammer_btree_get_parent(hammer_transaction_t trans,
                        hammer_node_t node, int *parent_indexp,
index 91cb117..b1e2a28 100644
--- a/sys/vfs/hammer/hammer_btree.c
+++ b/sys/vfs/hammer/hammer_btree.c
@@ -1526,7 +1526,7 @@ btree_split_internal(hammer_cursor_t cursor)
        const int esize = sizeof(*elm);
 
        hammer_node_lock_init(&lockroot, cursor->node);
-       error = hammer_btree_lock_children(cursor, 1, &lockroot);
+       error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
        if (error)
                goto done;
        if ((error = hammer_cursor_upgrade(cursor)) != 0)
@@ -1750,7 +1750,7 @@ btree_split_internal(hammer_cursor_t cursor)
                 &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0);
 
 done:
-       hammer_btree_unlock_children(cursor, &lockroot);
+       hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);
        hammer_cursor_downgrade(cursor);
        return (error);
 }
@@ -2667,6 +2667,54 @@ hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node)
        parent->flags = 0;
 }
 
+/*
+ * Initialize a cache of hammer_node_lock's including space allocated
+ * for node copies.
+ *
+ * This is used by the rebalancing code to preallocate the copy space
+ * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER
+ * locks, otherwise we can blow out the pageout daemon's emergency
+ * reserve and deadlock it.
+ *
+ * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache.
+ *      The flag is set when the item is pulled off the cache for use.
+ */
+void
+hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache,
+                        int depth)
+{
+       hammer_node_lock_t item;
+       int count;
+
+       for (count = 1; depth; --depth)
+               count *= HAMMER_BTREE_LEAF_ELMS;
+       bzero(lcache, sizeof(*lcache));
+       TAILQ_INIT(&lcache->list);
+       while (count) {
+               item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO);
+               item->copy = kmalloc(sizeof(*item->copy),
+                                    hmp->m_misc, M_WAITOK);
+               TAILQ_INIT(&item->list);
+               TAILQ_INSERT_TAIL(&lcache->list, item, entry);
+               --count;
+       }
+}
+
+void
+hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache)
+{
+       hammer_node_lock_t item;
+
+       while ((item = TAILQ_FIRST(&lcache->list)) != NULL) {
+               TAILQ_REMOVE(&lcache->list, item, entry);
+               KKASSERT(item->copy);
+               KKASSERT(TAILQ_EMPTY(&item->list));
+               kfree(item->copy, hmp->m_misc);
+               kfree(item, hmp->m_misc);
+       }
+       KKASSERT(lcache->copy == NULL);
+}
+
 /*
  * Exclusively lock all the children of node.  This is used by the split
  * code to prevent anyone from accessing the children of a cursor node
@@ -2683,7 +2731,8 @@ hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node)
  */
 int
 hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
-                          hammer_node_lock_t parent)
+                          hammer_node_lock_t parent,
+                          hammer_node_lock_t lcache)
 {
        hammer_node_t node;
        hammer_node_lock_t item;
@@ -2746,10 +2795,20 @@ hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
                                error = EDEADLK;
                                hammer_rel_node(child);
                        } else {
-                               item = kmalloc(sizeof(*item), hmp->m_misc,
-                                              M_WAITOK|M_ZERO);
+                               if (lcache) {
+                                       item = TAILQ_FIRST(&lcache->list);
+                                       KKASSERT(item != NULL);
+                                       item->flags |= HAMMER_NODE_LOCK_LCACHE;
+                                       TAILQ_REMOVE(&lcache->list,
+                                                    item, entry);
+                               } else {
+                                       item = kmalloc(sizeof(*item),
+                                                      hmp->m_misc,
+                                                      M_WAITOK|M_ZERO);
+                                       TAILQ_INIT(&item->list);
+                               }
+
                                TAILQ_INSERT_TAIL(&parent->list, item, entry);
-                               TAILQ_INIT(&item->list);
                                item->parent = parent;
                                item->node = child;
                                item->index = i;
@@ -2762,13 +2821,14 @@ hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
                                        error = hammer_btree_lock_children(
                                                        cursor,
                                                        depth - 1,
-                                                       item);
+                                                       item,
+                                                       lcache);
                                }
                        }
                }
        }
        if (error)
-               hammer_btree_unlock_children(cursor, parent);
+               hammer_btree_unlock_children(hmp, parent, lcache);
        return(error);
 }
 
@@ -2783,10 +2843,12 @@ hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
        hammer_node_lock_t item;
 
        if (parent->copy == NULL) {
-               parent->copy = kmalloc(sizeof(*parent->copy), hmp->m_misc,
-                                      M_WAITOK);
-               *parent->copy = *parent->node->ondisk;
+               KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0);
+               parent->copy = kmalloc(sizeof(*parent->copy),
+                                      hmp->m_misc, M_WAITOK);
        }
+       KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0);
+       *parent->copy = *parent->node->ondisk;
        TAILQ_FOREACH(item, &parent->list, entry) {
                hammer_btree_lock_copy(cursor, item);
        }
@@ -2821,22 +2883,45 @@ hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
  * Release previously obtained node locks.  The caller is responsible for
  * cleaning up parent->node itself (its usually just aliased from a cursor),
  * but this function will take care of the copies.
+ *
+ * NOTE: The root node is not placed in the lcache and node->copy is not
+ *      deallocated when lcache != NULL.
  */
 void
-hammer_btree_unlock_children(hammer_cursor_t cursor, hammer_node_lock_t parent)
+hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent,
+                            hammer_node_lock_t lcache)
 {
        hammer_node_lock_t item;
+       hammer_node_ondisk_t copy;
 
-       if (parent->copy) {
-               kfree(parent->copy, cursor->trans->hmp->m_misc);
-               parent->copy = NULL;    /* safety */
-       }
        while ((item = TAILQ_FIRST(&parent->list)) != NULL) {
                TAILQ_REMOVE(&parent->list, item, entry);
-               hammer_btree_unlock_children(cursor, item);
+               hammer_btree_unlock_children(hmp, item, lcache);
                hammer_unlock(&item->node->lock);
                hammer_rel_node(item->node);
-               kfree(item, cursor->trans->hmp->m_misc);
+               if (lcache) {
+                       /*
+                        * NOTE: When placing the item back in the lcache
+                        *       the flag is cleared by the bzero().
+                        *       Remaining fields are cleared as a safety
+                        *       measure.
+                        */
+                       KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE);
+                       KKASSERT(TAILQ_EMPTY(&item->list));
+                       copy = item->copy;
+                       bzero(item, sizeof(*item));
+                       TAILQ_INIT(&item->list);
+                       item->copy = copy;
+                       if (copy)
+                               bzero(copy, sizeof(*copy));
+                       TAILQ_INSERT_TAIL(&lcache->list, item, entry);
+               } else {
+                       kfree(item, hmp->m_misc);
+               }
+       }
+       if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) {
+               kfree(parent->copy, hmp->m_misc);
+               parent->copy = NULL;    /* safety */
        }
 }
 
index 1b8de5c..5390aa2 100644
--- a/sys/vfs/hammer/hammer_rebalance.c
+++ b/sys/vfs/hammer/hammer_rebalance.c
@@ -35,7 +35,7 @@
 #include "hammer.h"
 
 static int rebalance_node(struct hammer_ioc_rebalance *rebal,
-                       hammer_cursor_t cursor);
+                       hammer_cursor_t cursor, hammer_node_lock_t lcache);
 static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
                        hammer_btree_elm_t elm);
 static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
@@ -55,6 +55,7 @@ hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
                 struct hammer_ioc_rebalance *rebal)
 {
        struct hammer_cursor cursor;
+       struct hammer_node_lock lcache;
        hammer_btree_leaf_elm_t elm;
        int error;
        int seq;
@@ -79,6 +80,8 @@ hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
        rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        rebal->key_cur.localization += ip->obj_localization;
 
+       hammer_btree_lcache_init(trans->hmp, &lcache, 2);
+
        seq = trans->hmp->flusher.act;
 
        /*
@@ -135,7 +138,7 @@ retry:
                         */
                        if (cursor.index == cursor.node->ondisk->count - 1) {
                                hammer_sync_lock_sh(trans);
-                               error = rebalance_node(rebal, &cursor);
+                               error = rebalance_node(rebal, &cursor, &lcache);
                                hammer_sync_unlock(trans);
                        }
                } else {
@@ -208,6 +211,7 @@ retry:
        }
 failed:
        rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
+       hammer_btree_lcache_free(trans->hmp, &lcache);
        return(error);
 }
 
@@ -236,7 +240,8 @@ failed:
  * XXX live-tracked
  */
 static int
-rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
+rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
+              struct hammer_node_lock *lcache)
 {
        struct hammer_node_lock lockroot;
        hammer_node_lock_t base_item;
@@ -264,7 +269,7 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
        error = hammer_cursor_upgrade(cursor);
        if (error)
                goto done;
-       error = hammer_btree_lock_children(cursor, 2, &lockroot);
+       error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
        if (error)
                goto done;
 
@@ -480,7 +485,7 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
         */
        rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
 done:
-       hammer_btree_unlock_children(cursor, &lockroot);
+       hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
        hammer_cursor_downgrade(cursor);
        return (error);
 }
index a27a1be..3fc2301 100644
--- a/sys/vfs/hammer/hammer_reblock.c
+++ b/sys/vfs/hammer/hammer_reblock.c
@@ -504,7 +504,7 @@ hammer_reblock_int_node(struct hammer_ioc_reblock *reblock,
        int i;
 
        hammer_node_lock_init(&lockroot, cursor->node);
-       error = hammer_btree_lock_children(cursor, 1, &lockroot);
+       error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
        if (error)
                goto done;
 
@@ -586,7 +586,7 @@ hammer_reblock_int_node(struct hammer_ioc_reblock *reblock,
        hammer_rel_node(onode);
 
 done:
-       hammer_btree_unlock_children(cursor, &lockroot);
+       hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);
        return (error);
 }