From 24cf83d2ed18a34bd6fc6fa42c1845f7c69c12f3 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 20 Mar 2010 13:01:38 -0700
Subject: [PATCH] HAMMER VFS - frontload kmalloc()'s when rebalancing

* The rebalancing code must allocate upwards of 16MB of memory to hold
  copies of B-Tree nodes (~64*64*4K).  This is enough to blow out the
  emergency memory reserve used by the pageout daemon and deadlock the
  system in low memory situations.

* Refactor the allocations.  Allocate all the memory up-front so no major
  allocations occur while nodes in the B-Tree are held locked.

* There are probably other cases where this may become a problem.  With UFS
  it wasn't an issue because flushing a file was fairly unsophisticated.
  But with HAMMER certain aspects of the flush require B-Tree lookups and
  can't be dumbed down to a simple raw disk write.

  The rebalancing code was the most egregious abuser of kernel memory,
  though, and that should now be fixed.

Reported-by: Francois Tigeot
---
 sys/vfs/hammer/hammer.h           |  12 ++-
 sys/vfs/hammer/hammer_btree.c     | 121 +++++++++++++++++++++++++-----
 sys/vfs/hammer/hammer_rebalance.c |  15 ++--
 sys/vfs/hammer/hammer_reblock.c   |   4 +-
 4 files changed, 124 insertions(+), 28 deletions(-)

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 6a437a02a9..3040abc03d 100644
--- a/sys/vfs/hammer/hammer.h
+++ b/sys/vfs/hammer/hammer.h
@@ -724,6 +724,7 @@ struct hammer_node_lock {
 typedef struct hammer_node_lock *hammer_node_lock_t;

 #define HAMMER_NODE_LOCK_UPDATED        0x0001
+#define HAMMER_NODE_LOCK_LCACHE         0x0002

 /*
  * Common I/O management structure - embedded in in-memory structures
@@ -1142,14 +1143,19 @@ int hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid);
 int btree_set_parent(hammer_transaction_t trans, hammer_node_t node,
                        hammer_btree_elm_t elm);
 void hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node);
+void hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache,
+                       int depth);
+void hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache);
 int hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
-                       hammer_node_lock_t parent);
+                       hammer_node_lock_t parent,
+                       hammer_node_lock_t lcache);
 void hammer_btree_lock_copy(hammer_cursor_t cursor,
                        hammer_node_lock_t parent);
 int hammer_btree_sync_copy(hammer_cursor_t cursor,
                        hammer_node_lock_t parent);
-void hammer_btree_unlock_children(hammer_cursor_t cursor,
-                       hammer_node_lock_t parent);
+void hammer_btree_unlock_children(hammer_mount_t hmp,
+                       hammer_node_lock_t parent,
+                       hammer_node_lock_t lcache);
 int hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node);
 hammer_node_t hammer_btree_get_parent(hammer_transaction_t trans,
                        hammer_node_t node, int *parent_indexp,
diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c
index 91cb117426..b1e2a28e22 100644
--- a/sys/vfs/hammer/hammer_btree.c
+++ b/sys/vfs/hammer/hammer_btree.c
@@ -1526,7 +1526,7 @@ btree_split_internal(hammer_cursor_t cursor)
         const int esize = sizeof(*elm);

         hammer_node_lock_init(&lockroot, cursor->node);
-        error = hammer_btree_lock_children(cursor, 1, &lockroot);
+        error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
         if (error)
                 goto done;
         if ((error = hammer_cursor_upgrade(cursor)) != 0)
@@ -1750,7 +1750,7 @@ btree_split_internal(hammer_cursor_t cursor)
                   &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0);

 done:
-        hammer_btree_unlock_children(cursor, &lockroot);
+        hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);
         hammer_cursor_downgrade(cursor);
         return (error);
 }
@@ -2667,6 +2667,54 @@ hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node)
         parent->flags = 0;
 }

+/*
+ * Initialize a cache of hammer_node_lock's including space allocated
+ * for node copies.
+ *
+ * This is used by the rebalancing code to preallocate the copy space
+ * for ~4096 B-Tree nodes (16MB of data) prior to acquiring any HAMMER
+ * locks, otherwise we can blow out the pageout daemon's emergency
+ * reserve and deadlock it.
+ *
+ * NOTE: HAMMER_NODE_LOCK_LCACHE is not set on items cached in the lcache.
+ *       The flag is set when the item is pulled off the cache for use.
+ */
+void
+hammer_btree_lcache_init(hammer_mount_t hmp, hammer_node_lock_t lcache,
+                         int depth)
+{
+        hammer_node_lock_t item;
+        int count;
+
+        for (count = 1; depth; --depth)
+                count *= HAMMER_BTREE_LEAF_ELMS;
+        bzero(lcache, sizeof(*lcache));
+        TAILQ_INIT(&lcache->list);
+        while (count) {
+                item = kmalloc(sizeof(*item), hmp->m_misc, M_WAITOK|M_ZERO);
+                item->copy = kmalloc(sizeof(*item->copy),
+                                     hmp->m_misc, M_WAITOK);
+                TAILQ_INIT(&item->list);
+                TAILQ_INSERT_TAIL(&lcache->list, item, entry);
+                --count;
+        }
+}
+
+void
+hammer_btree_lcache_free(hammer_mount_t hmp, hammer_node_lock_t lcache)
+{
+        hammer_node_lock_t item;
+
+        while ((item = TAILQ_FIRST(&lcache->list)) != NULL) {
+                TAILQ_REMOVE(&lcache->list, item, entry);
+                KKASSERT(item->copy);
+                KKASSERT(TAILQ_EMPTY(&item->list));
+                kfree(item->copy, hmp->m_misc);
+                kfree(item, hmp->m_misc);
+        }
+        KKASSERT(lcache->copy == NULL);
+}
+
 /*
  * Exclusively lock all the children of node.  This is used by the split
  * code to prevent anyone from accessing the children of a cursor node
@@ -2683,7 +2731,8 @@ hammer_node_lock_init(hammer_node_lock_t parent, hammer_node_t node)
  */
 int
 hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
-                           hammer_node_lock_t parent)
+                           hammer_node_lock_t parent,
+                           hammer_node_lock_t lcache)
 {
         hammer_node_t node;
         hammer_node_lock_t item;
@@ -2746,10 +2795,20 @@ hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
                                 error = EDEADLK;
                                 hammer_rel_node(child);
                         } else {
-                                item = kmalloc(sizeof(*item), hmp->m_misc,
-                                               M_WAITOK|M_ZERO);
+                                if (lcache) {
+                                        item = TAILQ_FIRST(&lcache->list);
+                                        KKASSERT(item != NULL);
+                                        item->flags |= HAMMER_NODE_LOCK_LCACHE;
+                                        TAILQ_REMOVE(&lcache->list,
+                                                     item, entry);
+                                } else {
+                                        item = kmalloc(sizeof(*item),
+                                                       hmp->m_misc,
+                                                       M_WAITOK|M_ZERO);
+                                        TAILQ_INIT(&item->list);
+                                }
+
                                 TAILQ_INSERT_TAIL(&parent->list, item, entry);
-                                TAILQ_INIT(&item->list);
                                 item->parent = parent;
                                 item->node = child;
                                 item->index = i;
@@ -2762,13 +2821,14 @@ hammer_btree_lock_children(hammer_cursor_t cursor, int depth,
                                         error = hammer_btree_lock_children(
                                                         cursor,
                                                         depth - 1,
-                                                        item);
+                                                        item,
+                                                        lcache);
                                 }
                         }
                 }
         }
         if (error)
-                hammer_btree_unlock_children(cursor, parent);
+                hammer_btree_unlock_children(hmp, parent, lcache);
         return(error);
 }

@@ -2783,10 +2843,12 @@ hammer_btree_lock_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
         hammer_node_lock_t item;

         if (parent->copy == NULL) {
-                parent->copy = kmalloc(sizeof(*parent->copy), hmp->m_misc,
-                                       M_WAITOK);
-                *parent->copy = *parent->node->ondisk;
+                KKASSERT((parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0);
+                parent->copy = kmalloc(sizeof(*parent->copy),
+                                       hmp->m_misc, M_WAITOK);
         }
+        KKASSERT((parent->flags & HAMMER_NODE_LOCK_UPDATED) == 0);
+        *parent->copy = *parent->node->ondisk;
         TAILQ_FOREACH(item, &parent->list, entry) {
                 hammer_btree_lock_copy(cursor, item);
         }
@@ -2821,22 +2883,45 @@ hammer_btree_sync_copy(hammer_cursor_t cursor, hammer_node_lock_t parent)
  * Release previously obtained node locks.  The caller is responsible for
  * cleaning up parent->node itself (its usually just aliased from a cursor),
  * but this function will take care of the copies.
+ *
+ * NOTE: The root node is not placed in the lcache and node->copy is not
+ *       deallocated when lcache != NULL.
  */
 void
-hammer_btree_unlock_children(hammer_cursor_t cursor, hammer_node_lock_t parent)
+hammer_btree_unlock_children(hammer_mount_t hmp, hammer_node_lock_t parent,
+                             hammer_node_lock_t lcache)
 {
         hammer_node_lock_t item;
+        hammer_node_ondisk_t copy;

-        if (parent->copy) {
-                kfree(parent->copy, cursor->trans->hmp->m_misc);
-                parent->copy = NULL;    /* safety */
-        }
         while ((item = TAILQ_FIRST(&parent->list)) != NULL) {
                 TAILQ_REMOVE(&parent->list, item, entry);
-                hammer_btree_unlock_children(cursor, item);
+                hammer_btree_unlock_children(hmp, item, lcache);
                 hammer_unlock(&item->node->lock);
                 hammer_rel_node(item->node);
-                kfree(item, cursor->trans->hmp->m_misc);
+                if (lcache) {
+                        /*
+                         * NOTE: When placing the item back in the lcache
+                         *       the flag is cleared by the bzero().
+                         *       Remaining fields are cleared as a safety
+                         *       measure.
+                         */
+                        KKASSERT(item->flags & HAMMER_NODE_LOCK_LCACHE);
+                        KKASSERT(TAILQ_EMPTY(&item->list));
+                        copy = item->copy;
+                        bzero(item, sizeof(*item));
+                        TAILQ_INIT(&item->list);
+                        item->copy = copy;
+                        if (copy)
+                                bzero(copy, sizeof(*copy));
+                        TAILQ_INSERT_TAIL(&lcache->list, item, entry);
+                } else {
+                        kfree(item, hmp->m_misc);
+                }
+        }
+        if (parent->copy && (parent->flags & HAMMER_NODE_LOCK_LCACHE) == 0) {
+                kfree(parent->copy, hmp->m_misc);
+                parent->copy = NULL;    /* safety */
         }
 }

diff --git a/sys/vfs/hammer/hammer_rebalance.c b/sys/vfs/hammer/hammer_rebalance.c
index 1b8de5c37e..5390aa2935 100644
--- a/sys/vfs/hammer/hammer_rebalance.c
+++ b/sys/vfs/hammer/hammer_rebalance.c
@@ -35,7 +35,7 @@
 #include "hammer.h"

 static int rebalance_node(struct hammer_ioc_rebalance *rebal,
-                        hammer_cursor_t cursor);
+                        hammer_cursor_t cursor, hammer_node_lock_t lcache);
 static void rebalance_closeout(hammer_node_lock_t base_item, int base_count,
                         hammer_btree_elm_t elm);
 static void rebalance_parent_ptrs(hammer_node_lock_t base_item, int index,
@@ -55,6 +55,7 @@ hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
                      struct hammer_ioc_rebalance *rebal)
 {
         struct hammer_cursor cursor;
+        struct hammer_node_lock lcache;
         hammer_btree_leaf_elm_t elm;
         int error;
         int seq;
@@ -79,6 +80,8 @@ hammer_ioc_rebalance(hammer_transaction_t trans, hammer_inode_t ip,
         rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
         rebal->key_cur.localization += ip->obj_localization;

+        hammer_btree_lcache_init(trans->hmp, &lcache, 2);
+
         seq = trans->hmp->flusher.act;

         /*
@@ -135,7 +138,7 @@ retry:
                          */
                         if (cursor.index == cursor.node->ondisk->count - 1) {
                                 hammer_sync_lock_sh(trans);
-                                error = rebalance_node(rebal, &cursor);
+                                error = rebalance_node(rebal, &cursor, &lcache);
                                 hammer_sync_unlock(trans);
                         }
                 } else {
@@ -208,6 +211,7 @@ retry:
         }
 failed:
         rebal->key_cur.localization &= HAMMER_LOCALIZE_MASK;
+        hammer_btree_lcache_free(trans->hmp, &lcache);
         return(error);
 }

@@ -236,7 +240,8 @@ failed:
  * XXX live-tracked
  */
 static int
-rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
+rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor,
+               struct hammer_node_lock *lcache)
 {
         struct hammer_node_lock lockroot;
         hammer_node_lock_t base_item;
@@ -264,7 +269,7 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
         error = hammer_cursor_upgrade(cursor);
         if (error)
                 goto done;
-        error = hammer_btree_lock_children(cursor, 2, &lockroot);
+        error = hammer_btree_lock_children(cursor, 2, &lockroot, lcache);
         if (error)
                 goto done;

@@ -480,7 +485,7 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor)
          */
         rebal->stat_nrebal += hammer_btree_sync_copy(cursor, &lockroot);
 done:
-        hammer_btree_unlock_children(cursor, &lockroot);
+        hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, lcache);
         hammer_cursor_downgrade(cursor);
         return (error);
 }
diff --git a/sys/vfs/hammer/hammer_reblock.c b/sys/vfs/hammer/hammer_reblock.c
index a27a1beae5..3fc2301e72 100644
--- a/sys/vfs/hammer/hammer_reblock.c
+++ b/sys/vfs/hammer/hammer_reblock.c
@@ -504,7 +504,7 @@ hammer_reblock_int_node(struct hammer_ioc_reblock *reblock,
         int i;

         hammer_node_lock_init(&lockroot, cursor->node);
-        error = hammer_btree_lock_children(cursor, 1, &lockroot);
+        error = hammer_btree_lock_children(cursor, 1, &lockroot, NULL);
         if (error)
                 goto done;

@@ -586,7 +586,7 @@ hammer_reblock_int_node(struct hammer_ioc_reblock *reblock,
         hammer_rel_node(onode);

 done:
-        hammer_btree_unlock_children(cursor, &lockroot);
+        hammer_btree_unlock_children(cursor->trans->hmp, &lockroot, NULL);
         return (error);
 }
-- 
2.41.0
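
As a rough illustration of the frontloading pattern the patch applies (preallocate every lock/copy structure before any B-Tree locks are taken, draw items from that pool inside the locked section, and return them afterwards), here is a minimal userspace sketch. It is not HAMMER code: the names lcache_init/lcache_get/lcache_put/lcache_free, the NODE_COPY_SIZE constant, and the use of malloc/free in place of kmalloc/kfree are illustrative assumptions; it relies only on the TAILQ macros from <sys/queue.h> and the C standard library.

/*
 * Minimal userspace sketch of the frontloading pattern, not HAMMER code:
 * allocate every lock/copy structure before entering the locked section,
 * hand items out from a free list while locks are held, and return them
 * afterwards.  All names here are illustrative.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NODE_COPY_SIZE  4096            /* stand-in for a B-Tree node copy */

struct lock_item {
        TAILQ_ENTRY(lock_item) entry;
        void *copy;                     /* preallocated node copy buffer */
};

TAILQ_HEAD(lock_cache, lock_item);

/* Allocate 'count' items up-front, before any locks are taken. */
static void
lcache_init(struct lock_cache *lcache, int count)
{
        struct lock_item *item;

        TAILQ_INIT(lcache);
        while (count--) {
                item = calloc(1, sizeof(*item));
                if (item == NULL ||
                    (item->copy = malloc(NODE_COPY_SIZE)) == NULL)
                        abort();        /* sketch only: no graceful recovery */
                TAILQ_INSERT_TAIL(lcache, item, entry);
        }
}

/* Pull a preallocated item instead of calling the allocator under a lock. */
static struct lock_item *
lcache_get(struct lock_cache *lcache)
{
        struct lock_item *item;

        item = TAILQ_FIRST(lcache);
        if (item != NULL)
                TAILQ_REMOVE(lcache, item, entry);
        return (item);
}

/* Return an item to the cache, wiping the copy buffer as a safety measure. */
static void
lcache_put(struct lock_cache *lcache, struct lock_item *item)
{
        memset(item->copy, 0, NODE_COPY_SIZE);
        TAILQ_INSERT_TAIL(lcache, item, entry);
}

/* Free everything once the whole operation is complete. */
static void
lcache_free(struct lock_cache *lcache)
{
        struct lock_item *item;

        while ((item = TAILQ_FIRST(lcache)) != NULL) {
                TAILQ_REMOVE(lcache, item, entry);
                free(item->copy);
                free(item);
        }
}

int
main(void)
{
        struct lock_cache lcache;
        struct lock_item *item;

        lcache_init(&lcache, 64);       /* frontloaded allocations */

        /* ...acquire locks here; no allocator calls past this point... */
        item = lcache_get(&lcache);
        if (item != NULL) {
                memcpy(item->copy, "node image", 11);
                lcache_put(&lcache, item);
        }
        /* ...release locks... */

        lcache_free(&lcache);
        printf("done\n");
        return (0);
}

The shape mirrors the intent of hammer_btree_lcache_init() above: every allocation that can sleep or dip into the memory reserve happens before the locked section, so the code that runs with B-Tree nodes locked only moves items between lists.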