From bbb01e143edc616b0a7ea418ab3a82c6abb1700c Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 2 Mar 2010 14:36:40 -0800 Subject: [PATCH] HAMMER VFS - Correct seriuos bug in hammer rebalancing code * Correct a serious bug in the hammer rebalancing code which can cause incremental mirrors to lose track of records, resulting in missing records on the slave (missing files, missing directory entries, or files which improperly contain data holes). The rebalancing code needs to update the mirror_tid chain when moving elements from one node to another. It updated the parent node but failed to update the internal element linkage pointing to the child. This can cause incremental mirroring operations to fail to copy some B-Tree records. * Correct a possible issue between the rebalancing code and B-Tree iterations during mirror-write operations. When the rebalancer moves an element from one child node to another and a cursor exists which is pointing at the parent internal node, the cursor will now be mispositioned because elements which were to the right of the cursor are now to the left of the cursor. Adjust hammer_cursor_moved_element() to properly move the cursor and print a debug message to the console so the code path can be tested. * These bugs are serious but also very rare. There is a good chance that your slaves are just fine, but we recommend validating their contents anyway. * PROCEDURE FOR FIXING BROKEN SLAVES: With a little help from the sysop these problems are self-correcting with the new kernel code in place. First make sure both the master and slave are running a kernel with the mirroring fix. Second, verify the slave matches the master (taking into account changes being made to live files). Probably the easiest way to do this is as follows (make sure you push into the softlink, not cpdup the softlink itself): cpdup -v -V /pfs/master/. /pfs/slave/. This should get you a list of missing files and directories. For each missing file or directory do a chmod, chown, or chflags on the master, then change it back. This will propagate the updated inode to the slave So for example: chflags nodump file chflags dump file If file data is corrupt on the mirroring slave you need to recopy the file on the master, which can be done with dd conv=notrunc: dd if=file of=file bs=32k conv=notrunc These actions will cause the master to re-mirror the related files and data to the slave. Reported-by: Stathis Kamperis --- sys/vfs/hammer/hammer.h | 5 ++-- sys/vfs/hammer/hammer_cursor.c | 52 +++++++++++++++++++++++++++++++++++---- sys/vfs/hammer/hammer_mirror.c | 12 +++++---- sys/vfs/hammer/hammer_rebalance.c | 52 +++++++++++++++++++++++++-------------- 4 files changed, 90 insertions(+), 31 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 44afd336fd..cc30895523 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -1077,8 +1077,9 @@ void hammer_cursor_removed_node(hammer_node_t onode, hammer_node_t parent, int index); void hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index); -void hammer_cursor_moved_element(hammer_node_t onode, hammer_node_t nnode, - int oindex, int nindex); +void hammer_cursor_moved_element(hammer_node_t oparent, int pindex, + hammer_node_t onode, int oindex, + hammer_node_t nnode, int nindex); void hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent, hammer_node_t nparent, int nindex); void hammer_cursor_inserted_element(hammer_node_t node, int index); diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c index ae546c4e78..614ab18371 100644 --- a/sys/vfs/hammer/hammer_cursor.c +++ b/sys/vfs/hammer/hammer_cursor.c @@ -771,22 +771,27 @@ again: * An element was moved from one node to another or within a node. The * index may also represent the end of the node (index == numelements). * + * {oparent,pindex} is the parent node's pointer to onode/oindex. + * * This is used by the rebalancing code. This is not an insertion or * deletion and any additional elements, including the degenerate case at * the end of the node, will be dealt with by additional distinct calls. */ void -hammer_cursor_moved_element(hammer_node_t onode, hammer_node_t nnode, - int oindex, int nindex) +hammer_cursor_moved_element(hammer_node_t oparent, int pindex, + hammer_node_t onode, int oindex, + hammer_node_t nnode, int nindex) { hammer_cursor_t cursor; hammer_node_ondisk_t ondisk; hammer_node_ondisk_t nndisk; + /* + * Adjust any cursors pointing at the element + */ ondisk = onode->ondisk; nndisk = nnode->ondisk; - -again: +again1: TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) { KKASSERT(cursor->node == onode); if (cursor->index != oindex) @@ -799,7 +804,44 @@ again: cursor->index = nindex; hammer_ref_node(nnode); hammer_rel_node(onode); - goto again; + goto again1; + } + + /* + * When moving the first element of onode to a different node any + * cursor which is pointing at (oparent,pindex) must be repointed + * to nnode and ATEDISK must be cleared. + * + * This prevents cursors from losing track due to insertions. + * Insertions temporarily release the cursor in order to update + * the mirror_tids. It primarily effects the mirror_write code. + * The other code paths generally only do a single insertion and + * then relookup or drop the cursor. + */ + if (onode == nnode || oindex) + return; + ondisk = oparent->ondisk; +again2: + TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) { + KKASSERT(cursor->node == oparent); + if (cursor->index != pindex) + continue; + kprintf("HAMMER debug: shifted cursor pointing at parent\n" + "parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n", + (intmax_t)oparent->node_offset, pindex, + (intmax_t)onode->node_offset, oindex, + (intmax_t)nnode->node_offset, nindex); + print_backtrace(); + TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry); + TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry); + if (cursor->leaf == &ondisk->elms[oindex].leaf) + cursor->leaf = &nndisk->elms[nindex].leaf; + cursor->node = nnode; + cursor->index = nindex; + cursor->flags &= ~HAMMER_CURSOR_ATEDISK; + hammer_ref_node(nnode); + hammer_rel_node(oparent); + goto again2; } } diff --git a/sys/vfs/hammer/hammer_mirror.c b/sys/vfs/hammer/hammer_mirror.c index b7f81d47be..a00c477e52 100644 --- a/sys/vfs/hammer/hammer_mirror.c +++ b/sys/vfs/hammer/hammer_mirror.c @@ -610,7 +610,7 @@ hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, /* * Certain records are not part of the mirroring operation */ - if (hammer_mirror_nomirror(&mrec->leaf.base)) + if (error == 0 && hammer_mirror_nomirror(&mrec->leaf.base)) return(0); /* @@ -628,10 +628,12 @@ hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, * * These functions can return EDEADLK */ - cursor->key_beg = mrec->leaf.base; - cursor->flags |= HAMMER_CURSOR_BACKEND; - cursor->flags &= ~HAMMER_CURSOR_INSERT; - error = hammer_btree_lookup(cursor); + if (error == 0) { + cursor->key_beg = mrec->leaf.base; + cursor->flags |= HAMMER_CURSOR_BACKEND; + cursor->flags &= ~HAMMER_CURSOR_INSERT; + error = hammer_btree_lookup(cursor); + } if (error == 0 && hammer_mirror_check(cursor, mrec)) { error = hammer_mirror_update(cursor, mrec); diff --git a/sys/vfs/hammer/hammer_rebalance.c b/sys/vfs/hammer/hammer_rebalance.c index c823415eaa..6250eb5f6a 100644 --- a/sys/vfs/hammer/hammer_rebalance.c +++ b/sys/vfs/hammer/hammer_rebalance.c @@ -238,6 +238,7 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor) hammer_node_lock_t item; hammer_btree_elm_t elm; hammer_node_t node; + hammer_tid_t tid; u_int8_t type1; int base_count; int root_count; @@ -319,6 +320,10 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor) * hammer_cursor_moved_element() is called for each element moved * to update tracked cursors, including the index beyond the last * element (at count). + * + * Any cursors tracking the internal node itself must also be + * updated, potentially repointing them at a leaf and clearing + * ATEDISK. */ base_item = TAILQ_FIRST(&lockroot.list); base_count = 0; @@ -368,23 +373,28 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor) base_item->flags |= HAMMER_NODE_LOCK_UPDATED; /* - * Adjust the mirror_tid of the target. The parent - * node (lockroot.node) should already have an - * aggregate mirror_tid so we do not have to update - * that. + * Adjust the mirror_tid of the target and the + * internal element linkage. * - * However, it is possible for us to catch a - * hammer_btree_mirror_propagate() with its pants - * down. Update the parent if necessary. + * The parent node (lockroot.node) should already + * have an aggregate mirror_tid so we do not have + * to update that. However, it is possible for us + * to catch a hammer_btree_mirror_propagate() with + * its pants down. Update the parent if necessary. */ - if (base_item->copy->mirror_tid < - node->ondisk->mirror_tid) { - base_item->copy->mirror_tid = - node->ondisk->mirror_tid; - if (lockroot.copy->mirror_tid < - node->ondisk->mirror_tid) { - lockroot.copy->mirror_tid = - node->ondisk->mirror_tid; + tid = node->ondisk->mirror_tid; + + if (base_item->copy->mirror_tid < tid) { + base_item->copy->mirror_tid = tid; + if (lockroot.copy->mirror_tid < tid) { + lockroot.copy->mirror_tid = tid; + lockroot.flags |= + HAMMER_NODE_LOCK_UPDATED; + } + if (lockroot.copy->elms[root_count]. + internal.mirror_tid < tid) { + lockroot.copy->elms[root_count]. + internal.mirror_tid = tid; lockroot.flags |= HAMMER_NODE_LOCK_UPDATED; } @@ -401,8 +411,11 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor) rebalance_parent_ptrs(base_item, base_count, item, chld_item); } - hammer_cursor_moved_element(node, base_item->node, - i, base_count); + hammer_cursor_moved_element(item->parent->node, + item->index, + node, i, + base_item->node, + base_count); ++base_count; if (chld_item) chld_item = TAILQ_NEXT(chld_item, entry); @@ -412,8 +425,9 @@ rebalance_node(struct hammer_ioc_rebalance *rebal, hammer_cursor_t cursor) * Always call at the end (i == number of elements) in * case a cursor is sitting indexed there. */ - hammer_cursor_moved_element(node, base_item->node, - i, base_count); + hammer_cursor_moved_element(item->parent->node, item->index, + node, i, + base_item->node, base_count); } /* -- 2.15.1