HAMMER - Fix lost inode issue (primarily with nohistory mounts)
author    Matthew Dillon <dillon@apollo.backplane.com>
          Thu, 3 Sep 2009 15:18:43 +0000 (08:18 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
          Thu, 3 Sep 2009 15:18:43 +0000 (08:18 -0700)
* When a HAMMER cursor is unlocked it becomes tracked, and unrelated
  B-Tree operations will cause the tracked cursor's nodes and indices
  to be updated.  The cursor structure also has a leaf element pointer,
  which was not being properly updated.  This could lead to panics and
  lost inodes.

  Properly adjust the leaf element pointer in tracked cursors.
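
  As a rough illustration, here is the deleted-element case condensed from
  the hammer_cursor_deleted_element() hunk in hammer_cursor.c below; the
  tracking callback now fixes up the cached leaf pointer together with the
  index whenever an element is removed out from under a tracked cursor:

      void
      hammer_cursor_deleted_element(hammer_node_t node, int index)
      {
              hammer_cursor_t cursor;
              hammer_node_ondisk_t ondisk = node->ondisk;

              TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
                      KKASSERT(cursor->node == node);
                      if (cursor->index == index) {
                              /* element ripped out from under the cursor */
                              cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
                              if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
                                      cursor->leaf = NULL;
                      } else if (cursor->index > index) {
                              /* elements above the deletion shift down one */
                              if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
                                      cursor->leaf = &ondisk->elms[cursor->index - 1].leaf;
                              --cursor->index;
                      }
              }
      }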

* The bug primarily occurs with nohistory mounts or nohistory sub-trees
  due to the larger number of physical deletions made to the B-Tree, but
  could also occur (rarely) with normal mounts.

* Add additional assertions to catch any further occurrences (though I
  think all the cases have been covered now).

* Add a new sysctl vfs.hammer.error_panic which can be set to e.g. 9 to
  cause critical errors to panic immediately instead of returning
  through the call stack, making debugging possible.
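
  For reference, the knob is consulted at the tail of hammer_critical_error()
  (sketched from the hammer_vfsops.c hunk below); setting it above 2, e.g.
  sysctl vfs.hammer.error_panic=9, stops execution at the point the error is
  detected instead of letting it unwind through the call stack:

      hmp->error = error;
      if (hammer_error_panic > 2)
              Debugger("Entering debugger");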

Reported-by: Numerous people
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_mirror.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_pfs.c
sys/vfs/hammer/hammer_prune.c
sys/vfs/hammer/hammer_rebalance.c
sys/vfs/hammer/hammer_reblock.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c
index 2c05e12..575a4fb 100644
@@ -1703,6 +1703,7 @@ btree_split_leaf(hammer_cursor_t cursor)
         */
        leaf = cursor->node;
        ondisk = leaf->ondisk;
+       KKASSERT(ondisk->count > 2);
        split = (ondisk->count + 1) / 2;
        if (cursor->index <= split)
                --split;
@@ -2296,6 +2297,11 @@ btree_remove(hammer_cursor_t cursor)
  * The passed inode has no relationship to the cursor position other
  * then being in the same pseudofs as the insertion or deletion we
  * are propagating the mirror_tid for.
+ *
+ * WARNING!  Because we push and pop the passed cursor, it may be
+ *          modified by other B-Tree operations while it is unlocked;
+ *          in particular its node, leaf pointer, and indexes may
+ *          change.
  */
 void
 hammer_btree_do_propagation(hammer_cursor_t cursor,
@@ -2333,6 +2339,7 @@ hammer_btree_do_propagation(hammer_cursor_t cursor,
        error = hammer_btree_mirror_propagate(ncursor, mirror_tid);
        KKASSERT(error == 0);
        hammer_pop_cursor(cursor, ncursor);
+       /* WARNING: cursor's leaf pointer may change after pop */
 }
 
 
diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c
index 027a323..a4d08be 100644
@@ -488,6 +488,15 @@ hammer_cursor_down(hammer_cursor_t cursor)
  * used for the mirror propagation and physical node removal cases but
  * ultimately the intention is to use them for all deadlock recovery
  * operations.
+ *
+ * WARNING!  The contents of the cursor may be modified while it is
+ *          unlocked.  Passive modifications include adjustments to
+ *          the node, parent, indexes, and leaf pointer.
+ *
+ *          An outright removal of the element the cursor was pointing
+ *          at will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be
+ *          set, which in turn causes HAMMER_CURSOR_RETEST to be set
+ *          when the cursor is locked again.
  */
 void
 hammer_unlock_cursor(hammer_cursor_t cursor)
@@ -671,11 +680,18 @@ void
 hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode)
 {
        hammer_cursor_t cursor;
+       hammer_node_ondisk_t ondisk;
+       hammer_node_ondisk_t nndisk;
+
+       ondisk = onode->ondisk;
+       nndisk = nnode->ondisk;
 
        while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) {
                TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
                TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
                KKASSERT(cursor->node == onode);
+               if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
+                       cursor->leaf = &nndisk->elms[cursor->index].leaf;
                cursor->node = nnode;
                hammer_ref_node(nnode);
                hammer_rel_node(onode);
@@ -690,13 +706,18 @@ void
 hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index)
 {
        hammer_cursor_t cursor;
+       hammer_node_ondisk_t ondisk;
 
        KKASSERT(parent != NULL);
+       ondisk = node->ondisk;
+
        while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) {
                KKASSERT(cursor->node == node);
                KKASSERT(cursor->index == 0);
                TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
                TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry);
+               if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
+                       cursor->leaf = NULL;
                cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
                cursor->node = parent;
                cursor->index = index;
@@ -712,6 +733,11 @@ void
 hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index)
 {
        hammer_cursor_t cursor;
+       hammer_node_ondisk_t ondisk;
+       hammer_node_ondisk_t nndisk;
+
+       ondisk = onode->ondisk;
+       nndisk = nnode->ondisk;
 
 again:
        TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
@@ -720,6 +746,8 @@ again:
                        continue;
                TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
                TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
+               if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
+                       cursor->leaf = &nndisk->elms[cursor->index - index].leaf;
                cursor->node = nnode;
                cursor->index -= index;
                hammer_ref_node(nnode);
@@ -741,6 +769,11 @@ hammer_cursor_moved_element(hammer_node_t onode, hammer_node_t nnode,
                            int oindex, int nindex)
 {
        hammer_cursor_t cursor;
+       hammer_node_ondisk_t ondisk;
+       hammer_node_ondisk_t nndisk;
+
+       ondisk = onode->ondisk;
+       nndisk = nnode->ondisk;
 
 again:
        TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
@@ -749,6 +782,8 @@ again:
                        continue;
                TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
                TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
+               if (cursor->leaf == &ondisk->elms[oindex].leaf)
+                       cursor->leaf = &nndisk->elms[nindex].leaf;
                cursor->node = nnode;
                cursor->index = nindex;
                hammer_ref_node(nnode);
@@ -793,12 +828,19 @@ void
 hammer_cursor_deleted_element(hammer_node_t node, int index)
 {
        hammer_cursor_t cursor;
+       hammer_node_ondisk_t ondisk;
+
+       ondisk = node->ondisk;
 
        TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
                KKASSERT(cursor->node == node);
                if (cursor->index == index) {
                        cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
+                       if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
+                               cursor->leaf = NULL;
                } else if (cursor->index > index) {
+                       if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
+                               cursor->leaf = &ondisk->elms[cursor->index - 1].leaf;
                        --cursor->index;
                }
        }
@@ -813,11 +855,17 @@ void
 hammer_cursor_inserted_element(hammer_node_t node, int index)
 {
        hammer_cursor_t cursor;
+       hammer_node_ondisk_t ondisk;
+
+       ondisk = node->ondisk;
 
        TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
                KKASSERT(cursor->node == node);
-               if (cursor->index >= index)
+               if (cursor->index >= index) {
+                       if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
+                               cursor->leaf = &ondisk->elms[cursor->index + 1].leaf;
                        ++cursor->index;
+               }
        }
 }
 
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index f1ebb7f..c5794d2 100644
@@ -2583,6 +2583,8 @@ done:
         *
         * We must release our cursor lock to avoid a 3-way deadlock
         * due to the exclusive sync lock the finalizer must get.
+        *
+        * WARNING: See warnings in hammer_unlock_cursor() function.
         */
         if (hammer_flusher_meta_limit(hmp)) {
                hammer_unlock_cursor(cursor);
diff --git a/sys/vfs/hammer/hammer_mirror.c b/sys/vfs/hammer/hammer_mirror.c
index 78f8027..06422db 100644
@@ -382,6 +382,8 @@ hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
                /*
                 * Don't blow out the buffer cache.  Leave room for frontend
                 * cache as well.
+                *
+                * WARNING: See warnings in hammer_unlock_cursor() function.
                 */
                while (hammer_flusher_meta_halflimit(trans->hmp) ||
                       hammer_flusher_undo_exhausted(trans, 2)) {
@@ -894,6 +896,10 @@ hammer_mirror_write(hammer_cursor_t cursor,
                hammer_modify_volume_done(trans->rootvol);
        }
 
+       /*
+        * WARNING!  cursor's leaf pointer may have changed after
+        *           do_propagation returns.
+        */
        if (error == 0 && doprop)
                hammer_btree_do_propagation(cursor, NULL, &mrec->leaf);
 
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c
index b54b6bb..530713b 100644
@@ -97,6 +97,8 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
 
 /*
  * Basic record comparison code similar to hammer_btree_cmp().
+ *
+ * obj_id is not compared and may not yet be assigned in the record.
  */
 static int
 hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
@@ -785,13 +787,15 @@ hammer_ip_del_directory(struct hammer_transaction *trans,
                /*
                 * If the record is on-disk we have to queue the deletion by
                 * the record's key.  This also causes lookups to skip the
-                * record.
+                * record (lookups for the purposes of finding an unused
+                * directory key do not skip the record).
                 */
                KKASSERT(dip->flags &
                         (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK));
                record = hammer_alloc_mem_record(dip, 0);
                record->type = HAMMER_MEM_RECORD_DEL;
                record->leaf.base = cursor->leaf->base;
+               KKASSERT(dip->obj_id == record->leaf.base.obj_id);
 
                /*
                 * ip may be NULL, indicating the deletion of a directory
@@ -1267,6 +1271,9 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
         * sync a valid directory entry to disk due to dependancies,
         * we must convert the record to a covering delete so the
         * frontend does not have visibility on the synced entry.
+        *
+        * WARNING: cursor's leaf pointer may have changed after do_propagation
+        *          returns!
         */
        if (error == 0) {
                if (doprop) {
@@ -1282,6 +1289,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
                        KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                        record->flags &= ~HAMMER_RECF_DELETED_FE;
                        record->type = HAMMER_MEM_RECORD_DEL;
+                       KKASSERT(record->ip->obj_id == record->leaf.base.obj_id);
                        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
                        record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
                        KKASSERT((record->flags & (HAMMER_RECF_COMMITTED |
@@ -2368,6 +2376,9 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
         * cursor->ip is NULL when called from the pruning, mirroring,
         * and pfs code.  If non-NULL propagation will be conditionalized
         * on whether the PFS is in no-history mode or not.
+        *
+        * WARNING: cursor's leaf pointer may have changed after do_propagation
+        *          returns!
         */
        if (doprop) {
                if (cursor->ip)
diff --git a/sys/vfs/hammer/hammer_pfs.c b/sys/vfs/hammer/hammer_pfs.c
index 49e8465..c8ef139 100644
@@ -402,6 +402,8 @@ retry:
                 * We only care about leafs.  Internal nodes can be returned
                 * in mirror-filtered mode (they are used to generate SKIP
                 * mrecords), but we don't need them for this code.
+                *
+                * WARNING: See warnings in hammer_unlock_cursor() function.
                 */
                cursor.flags |= HAMMER_CURSOR_ATEDISK;
                if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF) {
diff --git a/sys/vfs/hammer/hammer_prune.c b/sys/vfs/hammer/hammer_prune.c
index d266a85..c0dca40 100644
@@ -216,6 +216,9 @@ retry:
                }
                ++prune->stat_scanrecords;
 
+               /*
+                * WARNING: See warnings in hammer_unlock_cursor() function.
+                */
                while (hammer_flusher_meta_halflimit(trans->hmp) ||
                       hammer_flusher_undo_exhausted(trans, 2)) {
                        hammer_unlock_cursor(&cursor);
diff --git a/sys/vfs/hammer/hammer_rebalance.c b/sys/vfs/hammer/hammer_rebalance.c
index 43fe789..012254e 100644
@@ -146,6 +146,9 @@ retry:
                /*
                 * Update returned scan position and do a flush if
                 * necessary.
+                *
+                * WARNING: See warnings in hammer_unlock_cursor()
+                *          function.
                 */
                elm = &cursor.node->ondisk->elms[cursor.index].leaf;
                rebal->key_cur = elm->base;
diff --git a/sys/vfs/hammer/hammer_reblock.c b/sys/vfs/hammer/hammer_reblock.c
index 3e4eed4..23cdba8 100644
@@ -143,6 +143,8 @@ retry:
                /*
                 * If there is insufficient free space it may be due to
                 * reserved bigblocks, which flushing might fix.
+                *
+                * WARNING: See warnings in hammer_unlock_cursor() function.
                 */
                if (hammer_checkspace(trans->hmp, slop)) {
                        if (++checkspace_count == 10) {
@@ -161,6 +163,8 @@ retry:
                 * crossing a synchronization boundary.
                 *
                 * NOTE: cursor.node may have changed on return.
+                *
+                * WARNING: See warnings in hammer_unlock_cursor() function.
                 */
                hammer_sync_lock_sh(trans);
                error = hammer_reblock_helper(reblock, &cursor, elm);
@@ -186,7 +190,8 @@ retry:
                 * them.  But if we allocate too many we can still deadlock
                 * the buffer cache.
                 *
-                * (The cursor's node and element may change!)
+                * WARNING: See warnings in hammer_unlock_cursor() function.
+                *          (The cursor's node and element may change!)
                 */
                if (bd_heatup()) {
                        hammer_unlock_cursor(&cursor);
@@ -287,6 +292,9 @@ hammer_reblock_helper(struct hammer_ioc_reblock *reblock,
                         * This is nasty, the uncache code may have to get
                         * vnode locks and because of that we can't hold
                         * the cursor locked.
+                        *
+                        * WARNING: See warnings in hammer_unlock_cursor()
+                        *          function.
                         */
                        leaf = elm->leaf;
                        hammer_unlock_cursor(cursor);
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index d8e609f..be8d642 100644
@@ -57,6 +57,7 @@ int hammer_debug_btree;
 int hammer_debug_tid;
 int hammer_debug_recover;              /* -1 will disable, +1 will force */
 int hammer_debug_recover_faults;
+int hammer_error_panic;                        /* panic on error levels */
 int hammer_cluster_enable = 1;         /* enable read clustering by default */
 int hammer_count_fsyncs;
 int hammer_count_inodes;
@@ -127,6 +128,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_recover, CTLFLAG_RW,
           &hammer_debug_recover, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_recover_faults, CTLFLAG_RW,
           &hammer_debug_recover_faults, 0, "");
+SYSCTL_INT(_vfs_hammer, OID_AUTO, error_panic, CTLFLAG_RW,
+          &hammer_error_panic, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer_cluster_enable, 0, "");
 
@@ -775,10 +778,13 @@ hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip,
                      int error, const char *msg)
 {
        hmp->flags |= HAMMER_MOUNT_CRITICAL_ERROR;
+
        krateprintf(&hmp->krate,
-               "HAMMER(%s): Critical error inode=%lld %s\n",
-               hmp->mp->mnt_stat.f_mntfromname,
-               (long long)(ip ? ip->obj_id : -1), msg);
+                   "HAMMER(%s): Critical error inode=%jd error=%d %s\n",
+                   hmp->mp->mnt_stat.f_mntfromname,
+                   (intmax_t)(ip ? ip->obj_id : -1),
+                   error, msg);
+
        if (hmp->ronly == 0) {
                hmp->ronly = 2;         /* special errored read-only mode */
                hmp->mp->mnt_flag |= MNT_RDONLY;
@@ -786,6 +792,8 @@ hammer_critical_error(hammer_mount_t hmp, hammer_inode_t ip,
                        hmp->mp->mnt_stat.f_mntfromname);
        }
        hmp->error = error;
+       if (hammer_error_panic > 2)
+               Debugger("Entering debugger");
 }
 
 
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index a125ba0..3e4625b 100644
@@ -3101,6 +3101,9 @@ retry:
                 *
                 * If any changes whatsoever have been made to the cursor
                 * set EDEADLK and retry.
+                *
+                * WARNING: See warnings in hammer_unlock_cursor()
+                *          function.
                 */
                if (error == 0 && ip && ip->ino_data.obj_type ==
                                        HAMMER_OBJTYPE_DIRECTORY) {