From 6a37e7e4b492073cdb3c293d5a142462d05342eb Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 18 Jan 2008 07:02:41 +0000 Subject: [PATCH] HAMMER 21/many: B-Tree node locking finalization. * Implement the final locking scheme for B-Tree nodes. Use shared locks for all searches, upgrade to exclusive locks for modifications. If unable to upgrade fall through with an EDEADLK error code and retry the operation after releasing all other locks and blocking on the lock that could not be obtained. Simple iterations never fail and do not need to handle an EDEADLK error code. Because EDEADLK can actually occur quite often the error paths for most code modules will begin to get some exercise, which is good for code stability. It is possible to cache cursor positions closer to the desired target to reduce re-lookup times but I don't try to do this yet. * Finalize code for basic (unbalanced) deletions. Neither leaf nor internal nodes are allowed to be empty any more (except at the root of a cluster), but recursive deletions may deadlock while going up the tree and leave an internal node with a zero'd out element. The search and iteration code now properly detects such elements and finishes off the deletion, though a complete cleaning will be left up to the balancing module. * Remove most instances of recursively instantiated cursors. There is still one left in the directory deletion code. * Remove all remaining unprotected cursor transitions (where locks had to be released to avoid deadlocks). 
--- sys/vfs/hammer/hammer.h | 21 ++- sys/vfs/hammer/hammer_btree.c | 273 ++++++++++++++++++++------------ sys/vfs/hammer/hammer_cursor.c | 119 +++++++++----- sys/vfs/hammer/hammer_cursor.h | 15 +- sys/vfs/hammer/hammer_inode.c | 17 +- sys/vfs/hammer/hammer_object.c | 81 +++++++--- sys/vfs/hammer/hammer_ondisk.c | 7 +- sys/vfs/hammer/hammer_recover.c | 5 +- sys/vfs/hammer/hammer_spike.c | 18 ++- sys/vfs/hammer/hammer_subs.c | 35 +++- sys/vfs/hammer/hammer_vnops.c | 43 +++-- 11 files changed, 432 insertions(+), 202 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 8f24a51bfb..0a380da80b 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.25 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.26 2008/01/18 07:02:41 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -111,6 +111,18 @@ hammer_islastref(struct hammer_lock *lock) return(lock->refs == 1); } +/* + * This inline is specifically optimized for the case where the caller + * owns the lock, but wants to know what kind of lock he owns. A + * negative value indicates a shared lock, a positive value indicates + * an exclusive lock. + */ +static __inline int +hammer_lock_held(struct hammer_lock *lock) +{ + return(lock->lockcount); +} + /* * Structure used to represent an inode in-memory. 
* @@ -485,17 +497,20 @@ hammer_record_t hammer_alloc_mem_record(hammer_inode_t ip); void hammer_rel_mem_record(hammer_record_t record); -int hammer_cursor_up(hammer_cursor_t cursor, int nonblock); +int hammer_cursor_up(hammer_cursor_t cursor); int hammer_cursor_toroot(hammer_cursor_t cursor); int hammer_cursor_down(hammer_cursor_t cursor); +int hammer_cursor_upgrade(hammer_cursor_t cursor); +void hammer_cursor_downgrade(hammer_cursor_t cursor); void hammer_lock_ex(struct hammer_lock *lock); int hammer_lock_ex_try(struct hammer_lock *lock); void hammer_lock_sh(struct hammer_lock *lock); +int hammer_lock_upgrade(struct hammer_lock *lock); +void hammer_lock_downgrade(struct hammer_lock *lock); void hammer_unlock(struct hammer_lock *lock); void hammer_ref(struct hammer_lock *lock); void hammer_unref(struct hammer_lock *lock); -void hammer_downgrade(struct hammer_lock *lock); u_int32_t hammer_to_unix_xid(uuid_t *uuid); void hammer_guid_to_uuid(uuid_t *uuid, u_int32_t guid); diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index d4695a850c..1378025866 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.20 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.21 2008/01/18 07:02:41 dillon Exp $ */ /* @@ -90,7 +90,8 @@ static int btree_search(hammer_cursor_t cursor, int flags); static int btree_split_internal(hammer_cursor_t cursor); static int btree_split_leaf(hammer_cursor_t cursor); -static int btree_remove(hammer_cursor_t cursor); +static int btree_remove(hammer_cursor_t cursor, int depth); +static int btree_remove_deleted_element(hammer_cursor_t cursor); static int btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm); #if 0 static int btree_rebalance(hammer_cursor_t cursor); @@ -105,7 +106,7 @@ static void hammer_make_separator(hammer_base_elm_t key1, * Iterate records after a search. The cursor is iterated forwards past * the current record until a record matching the key-range requirements * is found. ENOENT is returned if the iteration goes past the ending - * key. + * key. * * The iteration is inclusive of key_beg and can be inclusive or exclusive * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set. @@ -117,6 +118,8 @@ static void hammer_make_separator(hammer_base_elm_t key1, * the iteration. XXX future - in case of an inverted lock we may have * to reinitiate the lookup and set key_beg to properly pick up where we * left off. + * + * NOTE! EDEADLK *CANNOT* be returned by this procedure. */ int hammer_btree_iterate(hammer_cursor_t cursor) @@ -161,7 +164,7 @@ hammer_btree_iterate(hammer_cursor_t cursor) * up our scan. 
*/ if (cursor->index == node->count) { - error = hammer_cursor_up(cursor, 0); + error = hammer_cursor_up(cursor); if (error) break; node = cursor->node->ondisk; @@ -209,12 +212,21 @@ hammer_btree_iterate(hammer_cursor_t cursor) break; } KKASSERT(s <= 0); - KKASSERT(elm->internal.subtree_offset != 0); - error = hammer_cursor_down(cursor); - if (error) - break; - KKASSERT(cursor->index == 0); - node = cursor->node->ondisk; + + /* + * When iterating try to clean up any deleted + * internal elements left over from btree_remove() + * deadlocks, but it is ok if we can't. + */ + if (elm->internal.subtree_offset == 0) + btree_remove_deleted_element(cursor); + if (elm->internal.subtree_offset != 0) { + error = hammer_cursor_down(cursor); + if (error) + break; + KKASSERT(cursor->index == 0); + node = cursor->node->ondisk; + } continue; } else { elm = &node->elms[cursor->index]; @@ -291,7 +303,9 @@ hammer_btree_iterate(hammer_cursor_t cursor) /* * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry - * could not be found, and a fatal error otherwise. + * could not be found, EDEADLK if inserting and a retry is needed, and a + * fatal error otherwise. When retrying, the caller must terminate the + * cursor and reinitialize it. * * The cursor is suitably positioned for a deletion on success, and suitably * positioned for an insertion on ENOENT. @@ -455,9 +469,12 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags) int hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm) { - hammer_node_ondisk_t parent; hammer_node_ondisk_t node; int i; + int error; + + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); /* * Insert the element at the leaf node and update the count in the @@ -488,15 +505,6 @@ hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm) if (i != node->count - 1) KKASSERT(hammer_btree_cmp(&node->elms[i+1].leaf.base, &elm->leaf.base) > 0); - /* - * Adjust the sub-tree count in the parent. 
note that the parent - * may be in a different cluster. - */ - if (cursor->parent) { - hammer_modify_node(cursor->parent); - parent = cursor->parent->ondisk; - i = cursor->parent_index; - } return(0); } @@ -528,10 +536,11 @@ hammer_btree_insert_cluster(hammer_cursor_t cursor, hammer_cluster_t ncluster, hammer_btree_elm_t elm; const int esize = sizeof(*elm); u_int8_t save; - int error = 0; + int error; int pi, i; - kprintf("cursor %p ncluster %p\n", cursor, ncluster); + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); hammer_modify_node(cursor->node); node = cursor->node->ondisk; i = cursor->index; @@ -580,6 +589,8 @@ hammer_btree_insert_cluster(hammer_cursor_t cursor, hammer_cluster_t ncluster, kprintf("no parent\n"); } else { kprintf("has parent\n"); + if (error) + return(error); } @@ -741,6 +752,9 @@ hammer_btree_delete(hammer_cursor_t cursor) int error; int i; + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); + /* * Delete the element from the leaf node. * @@ -776,12 +790,18 @@ hammer_btree_delete(hammer_cursor_t cursor) * * This may reposition the cursor at one of the parent's of the * current node. + * + * Ignore deadlock errors, that simply means that btree_remove + * was unable to recurse and had to leave the subtree_offset + * in the parent set to 0. 
*/ KKASSERT(cursor->index <= ondisk->count); if (ondisk->count == 0) { do { - error = btree_remove(cursor); + error = btree_remove(cursor, 0); } while (error == EAGAIN); + if (error == EDEADLK) + error = 0; } else { error = 0; } @@ -864,7 +884,7 @@ btree_search(hammer_cursor_t cursor, int flags) if (error) goto done; KKASSERT(cursor->parent); - error = hammer_cursor_up(cursor, 0); + error = hammer_cursor_up(cursor); if (error) goto done; cluster = cursor->node->cluster; @@ -877,7 +897,7 @@ btree_search(hammer_cursor_t cursor, int flags) while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 || hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) { KKASSERT(cursor->parent); - error = hammer_cursor_up(cursor, 0); + error = hammer_cursor_up(cursor); if (error) goto done; } @@ -909,41 +929,13 @@ btree_search(hammer_cursor_t cursor, int flags) break; if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) break; - error = hammer_cursor_up(cursor, 0); + error = hammer_cursor_up(cursor); /* cluster and node are now may become stale */ if (error) goto done; } /* cluster = cursor->node->cluster; not needed until next cluster = */ -#if 0 - /* - * If we are deleting we can't start at an internal node with only - * one element unless it is root, because all of our code assumes - * that internal nodes will never be empty. Just do this generally - * for both leaf and internal nodes to get better balance. - * - * This handles the case where the cursor is sitting at a leaf and - * either the leaf or parent contain an insufficient number of - * elements. - * - * NOTE: These cursor-up's CAN continue to cross cluster boundaries. - * - * XXX NOTE: Iterations may not set this flag anyway. 
- */ - while (flags & HAMMER_CURSOR_DELETE) { - if (cursor->node->ondisk->count > 1) - break; - if (cursor->parent == NULL) - break; - KKASSERT(cursor->node->ondisk->count != 0); - error = hammer_cursor_up(cursor, 0); - /* cluster and node are now may become stale */ - if (error) - goto done; - } -#endif - new_cluster: /* * Push down through internal nodes to locate the requested key. @@ -951,23 +943,6 @@ new_cluster: cluster = cursor->node->cluster; node = cursor->node->ondisk; while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { -#if 0 - /* - * If we are a the root node and deleting, try to collapse - * all of the root's children into the root. This is the - * only point where tree depth is reduced. - * - * XXX NOTE: Iterations may not set this flag anyway. - */ - if ((flags & HAMMER_CURSOR_DELETE) && cursor->parent == NULL) { - error = btree_collapse(cursor); - /* node becomes stale after call */ - /* XXX ENOSPC */ - if (error) - goto done; - } - node = cursor->node->ondisk; -#endif /* * Scan the node to find the subtree index to push down into. * We go one-past, then back-up. @@ -984,16 +959,6 @@ new_cluster: */ for (i = 0; i <= node->count; ++i) { elm = &node->elms[i]; - - KKASSERT(i == node->count || - elm->internal.subtree_offset != 0); -#if 0 - if (i < node->count && - elm->internal.subtree_offset == 0){ - btree_remove_deleted_elements(cursor); - goto new_cluster; - } -#endif r = hammer_btree_cmp(&cursor->key_beg, &elm->base); if (r < 0) break; @@ -1022,6 +987,10 @@ new_cluster: /* * Correct a left-hand boundary mismatch. + * + * This is done without an exclusive lock XXX. We + * have to do this or the search will not terminate + * at a leaf. */ hammer_modify_node(cursor->node); save = node->elms[0].base.btype; @@ -1046,6 +1015,10 @@ new_cluster: * Correct a right-hand boundary mismatch. * (actual push-down record is i-2 prior to * adjustments to i). + * + * This is done without an exclusive lock XXX. 
We + * have to do this or the search will not terminate + * at a leaf. */ elm = &node->elms[i]; hammer_modify_node(cursor->node); @@ -1060,9 +1033,9 @@ new_cluster: --i; } cursor->index = i; + elm = &node->elms[i]; if (hammer_debug_btree) { - elm = &node->elms[i]; kprintf("SEARCH-I %p:%d %016llx %02x key=%016llx did=%016llx\n", cursor->node, i, elm->internal.base.obj_id, @@ -1072,6 +1045,25 @@ new_cluster: ); } + /* + * When searching try to clean up any deleted + * internal elements left over from btree_remove() + * deadlocks. + * + * If we fail and we are doing an insertion lookup, + * we have to return EDEADLK, because an insertion lookup + * must terminate at a leaf. + */ + if (elm->internal.subtree_offset == 0) { + error = btree_remove_deleted_element(cursor); + if (error == 0) + goto new_cluster; + if (flags & HAMMER_CURSOR_INSERT) + return(EDEADLK); + return(ENOENT); + } + + /* * Handle insertion and deletion requirements. * @@ -1349,6 +1341,9 @@ btree_split_internal(hammer_cursor_t cursor) int i; const int esize = sizeof(*elm); + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); + /* * We are splitting but elms[split] will be promoted to the parent, * leaving the right hand node with one less element. 
If the @@ -1360,7 +1355,6 @@ btree_split_internal(hammer_cursor_t cursor) split = (ondisk->count + 1) / 2; if (cursor->index <= split) --split; - error = 0; /* * If we are at the root of the cluster, create a new root node with @@ -1374,7 +1368,7 @@ btree_split_internal(hammer_cursor_t cursor) if (ondisk->parent == 0) { parent = hammer_alloc_btree(node->cluster, &error); if (parent == NULL) - return(error); + goto done; hammer_lock_ex(&parent->lock); hammer_modify_node(parent); ondisk = parent->ondisk; @@ -1415,7 +1409,7 @@ btree_split_internal(hammer_cursor_t cursor) parent->flags |= HAMMER_NODE_DELETED; hammer_rel_node(parent); } - return(error); + goto done; } hammer_lock_ex(&new_node->lock); @@ -1524,11 +1518,15 @@ btree_split_internal(hammer_cursor_t cursor) KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->node->ondisk->elms[cursor->node->ondisk->count].internal.base) >= 0); - return (0); +done: + hammer_cursor_downgrade(cursor); + return (error); } /* * Same as the above, but splits a full leaf node. + * + * This function */ static int @@ -1548,6 +1546,9 @@ btree_split_leaf(hammer_cursor_t cursor) int i; const size_t esize = sizeof(*elm); + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); + /* * Calculate the split point. 
If the insertion point will be on * the left-hand side adjust the split point to give the right @@ -1578,7 +1579,7 @@ btree_split_leaf(hammer_cursor_t cursor) if (ondisk->parent == 0) { parent = hammer_alloc_btree(leaf->cluster, &error); if (parent == NULL) - return(error); + goto done; hammer_lock_ex(&parent->lock); hammer_modify_node(parent); ondisk = parent->ondisk; @@ -1618,7 +1619,7 @@ btree_split_leaf(hammer_cursor_t cursor) parent->flags |= HAMMER_NODE_DELETED; hammer_rel_node(parent); } - return(error); + goto done; } hammer_lock_ex(&new_leaf->lock); @@ -1727,7 +1728,9 @@ btree_split_leaf(hammer_cursor_t cursor) KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->node->ondisk->elms[cursor->node->ondisk->count-1].leaf.base) > 0); - return (0); +done: + hammer_cursor_downgrade(cursor); + return (error); } /* @@ -1744,7 +1747,7 @@ btree_split_leaf(hammer_cursor_t cursor) * that element, make sure cursor->index is properly adjusted on success. */ int -btree_remove(hammer_cursor_t cursor) +btree_remove(hammer_cursor_t cursor, int depth) { hammer_node_ondisk_t ondisk; hammer_btree_elm_t elm; @@ -1769,21 +1772,40 @@ btree_remove(hammer_cursor_t cursor) cursor->index = 0; error = 0; + if (depth > 16) { + Debugger("btree_remove: stack limit reached"); + return(EDEADLK); + } + /* - * When trying to delete a cluster retain a lock on the - * cluster's root node (node) to prevent insertions while - * we try to undo the spike. + * When trying to delete a cluster we need to exclusively + * lock the cluster root, its parent (leaf in parent cluster), + * AND the parent of that leaf if it's going to be empty, + * because we can't leave around an empty leaf. + * + * XXX this is messy due to potentially recursive locks. + * downgrade the cursor, get a second shared lock on the + * node that cannot deadlock because we only own shared locks + * then, cursor-up, and re-upgrade everything. If the + * upgrades EDEADLK then don't try to remove the cluster + * at this time. 
*/ if ((parent = cursor->parent) != NULL) { + hammer_cursor_downgrade(cursor); save = node; hammer_ref_node(save); - hammer_lock_ex(&save->lock); - error = hammer_cursor_up(cursor, 1); + hammer_lock_sh(&save->lock); + + error = hammer_cursor_up(cursor); + if (error == 0) + error = hammer_cursor_upgrade(cursor); + if (error == 0) + error = hammer_lock_upgrade(&save->lock); + if (error) { + /* may be EDEADLK */ kprintf("BTREE_REMOVE: Cannot delete cluster\n"); Debugger("BTREE_REMOVE"); - if (error == EAGAIN) - error = 0; } else { /* * cursor->node is now the leaf in the parent @@ -1809,7 +1831,7 @@ btree_remove(hammer_cursor_t cursor) cursor->index - 2) * esize); ondisk->count -= 2; if (ondisk->count == 0) - error = btree_remove(cursor); + error = btree_remove(cursor, depth + 1); hammer_flush_node(save); save->flags |= HAMMER_NODE_DELETED; } @@ -1835,6 +1857,14 @@ btree_remove(hammer_cursor_t cursor) hammer_flush_node(node); node->flags |= HAMMER_NODE_DELETED; + /* + * Don't blow up the kernel stack. + */ + if (depth > 20) { + kprintf("btree_remove: stack limit reached"); + return(EDEADLK); + } + /* * If the parent would otherwise not become empty we can physically * remove the zero'd element. Note however that in order to @@ -1848,7 +1878,10 @@ btree_remove(hammer_cursor_t cursor) * XXX we can theoretically recalculate the midpoint but there isn't * much of a reason to do it. */ - error = hammer_cursor_up(cursor, 1); + error = hammer_cursor_up(cursor); + if (error == 0) + error = hammer_cursor_upgrade(cursor); + if (error) { kprintf("BTREE_REMOVE: Cannot lock parent, skipping\n"); Debugger("BTREE_REMOVE"); @@ -1865,6 +1898,11 @@ btree_remove(hammer_cursor_t cursor) /* ondisk is node's ondisk */ /* elm is node's element */ + /* + * Remove the internal element that we zero'd out. Tell the caller + * to loop if it hits zero (to try to avoid eating up precious kernel + * stack). 
+ */ KKASSERT(ondisk->count > 0); bcopy(&elm[1], &elm[0], (ondisk->count - cursor->index) * esize); --ondisk->count; @@ -1873,6 +1911,39 @@ btree_remove(hammer_cursor_t cursor) return(error); } +/* + * Attempt to remove the deleted internal element at the current cursor + * position. If we are unable to remove the element we return EDEADLK. + * + * If the current internal node becomes empty we delete it in the parent + * and cursor up, looping until we finish or we deadlock. + * + * On return, if successful, the cursor will be pointing at the next + * iterative position in the B-Tree. If unsuccessful the cursor will be + * pointing at the last deleted internal element that could not be + * removed. + */ +static +int +btree_remove_deleted_element(hammer_cursor_t cursor) +{ + hammer_node_t node; + hammer_btree_elm_t elm; + int error; + + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); + node = cursor->node; + elm = &node->ondisk->elms[cursor->index]; + if (elm->internal.subtree_offset == 0) { + do { + error = btree_remove(cursor, 0); + kprintf("BTREE REMOVE DELETED ELEMENT %d\n", error); + } while (error == EAGAIN); + } + return(error); +} + /* * The element (elm) has been moved to a new internal node (node). * @@ -1882,6 +1953,8 @@ btree_remove(hammer_cursor_t cursor) * If the element represents a spike the target cluster's header must * be adjusted to point to the element's new location. This only * applies to HAMMER_SPIKE_END. + * + * XXX deadlock potential here with our exclusive locks */ static int diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c index f076c0f0e1..b8233ad132 100644 --- a/sys/vfs/hammer/hammer_cursor.c +++ b/sys/vfs/hammer/hammer_cursor.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.13 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.14 2008/01/18 07:02:41 dillon Exp $ */ /* @@ -64,7 +64,7 @@ hammer_init_cursor_hmp(hammer_cursor_t cursor, struct hammer_node **cache, if (cache && *cache) { node = hammer_ref_node_safe(hmp, cache, &error); if (error == 0) { - hammer_lock_ex(&node->lock); + hammer_lock_sh(&node->lock); if (node->flags & HAMMER_NODE_DELETED) { hammer_unlock(&node->lock); hammer_rel_node(node); @@ -89,7 +89,7 @@ hammer_init_cursor_hmp(hammer_cursor_t cursor, struct hammer_node **cache, hammer_rel_cluster(cluster, 0); if (error) break; - hammer_lock_ex(&node->lock); + hammer_lock_sh(&node->lock); if (node->flags & HAMMER_NODE_DELETED) { hammer_unlock(&node->lock); hammer_rel_node(node); @@ -122,7 +122,7 @@ hammer_init_cursor_cluster(hammer_cursor_t cursor, hammer_cluster_t cluster) cluster->ondisk->clu_btree_root, &error); if (error == 0) { - hammer_lock_ex(&cursor->node->lock); + hammer_lock_sh(&cursor->node->lock); error = hammer_load_cursor_parent(cursor); } KKASSERT(error == 0); @@ -157,12 +157,73 @@ hammer_done_cursor(hammer_cursor_t cursor) if (cursor->ip) hammer_mem_done(cursor); + /* + * If we deadlocked this node will be referenced. Do a quick + * lock/unlock to wait for the deadlock condition to clear. + */ + if (cursor->deadlk_node) { + hammer_lock_ex(&cursor->deadlk_node->lock); + hammer_unlock(&cursor->deadlk_node->lock); + hammer_rel_node(cursor->deadlk_node); + cursor->deadlk_node = NULL; + } + cursor->data = NULL; cursor->record = NULL; cursor->left_bound = NULL; cursor->right_bound = NULL; } +/* + * Upgrade cursor->node and cursor->parent to exclusive locks. This + * function can return EDEADLK. + * + * If we fail to upgrade the lock and cursor->deadlk_node is NULL, + * we add another reference to the node that failed and set + * cursor->deadlk_node so hammer_done_cursor() can block on it. 
+ */ +int +hammer_cursor_upgrade(hammer_cursor_t cursor) +{ + int error; + + if (hammer_lock_held(&cursor->node->lock) < 0) { + error = hammer_lock_upgrade(&cursor->node->lock); + if (error && cursor->deadlk_node == NULL) { + cursor->deadlk_node = cursor->node; + hammer_ref_node(cursor->deadlk_node); + } + } else { + error = 0; + } + if (cursor->parent && hammer_lock_held(&cursor->parent->lock) < 0) { + error = hammer_lock_upgrade(&cursor->parent->lock); + if (error && cursor->deadlk_node == NULL) { + cursor->deadlk_node = cursor->parent; + hammer_ref_node(cursor->deadlk_node); + } + } else { + error = 0; + } + return(error); +} + +/* + * Downgrade cursor->node and cursor->parent to shared locks. This + * function can return EDEADLK. + */ +void +hammer_cursor_downgrade(hammer_cursor_t cursor) +{ + if (hammer_lock_held(&cursor->node->lock) > 0) + hammer_lock_downgrade(&cursor->node->lock); + if (cursor->parent && hammer_lock_held(&cursor->parent->lock) > 0) + hammer_lock_downgrade(&cursor->parent->lock); +} + + +#if 0 + /* * Acquire the parent B-Tree node of the specified node, returning a * referenced but unlocked node. NULL can be returned with *errorp == 0 @@ -221,6 +282,8 @@ hammer_get_parent_node(hammer_node_t node, int *errorp) return(parent); } +#endif + /* * Load the parent of cursor->node into cursor->parent. There are several * cases. (1) The parent is in the current cluster. 
(2) We are at the @@ -286,12 +349,7 @@ hammer_load_cursor_parent_local(hammer_cursor_t cursor) cursor->left_bound = &elm[0].internal.base; cursor->right_bound = &elm[1].internal.base; - if (hammer_lock_ex_try(&parent->lock) != 0) { - hammer_unlock(&cursor->node->lock); - hammer_lock_ex(&parent->lock); - hammer_lock_ex(&cursor->node->lock); - /* XXX check node generation count */ - } + hammer_lock_sh(&parent->lock); return(error); } @@ -362,12 +420,7 @@ hammer_load_cursor_parent_cluster(hammer_cursor_t cursor) &ccluster->ondisk->clu_btree_end) >= 0); */ - if (hammer_lock_ex_try(&parent->lock) != 0) { - hammer_unlock(&cursor->node->lock); - hammer_lock_ex(&parent->lock); - hammer_lock_ex(&cursor->node->lock); - /* XXX check node generation count */ - } + hammer_lock_sh(&parent->lock); return(0); } @@ -379,28 +432,11 @@ hammer_load_cursor_parent_cluster(hammer_cursor_t cursor) * the cursor remains unchanged. */ int -hammer_cursor_up(hammer_cursor_t cursor, int nonblock) +hammer_cursor_up(hammer_cursor_t cursor) { - hammer_node_t save; int error; - /* - * If asked to do this non-blocking acquire a lock on the parent - * now, before we mess with the cursor. - */ - if (nonblock) { - save = hammer_get_parent_node(cursor->parent, &error); - if (error) - return(error); - if (save) { - if (hammer_lock_ex_try(&save->lock) != 0) { - hammer_rel_node(save); - return(EAGAIN); - } - } - } else { - save = NULL; - } + hammer_cursor_downgrade(cursor); /* * Set the node to its parent. If the parent is NULL we are at @@ -422,10 +458,6 @@ hammer_cursor_up(hammer_cursor_t cursor, int nonblock) } else { error = hammer_load_cursor_parent(cursor); } - if (save) { - hammer_unlock(&save->lock); - hammer_rel_node(save); - } return(error); } @@ -444,11 +476,13 @@ hammer_cursor_toroot(hammer_cursor_t cursor) if (cursor->node->ondisk->parent == 0) return (0); + hammer_cursor_downgrade(cursor); + /* * Parent is root of cluster? 
*/ if (cursor->parent->ondisk->parent == 0) - return (hammer_cursor_up(cursor, 0)); + return (hammer_cursor_up(cursor)); /* * Ok, reload the cursor with the root of the cluster, then @@ -469,7 +503,7 @@ hammer_cursor_toroot(hammer_cursor_t cursor) cursor->node = hammer_get_node(cluster, cluster->ondisk->clu_btree_root, &error); cursor->index = 0; - hammer_lock_ex(&cursor->node->lock); + hammer_lock_sh(&cursor->node->lock); hammer_rel_cluster(cluster, 0); if (error == 0) error = hammer_load_cursor_parent(cursor); @@ -495,6 +529,7 @@ hammer_cursor_down(hammer_cursor_t cursor) /* * The current node becomes the current parent */ + hammer_cursor_downgrade(cursor); node = cursor->node; KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count); if (cursor->parent) { @@ -566,7 +601,7 @@ hammer_cursor_down(hammer_cursor_t cursor) break; } if (error == 0) { - hammer_lock_ex(&node->lock); + hammer_lock_sh(&node->lock); cursor->node = node; cursor->index = 0; } diff --git a/sys/vfs/hammer/hammer_cursor.h b/sys/vfs/hammer/hammer_cursor.h index 0914abaefb..24e1aae66a 100644 --- a/sys/vfs/hammer/hammer_cursor.h +++ b/sys/vfs/hammer/hammer_cursor.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.7 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.8 2008/01/18 07:02:41 dillon Exp $ */ /* @@ -44,10 +44,8 @@ * treads MAY MODIFY YOUR CURSOR when you are not holding an exclusive * lock on the cursor's nodes. * - * Most operations maintain an exclusive lock on their node and - * parent node, but certain cursor operations may temporarily release - * one or both locks. In particular, a cursor_up operation may have - * to temporarily release underlying locks to avoid a deadlock. + * The cursor module maintains a shared lock on cursor->node and + * cursor->parent. 
*/ struct hammer_cursor { /* @@ -56,9 +54,16 @@ struct hammer_cursor { */ hammer_node_t parent; int parent_index; + hammer_node_t node; int index; + /* + * Set of a deadlock occurs. hammer_done_cursor() will block on + * this after releasing parent and node, before returning. + */ + hammer_node_t deadlk_node; + /* * Pointer to the current node's bounds. Typically points to the * appropriate boundary elements in the parent or points to bounds diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index e6817dfa0b..3b6833bded 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.21 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.22 2008/01/18 07:02:41 dillon Exp $ */ #include "hammer.h" @@ -228,6 +228,7 @@ loop: /* * Locate the on-disk inode. */ +retry: hammer_init_cursor_hmp(&cursor, cache, hmp); cursor.key_beg.obj_id = ip->obj_id; cursor.key_beg.key = 0; @@ -240,6 +241,10 @@ loop: HAMMER_CURSOR_ASOF; *errorp = hammer_btree_lookup(&cursor); + if (*errorp == EDEADLK) { + hammer_done_cursor(&cursor); + goto retry; + } /* * On success the B-Tree lookup will hold the appropriate @@ -416,9 +421,11 @@ retry: error = hammer_ip_delete_record(&cursor, last_tid); if (error == 0) ip->flags |= HAMMER_INODE_DELONDISK; + hammer_cache_node(cursor.node, &ip->cache[0]); } - hammer_cache_node(cursor.node, &ip->cache[0]); hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; } /* @@ -470,6 +477,7 @@ hammer_update_itimes(hammer_inode_t ip) struct hammer_inode_record *rec; int error; +retry: error = 0; if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) == HAMMER_INODE_ONDISK) { @@ -484,7 +492,6 @@ hammer_update_itimes(hammer_inode_t ip) cursor.flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF; error = 
hammer_btree_lookup(&cursor); - if (error == 0) { rec = &cursor.record->inode; hammer_modify_buffer(cursor.record_buffer); @@ -492,9 +499,11 @@ hammer_update_itimes(hammer_inode_t ip) rec->ino_mtime = ip->ino_rec.ino_mtime; ip->flags &= ~HAMMER_INODE_ITIMES; /* XXX recalculate crc */ + hammer_cache_node(cursor.node, &ip->cache[0]); } - hammer_cache_node(cursor.node, &ip->cache[0]); hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; } return(error); } diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 80f2a0dc6c..ce571f1783 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.20 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.21 2008/01/18 07:02:41 dillon Exp $ */ #include "hammer.h" @@ -349,6 +349,9 @@ hammer_ip_add_directory(struct hammer_transaction *trans, * cursor must be seeked to the directory entry record being deleted. * * NOTE: HAMMER_CURSOR_DELETE may not have been set. XXX remove flag. + * + * This function can return EDEADLK requiring the caller to terminate + * the cursor and retry. */ int hammer_ip_del_directory(struct hammer_transaction *trans, @@ -363,12 +366,16 @@ hammer_ip_del_directory(struct hammer_transaction *trans, * One less link. The file may still be open in the OS even after * all links have gone away so we only try to sync if the OS has * no references and nlinks falls to 0. + * + * We have to terminate the cursor before syncing the inode to + * avoid deadlocking against ourselves. 
*/ if (error == 0) { --ip->ino_rec.ino_nlinks; hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY); if (ip->ino_rec.ino_nlinks == 0 && (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) { + hammer_done_cursor(cursor); hammer_sync_inode(ip, MNT_NOWAIT, 1); } @@ -433,6 +440,7 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip, void *bdata; int error; +retry: error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp); if (error) return(error); @@ -519,6 +527,8 @@ done: if (error == ENOSPC) hammer_load_spike(&cursor, spike); hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; return(error); } @@ -537,6 +547,7 @@ hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike) void *bdata; int error; +retry: error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0], record->ip->hmp); if (error) @@ -554,22 +565,23 @@ hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike) * insert, re-lookup without the insert flag so the cursor * is properly positioned for the spike. 
*/ -again: - error = hammer_btree_lookup(&cursor); - if (error == 0) { - if (record->rec.base.base.rec_type == HAMMER_RECTYPE_DIRENTRY) { - hmp = cursor.node->cluster->volume->hmp; - if (++hmp->namekey_iterator == 0) - ++hmp->namekey_iterator; - record->rec.base.base.key &= ~(0xFFFFFFFFLL); - record->rec.base.base.key |= hmp->namekey_iterator; - cursor.key_beg.key = record->rec.base.base.key; - goto again; + for (;;) { + error = hammer_btree_lookup(&cursor); + if (error) + break; + if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) { + kprintf("hammer_ip_sync_record: duplicate rec " + "at (%016llx)\n", record->rec.base.base.key); + Debugger("duplicate record1"); + error = EIO; + break; } - kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n", - record->rec.base.base.key); - Debugger("duplicate record1"); - error = EIO; + hmp = cursor.node->cluster->volume->hmp; + if (++hmp->namekey_iterator == 0) + ++hmp->namekey_iterator; + record->rec.base.base.key &= ~(0xFFFFFFFFLL); + record->rec.base.base.key |= hmp->namekey_iterator; + cursor.key_beg.key = record->rec.base.base.key; } if (error != ENOENT) goto done; @@ -676,6 +688,8 @@ done: if (error == ENOSPC) hammer_load_spike(&cursor, spike); hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; return(error); } @@ -686,6 +700,9 @@ done: * * The target cursor will be modified by this call. Note in particular * that HAMMER_CURSOR_INSERT is set. + * + * NOTE: This can return EDEADLK, requiring the caller to release its cursor + * and retry the operation. */ int hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec, @@ -869,6 +886,9 @@ hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip) * * When 0 is returned hammer_ip_next() may be used to iterate additional * records within the requested range. + * + * This function can return EDEADLK, requiring the caller to terminate + * the cursor and try again. 
*/ int hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip) @@ -894,10 +914,14 @@ hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip) * The ATEDISK flag is used by hammer_btree_iterate to determine * whether it must index forwards or not. It is also used here * to select the next record from in-memory or on-disk. + * + * EDEADLK can only occur if the lookup hit an empty internal + * element and couldn't delete it. Since this could only occur + * in-range, we can just iterate from the failure point. */ if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) { error = hammer_btree_lookup(cursor); - if (error == ENOENT) { + if (error == ENOENT || error == EDEADLK) { cursor->flags &= ~HAMMER_CURSOR_ATEDISK; error = hammer_btree_iterate(cursor); } @@ -1091,6 +1115,7 @@ hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip, int error; int64_t off; +retry: hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp); cursor.key_beg.obj_id = ip->obj_id; @@ -1193,6 +1218,8 @@ hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip, error = hammer_ip_next(&cursor); } hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; if (error == ENOENT) error = 0; return(error); @@ -1210,6 +1237,7 @@ hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip) hammer_base_elm_t base; int error; +retry: hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp); cursor.key_beg.obj_id = ip->obj_id; @@ -1250,13 +1278,18 @@ hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip) error = hammer_ip_next(&cursor); } hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; if (error == ENOENT) error = 0; return(error); } /* - * Delete the record at the current cursor + * Delete the record at the current cursor. + * + * NOTE: This can return EDEADLK, requiring the caller to terminate the + * cursor and retry. 
*/ int hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) @@ -1284,10 +1317,14 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) hammer_modify_buffer(cursor->record_buffer); cursor->record->base.base.delete_tid = tid; - hammer_modify_node(cursor->node); - elm = &cursor->node->ondisk->elms[cursor->index]; - elm->leaf.base.delete_tid = tid; - hammer_update_syncid(cursor->record_buffer->cluster, tid); + error = hammer_cursor_upgrade(cursor); + if (error == 0) { + hammer_modify_node(cursor->node); + elm = &cursor->node->ondisk->elms[cursor->index]; + elm->leaf.base.delete_tid = tid; + hammer_update_syncid(cursor->record_buffer->cluster, + tid); + } } /* diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index 335b5fe2fc..a46463fd12 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.21 2008/01/15 06:02:57 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.22 2008/01/18 07:02:41 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. These routines are primarily @@ -2047,8 +2047,9 @@ hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data) struct hammer_inode *ip; ip = VTOI(vp); - if (vp->v_type == VNON || ((ip->flags & HAMMER_INODE_MODMASK) == 0 && - RB_EMPTY(&vp->v_rbdirty_tree))) { + if (vp->v_type == VNON || ip == NULL || + ((ip->flags & HAMMER_INODE_MODMASK) == 0 && + RB_EMPTY(&vp->v_rbdirty_tree))) { return(-1); } return(0); diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c index fa94feeca5..d3cab3638a 100644 --- a/sys/vfs/hammer/hammer_recover.c +++ b/sys/vfs/hammer/hammer_recover.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.3 2008/01/15 06:02:57 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.4 2008/01/18 07:02:41 dillon Exp $ */ #include "hammer.h" @@ -561,6 +561,7 @@ hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer, cursor.flags |= HAMMER_CURSOR_INSERT; error = hammer_btree_lookup(&cursor); + KKASSERT(error != EDEADLK); KKASSERT(cursor.node); if (error == 0) { kprintf("hammer_recover_btree: Duplicate record\n"); @@ -581,6 +582,7 @@ hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer, error = hammer_btree_insert_cluster(&cursor, ncluster, rec_offset); kprintf("recover spike record error %d\n", error); + KKASSERT(error != EDEADLK); if (error) Debugger("spike recovery"); } else { @@ -596,6 +598,7 @@ hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer, elm.leaf.data_crc = rec->base.data_crc; error = hammer_btree_insert(&cursor, &elm); + KKASSERT(error != EDEADLK); } if (error) { diff --git a/sys/vfs/hammer/hammer_spike.c b/sys/vfs/hammer/hammer_spike.c index 9a09cd7c8b..7b7a15ac2f 100644 --- a/sys/vfs/hammer/hammer_spike.c +++ b/sys/vfs/hammer/hammer_spike.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/Attic/hammer_spike.c,v 1.8 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/Attic/hammer_spike.c,v 1.9 2008/01/18 07:02:41 dillon Exp $ */ #include "hammer.h" @@ -59,10 +59,10 @@ hammer_load_spike(hammer_cursor_t cursor, struct hammer_cursor **spikep) if (spike->parent) { hammer_ref_node(spike->parent); - hammer_lock_ex(&spike->parent->lock); + hammer_lock_sh(&spike->parent->lock); } hammer_ref_node(spike->node); - hammer_lock_ex(&spike->node->lock); + hammer_lock_sh(&spike->node->lock); kprintf("LOAD SPIKE %p\n", spike); } @@ -97,14 +97,20 @@ hammer_spike(struct hammer_cursor **spikep) /*Debugger("ENOSPC");*/ /* - * Lock and flush the cluster XXX + * Validate and lock the spike. If this fails due to a deadlock + * we still return 0 since a spike is only called when the + * caller intends to retry the operation. */ spike = *spikep; KKASSERT(spike != NULL); KKASSERT(spike->parent && spike->parent->cluster == spike->node->cluster); - error = 0; + error = hammer_cursor_upgrade(spike); + if (error) { + error = 0; + goto failed4; + } onode = spike->node; ocluster = onode->cluster; @@ -146,6 +152,7 @@ hammer_spike(struct hammer_cursor **spikep) HAMMER_RECTYPE_CLUSTER); error = hammer_write_record(&ncursor, spike->record, spike->data, spike->flags); + KKASSERT(error != EDEADLK); if (error == ENOSPC) { kprintf("impossible ENOSPC error on spike\n"); error = EIO; @@ -245,6 +252,7 @@ success: failed3: kprintf("UNLOAD SPIKE %p %d\n", spike, error); hammer_unlock(&ocluster->io.lock); +failed4: hammer_done_cursor(spike); --hammer_count_spikes; kfree(spike, M_HAMMER); diff --git a/sys/vfs/hammer/hammer_subs.c b/sys/vfs/hammer/hammer_subs.c index 0f66c63400..fb38c2d3fe 100644 --- a/sys/vfs/hammer/hammer_subs.c +++ b/sys/vfs/hammer/hammer_subs.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.11 2008/01/09 00:46:22 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.12 2008/01/18 07:02:41 dillon Exp $ */ /* * HAMMER structural locking @@ -91,6 +91,7 @@ hammer_lock_sh(struct hammer_lock *lock) crit_enter(); while (lock->locktd != NULL) { if (lock->locktd == curthread) { + Debugger("hammer_lock_sh: lock_sh on exclusive"); ++lock->lockcount; crit_exit(); return; @@ -103,8 +104,38 @@ hammer_lock_sh(struct hammer_lock *lock) crit_exit(); } +/* + * Upgrade a shared lock to an exclusively held lock. This function will + * return EDEADLK if there is more than one shared holder. + * + * No error occurs and no action is taken if the lock is already exclusively + * held by the caller. + */ +int +hammer_lock_upgrade(struct hammer_lock *lock) +{ + int error; + + crit_enter(); + if (lock->lockcount > 0) { + KKASSERT(lock->locktd == curthread); + error = 0; + } else if (lock->lockcount == -1) { + lock->lockcount = 1; + lock->locktd = curthread; + error = 0; + } else { + error = EDEADLK; + } + crit_exit(); + return(error); +} + +/* + * Downgrade an exclusively held lock to a shared lock. + */ void -hammer_downgrade(struct hammer_lock *lock) +hammer_lock_downgrade(struct hammer_lock *lock) { KKASSERT(lock->lockcount == 1); crit_enter(); diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index ed1d8a7d2f..8ba23afec7 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.21 2008/01/17 05:06:09 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.22 2008/01/18 07:02:41 dillon Exp $ */ #include @@ -555,6 +555,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) int i; int nlen; int flags; + u_int64_t obj_id; /* * Misc initialization, plus handle as-of name extensions. 
Look for @@ -629,6 +630,8 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) */ error = hammer_ip_first(&cursor, dip); rec = NULL; + obj_id = 0; + while (error == 0) { error = hammer_ip_resolve_data(&cursor); if (error) @@ -636,14 +639,15 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) rec = cursor.record; if (nlen == rec->entry.base.data_len && bcmp(ncp->nc_name, cursor.data, nlen) == 0) { + obj_id = rec->entry.obj_id; break; } error = hammer_ip_next(&cursor); } + hammer_done_cursor(&cursor); if (error == 0) { ip = hammer_get_inode(dip->hmp, &dip->cache[1], - rec->entry.obj_id, asof, - flags, &error); + obj_id, asof, flags, &error); if (error == 0) { error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp); hammer_rel_inode(ip, 0); @@ -658,7 +662,6 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) } else if (error == ENOENT) { cache_setvp(ap->a_nch, NULL); } - hammer_done_cursor(&cursor); return (error); } @@ -1151,7 +1154,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap) * The key range is inclusive of both key_beg and key_end. */ namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen); - +retry: hammer_init_cursor_hmp(&cursor, &fdip->cache[0], fdip->hmp); cursor.key_beg.obj_id = fdip->obj_id; cursor.key_beg.key = namekey; @@ -1186,15 +1189,18 @@ hammer_vop_nrename(struct vop_nrename_args *ap) /* * If all is ok we have to get the inode so we can adjust nlinks. + * + * WARNING: hammer_ip_del_directory() may have to terminate the + * cursor to avoid a recursion. It's ok to call hammer_done_cursor() + * twice. 
*/ - if (error) - goto done; - error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); - if (error == 0) - cache_rename(ap->a_fnch, ap->a_tnch); -done: + error = hammer_ip_del_directory(&trans, &cursor, fdip, ip); hammer_done_cursor(&cursor); + if (error == 0) + cache_rename(ap->a_fnch, ap->a_tnch); + if (error == EDEADLK) + goto retry; failed: if (error == 0) { hammer_commit_transaction(&trans); @@ -1706,8 +1712,10 @@ hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred, if (dip->flags & HAMMER_INODE_RO) return (EROFS); - namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen); + hammer_start_transaction(&trans, dip->hmp); + namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen); +retry: hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp); cursor.key_beg.obj_id = dip->obj_id; cursor.key_beg.key = namekey; @@ -1721,8 +1729,6 @@ hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred, cursor.asof = dip->obj_asof; cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; - hammer_start_transaction(&trans, dip->hmp); - /* * Scan all matching records (the chain), locate the one matching * the requested path component. info->last_error contains the @@ -1759,6 +1765,11 @@ hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred, HAMMER_OBJTYPE_DIRECTORY) { error = hammer_ip_check_directory_empty(&trans, ip); } + /* + * WARNING: hammer_ip_del_directory() may have to terminate + * the cursor to avoid a lock recursion. It's ok to call + * hammer_done_cursor() twice. 
+ */ if (error == 0) error = hammer_ip_del_directory(&trans, &cursor, dip, ip); if (error == 0) { @@ -1770,12 +1781,14 @@ hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred, } hammer_rel_inode(ip, 0); } + hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; if (error == 0) hammer_commit_transaction(&trans); else hammer_abort_transaction(&trans); - hammer_done_cursor(&cursor); return (error); } -- 2.41.0