From 98da6d8c1ec5dbc193e9870bbf0106f546a3aaff Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 11 Jul 2008 01:22:29 +0000 Subject: [PATCH] HAMMER 61B/Many: Stabilization * Giving the sync_lock's exclusive locks priority over shared locks caused a 3-way deadlock, revert to using normal shared locks. * Move the syncer_lock deeper, closer to the code that actually needs to hold but still paying attention to atomicy requirements. This reduces lock contention and improves performance with parallel loads. Reported-by: Michael Neumann (using his file-creator program) --- sys/vfs/hammer/hammer.h | 3 +-- sys/vfs/hammer/hammer_btree.c | 5 +++- sys/vfs/hammer/hammer_cursor.c | 12 ++++++--- sys/vfs/hammer/hammer_flusher.c | 5 +--- sys/vfs/hammer/hammer_inode.c | 15 ++++++++---- sys/vfs/hammer/hammer_mirror.c | 24 ++++++++++++++---- sys/vfs/hammer/hammer_object.c | 13 ++++++---- sys/vfs/hammer/hammer_prune.c | 13 +++------- sys/vfs/hammer/hammer_reblock.c | 5 +--- sys/vfs/hammer/hammer_subs.c | 43 ++++++++++----------------------- 10 files changed, 68 insertions(+), 70 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 110530d5ce..2f3f980c41 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.110 2008/07/10 21:23:58 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.111 2008/07/11 01:22:29 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -826,7 +826,6 @@ int hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, void hammer_lock_ex_ident(struct hammer_lock *lock, const char *ident); int hammer_lock_ex_try(struct hammer_lock *lock); void hammer_lock_sh(struct hammer_lock *lock); -void hammer_lock_sh_lowpri(struct hammer_lock *lock); int hammer_lock_sh_try(struct hammer_lock *lock); int hammer_lock_upgrade(struct hammer_lock *lock); void hammer_lock_downgrade(struct hammer_lock *lock); diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 6fb8d1f172..3966dad888 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.69 2008/07/10 21:23:58 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.70 2008/07/11 01:22:29 dillon Exp $ */ /* @@ -572,6 +572,8 @@ hammer_btree_lookup(hammer_cursor_t cursor) { int error; + KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 || + cursor->trans->sync_lock_refs > 0); ++hammer_stats_btree_lookups; if (cursor->flags & HAMMER_CURSOR_ASOF) { KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0); @@ -811,6 +813,7 @@ hammer_btree_delete(hammer_cursor_t cursor) int error; int i; + KKASSERT (cursor->trans->sync_lock_refs > 0); if ((error = hammer_cursor_upgrade(cursor)) != 0) return(error); ++hammer_stats_btree_deletes; diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c index 45d26fc8b6..5458bf914f 100644 --- a/sys/vfs/hammer/hammer_cursor.c +++ b/sys/vfs/hammer/hammer_cursor.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.40 2008/07/08 04:34:41 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.41 2008/07/11 01:22:29 dillon Exp $ */ /* @@ -75,7 +75,7 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor, if (cache && cache->node) { node = hammer_ref_node_safe(trans->hmp, cache, &error); if (error == 0) { - hammer_lock_sh_lowpri(&node->lock); + hammer_lock_sh(&node->lock); if (node->flags & HAMMER_NODE_DELETED) { hammer_unlock(&node->lock); hammer_rel_node(node); @@ -100,7 +100,7 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor, hammer_rel_volume(volume, 0); if (error) break; - hammer_lock_sh_lowpri(&node->lock); + hammer_lock_sh(&node->lock); /* * If someone got in before we could lock the node, retry. @@ -573,6 +573,10 @@ hammer_lock_cursor(hammer_cursor_t cursor, int also_ip) * Recover from a deadlocked cursor, tracking any node removals or * replacements. If the cursor's current node is removed by another * thread (via btree_remove()) the cursor will be seeked upwards. + * + * The caller is working a modifying operation and must be holding the + * sync lock (shared). We do not release the sync lock because this + * would break atomicy. */ int hammer_recover_cursor(hammer_cursor_t cursor) @@ -580,6 +584,7 @@ hammer_recover_cursor(hammer_cursor_t cursor) int error; hammer_unlock_cursor(cursor, 0); + KKASSERT(cursor->trans->sync_lock_refs > 0); /* * Wait for the deadlock to clear @@ -595,7 +600,6 @@ hammer_recover_cursor(hammer_cursor_t cursor) hammer_rel_mem_record(cursor->deadlk_rec); cursor->deadlk_rec = NULL; } - error = hammer_lock_cursor(cursor, 0); return(error); } diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index ca77e75d64..9ffeb45586 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.34 2008/07/10 21:23:58 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.35 2008/07/11 01:22:29 dillon Exp $ */ /* * HAMMER dependancy flusher thread @@ -353,12 +353,10 @@ hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans) int error; hammer_flusher_clean_loose_ios(hmp); - hammer_lock_sh(&hmp->flusher.finalize_lock); error = hammer_sync_inode(ip); if (error != EWOULDBLOCK) ip->error = error; hammer_flush_inode_done(ip); - hammer_unlock(&hmp->flusher.finalize_lock); while (hmp->flusher.finalize_want) tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0); if (hammer_flusher_undo_exhausted(trans, 1)) { @@ -388,7 +386,6 @@ hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter) { if (hammer_undo_space(trans) < hammer_undo_max(trans->hmp) * quarter / 4) { - kprintf("%c", '0' + quarter); return(1); } else { return(0); diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index a1b158bbb0..0f3ec742a7 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.98 2008/07/10 21:23:58 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.99 2008/07/11 01:22:29 dillon Exp $ */ #include "hammer.h" @@ -987,6 +987,7 @@ retry: /* * Root volume count of inodes */ + hammer_sync_lock_sh(trans); if ((ip->flags & HAMMER_INODE_ONDISK) == 0) { hammer_modify_volume_field(trans, trans->rootvol, @@ -997,6 +998,7 @@ retry: if (hammer_debug_inode) kprintf("NOWONDISK %p\n", ip); } + hammer_sync_unlock(trans); } } @@ -1062,21 +1064,25 @@ retry: * Updating MTIME requires an UNDO. Just cover * both atime and mtime. */ + hammer_sync_lock_sh(trans); hammer_modify_buffer(trans, cursor->data_buffer, HAMMER_ITIMES_BASE(&cursor->data->inode), HAMMER_ITIMES_BYTES); cursor->data->inode.atime = ip->sync_ino_data.atime; cursor->data->inode.mtime = ip->sync_ino_data.mtime; hammer_modify_buffer_done(cursor->data_buffer); + hammer_sync_unlock(trans); } else if (ip->sync_flags & HAMMER_INODE_ATIME) { /* * Updating atime only can be done in-place with * no UNDO. */ + hammer_sync_lock_sh(trans); hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0); cursor->data->inode.atime = ip->sync_ino_data.atime; hammer_modify_buffer_done(cursor->data_buffer); + hammer_sync_unlock(trans); } ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME); } @@ -2082,11 +2088,8 @@ done: * The finalization lock is already being held by virtue of the * flusher calling us. */ - if (hammer_flusher_meta_limit(hmp)) { - hammer_unlock(&hmp->flusher.finalize_lock); + if (hammer_flusher_meta_limit(hmp)) hammer_flusher_finalize(trans, 0); - hammer_lock_sh(&hmp->flusher.finalize_lock); - } return(error); } @@ -2303,6 +2306,7 @@ hammer_sync_inode(hammer_inode_t ip) /* * Adjust the inode count in the volume header */ + hammer_sync_lock_sh(&trans); if (ip->flags & HAMMER_INODE_ONDISK) { hammer_modify_volume_field(&trans, trans.rootvol, @@ -2310,6 +2314,7 @@ hammer_sync_inode(hammer_inode_t ip) --ip->hmp->rootvol->ondisk->vol0_stat_inodes; hammer_modify_volume_done(trans.rootvol); } + hammer_sync_unlock(&trans); } else { Debugger("hammer_ip_delete_clean errored"); } diff --git a/sys/vfs/hammer/hammer_mirror.c b/sys/vfs/hammer/hammer_mirror.c index 49486c5e10..16df3fd055 100644 --- a/sys/vfs/hammer/hammer_mirror.c +++ b/sys/vfs/hammer/hammer_mirror.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.10 2008/07/10 04:44:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.11 2008/07/11 01:22:29 dillon Exp $ */ /* * HAMMER mirroring ioctls - serialize and deserialize modifications made @@ -560,6 +560,8 @@ hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, * mirror operations are effective an as-of operation and * delete_tid can be 0 for mirroring purposes even if it is * not actually 0 at the originator. + * + * These functions can return EDEADLK */ cursor->key_beg = mrec->leaf.base; cursor->flags |= HAMMER_CURSOR_BACKEND; @@ -567,13 +569,9 @@ hammer_ioc_mirror_write_rec(hammer_cursor_t cursor, error = hammer_btree_lookup(cursor); if (error == 0 && hammer_mirror_check(cursor, mrec)) { - hammer_sync_lock_sh(trans); error = hammer_mirror_update(cursor, mrec); - hammer_sync_unlock(trans); } else if (error == ENOENT && mrec->leaf.base.delete_tid == 0) { - hammer_sync_lock_sh(trans); error = hammer_mirror_write(cursor, mrec, uptr); - hammer_sync_unlock(trans); } else if (error == ENOENT) { error = 0; } @@ -663,6 +661,10 @@ hammer_mirror_delete_at_cursor(hammer_cursor_t cursor, { hammer_transaction_t trans; hammer_btree_elm_t elm; + int error; + + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); elm = &cursor->node->ondisk->elms[cursor->index]; KKASSERT(elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD); @@ -731,6 +733,10 @@ hammer_mirror_update(hammer_cursor_t cursor, { hammer_transaction_t trans; hammer_btree_leaf_elm_t elm; + int error; + + if ((error = hammer_cursor_upgrade(cursor)) != 0) + return(error); elm = cursor->leaf; trans = cursor->trans; @@ -741,6 +747,7 @@ hammer_mirror_update(hammer_cursor_t cursor, elm->base.obj_id, elm->base.key); return(0); } + hammer_sync_lock_sh(trans); KKASSERT(elm->base.create_tid < mrec->leaf.base.delete_tid); hammer_modify_node(trans, cursor->node, elm, sizeof(*elm)); @@ -763,6 +770,7 @@ hammer_mirror_update(hammer_cursor_t cursor, --trans->hmp->rootvol->ondisk->vol0_stat_inodes; hammer_modify_volume_done(trans->rootvol); } + hammer_sync_unlock(trans); return(0); } @@ -787,6 +795,11 @@ hammer_mirror_write(hammer_cursor_t cursor, trans = cursor->trans; data_buffer = NULL; + /* + * Get the sync lock so the whole mess is atomic + */ + hammer_sync_lock_sh(trans); + /* * Allocate and adjust data */ @@ -873,6 +886,7 @@ failed: mrec->leaf.data_offset, mrec->leaf.data_len); } + hammer_sync_unlock(trans); if (data_buffer) hammer_rel_buffer(data_buffer, 0); return(error); diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index b99c470128..e7bb5ae9e4 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.85 2008/07/10 04:44:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.86 2008/07/11 01:22:29 dillon Exp $ */ #include "hammer.h" @@ -1069,8 +1069,8 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) * we may have to iterate the low 32 bits of the key to find an unused * key. */ + hammer_sync_lock_sh(trans); cursor->flags |= HAMMER_CURSOR_INSERT; - error = hammer_btree_lookup(cursor); if (hammer_debug_inode) kprintf("DOINSERT LOOKUP %d\n", error); @@ -1088,7 +1088,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) #endif if (error != ENOENT) - goto done; + goto done_unlock; /* * Allocate the record and data. The result buffers will be @@ -1116,7 +1116,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) &record->leaf.data_offset, &cursor->data_buffer, &error); if (bdata == NULL) - goto done; + goto done_unlock; hammer_crc_set_leaf(record->data, &record->leaf); hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0); bcopy(record->data, bdata, record->leaf.data_len); @@ -1163,7 +1163,8 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) record->leaf.data_len); } } - +done_unlock: + hammer_sync_unlock(trans); done: return(error); } @@ -1971,6 +1972,7 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags, * Adjust the delete_tid. Update the mirror_tid propagation field * as well. */ + hammer_sync_lock_sh(cursor->trans); doprop = 0; if (delete_flags & HAMMER_DELETE_ADJUST) { hammer_modify_node(cursor->trans, node, elm, sizeof(*elm)); @@ -2057,6 +2059,7 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags, KKASSERT(cursor->ip != NULL); hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf); } + hammer_sync_unlock(cursor->trans); return (error); } diff --git a/sys/vfs/hammer/hammer_prune.c b/sys/vfs/hammer/hammer_prune.c index 7f1103e19e..3b83b7a06d 100644 --- a/sys/vfs/hammer/hammer_prune.c +++ b/sys/vfs/hammer/hammer_prune.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.12 2008/07/04 07:25:36 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.13 2008/07/11 01:22:29 dillon Exp $ */ #include "hammer.h" @@ -126,7 +126,6 @@ retry: */ cursor.flags |= HAMMER_CURSOR_PRUNING; - hammer_sync_lock_sh(trans); error = hammer_btree_last(&cursor); while (error == 0) { @@ -141,11 +140,6 @@ retry: */ if ((error = hammer_signal_check(trans->hmp)) != 0) break; - if (trans->hmp->sync_lock.wanted) { - hammer_sync_unlock(trans); - tsleep(trans, 0, "hmrslo", hz / 10); - hammer_sync_lock_sh(trans); - } if (hammer_flusher_meta_limit(trans->hmp) || hammer_flusher_undo_exhausted(trans, 2)) { error = EWOULDBLOCK; @@ -178,9 +172,11 @@ retry: */ isdir = (elm->base.rec_type == HAMMER_RECTYPE_DIRENTRY); + hammer_sync_lock_sh(trans); error = hammer_delete_at_cursor(&cursor, HAMMER_DELETE_DESTROY, &prune->stat_bytes); + hammer_sync_unlock(trans); if (error) break; @@ -210,7 +206,6 @@ retry: ++prune->stat_scanrecords; error = hammer_btree_iterate_reverse(&cursor); } - hammer_sync_unlock(trans); if (error == ENOENT) error = 0; hammer_done_cursor(&cursor); @@ -299,7 +294,6 @@ prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm) if (cursor->data->inode.nlinks) return; hammer_cursor_downgrade(cursor); - hammer_sync_unlock(cursor->trans); ip = hammer_get_inode(cursor->trans, NULL, elm->base.obj_id, HAMMER_MAX_TID, elm->base.localization & HAMMER_LOCALIZE_PSEUDOFS_MASK, @@ -312,7 +306,6 @@ prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm) kprintf("unable to prune disconnected inode %016llx\n", elm->base.obj_id); } - hammer_sync_lock_sh(cursor->trans); } #if 0 diff --git a/sys/vfs/hammer/hammer_reblock.c b/sys/vfs/hammer/hammer_reblock.c index ca8a3d80f6..b60e3c3ca0 100644 --- a/sys/vfs/hammer/hammer_reblock.c +++ b/sys/vfs/hammer/hammer_reblock.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.25 2008/07/09 10:29:20 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.26 2008/07/11 01:22:29 dillon Exp $ */ /* * HAMMER reblocker - This code frees up fragmented physical space @@ -125,9 +125,6 @@ retry: */ if ((error = hammer_signal_check(trans->hmp)) != 0) break; - if (trans->hmp->sync_lock.wanted) { - tsleep(trans, 0, "hmrslo", hz / 10); - } /* * If we build up too much meta-data we have to wait for diff --git a/sys/vfs/hammer/hammer_subs.c b/sys/vfs/hammer/hammer_subs.c index 83f78f6993..573c8e7fa1 100644 --- a/sys/vfs/hammer/hammer_subs.c +++ b/sys/vfs/hammer/hammer_subs.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.33 2008/07/10 04:44:33 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.34 2008/07/11 01:22:29 dillon Exp $ */ /* * HAMMER structural locking @@ -93,6 +93,9 @@ hammer_lock_ex_try(struct hammer_lock *lock) /* * Obtain a shared lock + * + * We do not give pending exclusive locks priority over shared locks as + * doing so could lead to a deadlock. */ void hammer_lock_sh(struct hammer_lock *lock) @@ -114,31 +117,6 @@ hammer_lock_sh(struct hammer_lock *lock) crit_exit(); } -/* - * Obtain a shared lock at a lower priority then thread waiting for an - * exclusive lock. To avoid a deadlock this may only be done if no other - * shared locks are being held by the caller. - */ -void -hammer_lock_sh_lowpri(struct hammer_lock *lock) -{ - KKASSERT(lock->refs > 0); - crit_enter(); - while (lock->locktd != NULL || lock->exwanted) { - if (lock->locktd == curthread) { - Debugger("hammer_lock_sh: lock_sh on exclusive"); - ++lock->lockcount; - crit_exit(); - return; - } - lock->wanted = 1; - tsleep(lock, 0, "hmrlck", 0); - } - KKASSERT(lock->lockcount <= 0); - --lock->lockcount; - crit_exit(); -} - int hammer_lock_sh_try(struct hammer_lock *lock) { @@ -263,11 +241,16 @@ hammer_unref(struct hammer_lock *lock) /* * The sync_lock must be held when doing any modifying operations on - * meta-data. The flusher holds the lock exclusively while the reblocker - * and pruner use a shared lock. + * meta-data. It does not have to be held when modifying non-meta-data buffers + * (backend or frontend). + * + * The flusher holds the lock exclusively while all other consumers hold it + * shared. All modifying operations made while holding the lock are atomic + * in that they will be made part of the same flush group. * - * Modifying operations can run in parallel until the flusher needs to - * sync the disk media. + * Due to the atomicy requirement deadlock recovery code CANNOT release the + * sync lock, nor can we give pending exclusive sync locks priority over + * a shared sync lock as this could lead to a 3-way deadlock. */ void hammer_sync_lock_ex(hammer_transaction_t trans) -- 2.41.0