HAMMER 61B/Many: Stabilization
authorMatthew Dillon <dillon@dragonflybsd.org>
Fri, 11 Jul 2008 01:22:29 +0000 (01:22 +0000)
committerMatthew Dillon <dillon@dragonflybsd.org>
Fri, 11 Jul 2008 01:22:29 +0000 (01:22 +0000)
* Giving the sync_lock's exclusive locks priority over shared locks caused
  a 3-way deadlock, revert to using normal shared locks.

* Move the sync_lock deeper, closer to the code that actually needs to
  hold it, while still paying attention to atomicity requirements.  This
  reduces lock contention and improves performance with parallel loads.

Reported-by: Michael Neumann (using his file-creator program)
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_mirror.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_prune.c
sys/vfs/hammer/hammer_reblock.c
sys/vfs/hammer/hammer_subs.c

index 110530d..2f3f980 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.110 2008/07/10 21:23:58 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.111 2008/07/11 01:22:29 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -826,7 +826,6 @@ int hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node,
 void   hammer_lock_ex_ident(struct hammer_lock *lock, const char *ident);
 int    hammer_lock_ex_try(struct hammer_lock *lock);
 void   hammer_lock_sh(struct hammer_lock *lock);
-void   hammer_lock_sh_lowpri(struct hammer_lock *lock);
 int    hammer_lock_sh_try(struct hammer_lock *lock);
 int    hammer_lock_upgrade(struct hammer_lock *lock);
 void   hammer_lock_downgrade(struct hammer_lock *lock);
index 6fb8d1f..3966dad 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.69 2008/07/10 21:23:58 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.70 2008/07/11 01:22:29 dillon Exp $
  */
 
 /*
@@ -572,6 +572,8 @@ hammer_btree_lookup(hammer_cursor_t cursor)
 {
        int error;
 
+       KKASSERT ((cursor->flags & HAMMER_CURSOR_INSERT) == 0 ||
+                 cursor->trans->sync_lock_refs > 0);
        ++hammer_stats_btree_lookups;
        if (cursor->flags & HAMMER_CURSOR_ASOF) {
                KKASSERT((cursor->flags & HAMMER_CURSOR_INSERT) == 0);
@@ -811,6 +813,7 @@ hammer_btree_delete(hammer_cursor_t cursor)
        int error;
        int i;
 
+       KKASSERT (cursor->trans->sync_lock_refs > 0);
        if ((error = hammer_cursor_upgrade(cursor)) != 0)
                return(error);
        ++hammer_stats_btree_deletes;
index 45d26fc..5458bf9 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.40 2008/07/08 04:34:41 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.41 2008/07/11 01:22:29 dillon Exp $
  */
 
 /*
@@ -75,7 +75,7 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
        if (cache && cache->node) {
                node = hammer_ref_node_safe(trans->hmp, cache, &error);
                if (error == 0) {
-                       hammer_lock_sh_lowpri(&node->lock);
+                       hammer_lock_sh(&node->lock);
                        if (node->flags & HAMMER_NODE_DELETED) {
                                hammer_unlock(&node->lock);
                                hammer_rel_node(node);
@@ -100,7 +100,7 @@ hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
                hammer_rel_volume(volume, 0);
                if (error)
                        break;
-               hammer_lock_sh_lowpri(&node->lock);
+               hammer_lock_sh(&node->lock);
 
                /*
                 * If someone got in before we could lock the node, retry.
@@ -573,6 +573,10 @@ hammer_lock_cursor(hammer_cursor_t cursor, int also_ip)
  * Recover from a deadlocked cursor, tracking any node removals or
  * replacements.  If the cursor's current node is removed by another
  * thread (via btree_remove()) the cursor will be seeked upwards.
+ *
+ * The caller is working a modifying operation and must be holding the
+ * sync lock (shared).  We do not release the sync lock because this
+ * would break atomicy.
  */
 int
 hammer_recover_cursor(hammer_cursor_t cursor)
@@ -580,6 +584,7 @@ hammer_recover_cursor(hammer_cursor_t cursor)
        int error;
 
        hammer_unlock_cursor(cursor, 0);
+       KKASSERT(cursor->trans->sync_lock_refs > 0);
 
        /*
         * Wait for the deadlock to clear
@@ -595,7 +600,6 @@ hammer_recover_cursor(hammer_cursor_t cursor)
                hammer_rel_mem_record(cursor->deadlk_rec);
                cursor->deadlk_rec = NULL;
        }
-
        error = hammer_lock_cursor(cursor, 0);
        return(error);
 }
index ca77e75..9ffeb45 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.34 2008/07/10 21:23:58 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.35 2008/07/11 01:22:29 dillon Exp $
  */
 /*
  * HAMMER dependancy flusher thread
@@ -353,12 +353,10 @@ hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
        int error;
 
        hammer_flusher_clean_loose_ios(hmp);
-       hammer_lock_sh(&hmp->flusher.finalize_lock);
        error = hammer_sync_inode(ip);
        if (error != EWOULDBLOCK)
                ip->error = error;
        hammer_flush_inode_done(ip);
-       hammer_unlock(&hmp->flusher.finalize_lock);
        while (hmp->flusher.finalize_want)
                tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
        if (hammer_flusher_undo_exhausted(trans, 1)) {
@@ -388,7 +386,6 @@ hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
 {
        if (hammer_undo_space(trans) <
            hammer_undo_max(trans->hmp) * quarter / 4) {
-               kprintf("%c", '0' + quarter);
                return(1);
        } else {
                return(0);
index a1b158b..0f3ec74 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.98 2008/07/10 21:23:58 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.99 2008/07/11 01:22:29 dillon Exp $
  */
 
 #include "hammer.h"
@@ -987,6 +987,7 @@ retry:
                        /*
                         * Root volume count of inodes
                         */
+                       hammer_sync_lock_sh(trans);
                        if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
                                hammer_modify_volume_field(trans,
                                                           trans->rootvol,
@@ -997,6 +998,7 @@ retry:
                                if (hammer_debug_inode)
                                        kprintf("NOWONDISK %p\n", ip);
                        }
+                       hammer_sync_unlock(trans);
                }
        }
 
@@ -1062,21 +1064,25 @@ retry:
                         * Updating MTIME requires an UNDO.  Just cover
                         * both atime and mtime.
                         */
+                       hammer_sync_lock_sh(trans);
                        hammer_modify_buffer(trans, cursor->data_buffer,
                                     HAMMER_ITIMES_BASE(&cursor->data->inode),
                                     HAMMER_ITIMES_BYTES);
                        cursor->data->inode.atime = ip->sync_ino_data.atime;
                        cursor->data->inode.mtime = ip->sync_ino_data.mtime;
                        hammer_modify_buffer_done(cursor->data_buffer);
+                       hammer_sync_unlock(trans);
                } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
                        /*
                         * Updating atime only can be done in-place with
                         * no UNDO.
                         */
+                       hammer_sync_lock_sh(trans);
                        hammer_modify_buffer(trans, cursor->data_buffer,
                                             NULL, 0);
                        cursor->data->inode.atime = ip->sync_ino_data.atime;
                        hammer_modify_buffer_done(cursor->data_buffer);
+                       hammer_sync_unlock(trans);
                }
                ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
        }
@@ -2082,11 +2088,8 @@ done:
         * The finalization lock is already being held by virtue of the
         * flusher calling us.
         */
-        if (hammer_flusher_meta_limit(hmp)) {
-               hammer_unlock(&hmp->flusher.finalize_lock);
+        if (hammer_flusher_meta_limit(hmp))
                 hammer_flusher_finalize(trans, 0);
-               hammer_lock_sh(&hmp->flusher.finalize_lock);
-       }
 
        return(error);
 }
@@ -2303,6 +2306,7 @@ hammer_sync_inode(hammer_inode_t ip)
                        /*
                         * Adjust the inode count in the volume header
                         */
+                       hammer_sync_lock_sh(&trans);
                        if (ip->flags & HAMMER_INODE_ONDISK) {
                                hammer_modify_volume_field(&trans,
                                                           trans.rootvol,
@@ -2310,6 +2314,7 @@ hammer_sync_inode(hammer_inode_t ip)
                                --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
                                hammer_modify_volume_done(trans.rootvol);
                        }
+                       hammer_sync_unlock(&trans);
                } else {
                        Debugger("hammer_ip_delete_clean errored");
                }
index 49486c5..16df3fd 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.10 2008/07/10 04:44:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.11 2008/07/11 01:22:29 dillon Exp $
  */
 /*
  * HAMMER mirroring ioctls - serialize and deserialize modifications made
@@ -560,6 +560,8 @@ hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
         * mirror operations are effective an as-of operation and
         * delete_tid can be 0 for mirroring purposes even if it is
         * not actually 0 at the originator.
+        *
+        * These functions can return EDEADLK
         */
        cursor->key_beg = mrec->leaf.base;
        cursor->flags |= HAMMER_CURSOR_BACKEND;
@@ -567,13 +569,9 @@ hammer_ioc_mirror_write_rec(hammer_cursor_t cursor,
        error = hammer_btree_lookup(cursor);
 
        if (error == 0 && hammer_mirror_check(cursor, mrec)) {
-               hammer_sync_lock_sh(trans);
                error = hammer_mirror_update(cursor, mrec);
-               hammer_sync_unlock(trans);
        } else if (error == ENOENT && mrec->leaf.base.delete_tid == 0) {
-               hammer_sync_lock_sh(trans);
                error = hammer_mirror_write(cursor, mrec, uptr);
-               hammer_sync_unlock(trans);
        } else if (error == ENOENT) {
                error = 0;
        }
@@ -663,6 +661,10 @@ hammer_mirror_delete_at_cursor(hammer_cursor_t cursor,
 {
        hammer_transaction_t trans;
        hammer_btree_elm_t elm;
+       int error;
+
+       if ((error = hammer_cursor_upgrade(cursor)) != 0)
+               return(error);
 
        elm = &cursor->node->ondisk->elms[cursor->index];
        KKASSERT(elm->leaf.base.btype == HAMMER_BTREE_TYPE_RECORD);
@@ -731,6 +733,10 @@ hammer_mirror_update(hammer_cursor_t cursor,
 {
        hammer_transaction_t trans;
        hammer_btree_leaf_elm_t elm;
+       int error;
+
+       if ((error = hammer_cursor_upgrade(cursor)) != 0)
+               return(error);
 
        elm = cursor->leaf;
        trans = cursor->trans;
@@ -741,6 +747,7 @@ hammer_mirror_update(hammer_cursor_t cursor,
                        elm->base.obj_id, elm->base.key);
                return(0);
        }
+       hammer_sync_lock_sh(trans);
 
        KKASSERT(elm->base.create_tid < mrec->leaf.base.delete_tid);
        hammer_modify_node(trans, cursor->node, elm, sizeof(*elm));
@@ -763,6 +770,7 @@ hammer_mirror_update(hammer_cursor_t cursor,
                --trans->hmp->rootvol->ondisk->vol0_stat_inodes;
                hammer_modify_volume_done(trans->rootvol);
        }
+       hammer_sync_unlock(trans);
 
        return(0);
 }
@@ -787,6 +795,11 @@ hammer_mirror_write(hammer_cursor_t cursor,
        trans = cursor->trans;
        data_buffer = NULL;
 
+       /*
+        * Get the sync lock so the whole mess is atomic
+        */
+       hammer_sync_lock_sh(trans);
+
        /*
         * Allocate and adjust data
         */
@@ -873,6 +886,7 @@ failed:
                                     mrec->leaf.data_offset,
                                     mrec->leaf.data_len);
        }
+       hammer_sync_unlock(trans);
        if (data_buffer)
                hammer_rel_buffer(data_buffer, 0);
        return(error);
index b99c470..e7bb5ae 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.85 2008/07/10 04:44:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.86 2008/07/11 01:22:29 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1069,8 +1069,8 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
         * we may have to iterate the low 32 bits of the key to find an unused
         * key.
         */
+       hammer_sync_lock_sh(trans);
        cursor->flags |= HAMMER_CURSOR_INSERT;
-
        error = hammer_btree_lookup(cursor);
        if (hammer_debug_inode)
                kprintf("DOINSERT LOOKUP %d\n", error);
@@ -1088,7 +1088,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
 #endif
 
        if (error != ENOENT)
-               goto done;
+               goto done_unlock;
 
        /*
         * Allocate the record and data.  The result buffers will be
@@ -1116,7 +1116,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
                                          &record->leaf.data_offset,
                                          &cursor->data_buffer, &error);
                if (bdata == NULL)
-                       goto done;
+                       goto done_unlock;
                hammer_crc_set_leaf(record->data, &record->leaf);
                hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
                bcopy(record->data, bdata, record->leaf.data_len);
@@ -1163,7 +1163,8 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
                                             record->leaf.data_len);
                }
        }
-
+done_unlock:
+       hammer_sync_unlock(trans);
 done:
        return(error);
 }
@@ -1971,6 +1972,7 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
         * Adjust the delete_tid.  Update the mirror_tid propagation field
         * as well.
         */
+       hammer_sync_lock_sh(cursor->trans);
        doprop = 0;
        if (delete_flags & HAMMER_DELETE_ADJUST) {
                hammer_modify_node(cursor->trans, node, elm, sizeof(*elm));
@@ -2057,6 +2059,7 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
                KKASSERT(cursor->ip != NULL);
                hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf);
        }
+       hammer_sync_unlock(cursor->trans);
        return (error);
 }
 
index 7f1103e..3b83b7a 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.12 2008/07/04 07:25:36 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.13 2008/07/11 01:22:29 dillon Exp $
  */
 
 #include "hammer.h"
@@ -126,7 +126,6 @@ retry:
         */
        cursor.flags |= HAMMER_CURSOR_PRUNING;
 
-       hammer_sync_lock_sh(trans);
        error = hammer_btree_last(&cursor);
 
        while (error == 0) {
@@ -141,11 +140,6 @@ retry:
                 */
                if ((error = hammer_signal_check(trans->hmp)) != 0)
                        break;
-               if (trans->hmp->sync_lock.wanted) {
-                       hammer_sync_unlock(trans);
-                       tsleep(trans, 0, "hmrslo", hz / 10);
-                       hammer_sync_lock_sh(trans);
-               }
                if (hammer_flusher_meta_limit(trans->hmp) ||
                    hammer_flusher_undo_exhausted(trans, 2)) {
                        error = EWOULDBLOCK;
@@ -178,9 +172,11 @@ retry:
                         */
                        isdir = (elm->base.rec_type == HAMMER_RECTYPE_DIRENTRY);
 
+                       hammer_sync_lock_sh(trans);
                        error = hammer_delete_at_cursor(&cursor,
                                                        HAMMER_DELETE_DESTROY,
                                                        &prune->stat_bytes);
+                       hammer_sync_unlock(trans);
                        if (error)
                                break;
 
@@ -210,7 +206,6 @@ retry:
                ++prune->stat_scanrecords;
                error = hammer_btree_iterate_reverse(&cursor);
        }
-       hammer_sync_unlock(trans);
        if (error == ENOENT)
                error = 0;
        hammer_done_cursor(&cursor);
@@ -299,7 +294,6 @@ prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
        if (cursor->data->inode.nlinks)
                return;
        hammer_cursor_downgrade(cursor);
-       hammer_sync_unlock(cursor->trans);
        ip = hammer_get_inode(cursor->trans, NULL, elm->base.obj_id,
                      HAMMER_MAX_TID,
                      elm->base.localization & HAMMER_LOCALIZE_PSEUDOFS_MASK,
@@ -312,7 +306,6 @@ prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
                kprintf("unable to prune disconnected inode %016llx\n",
                        elm->base.obj_id);
        }
-       hammer_sync_lock_sh(cursor->trans);
 }
 
 #if 0
index ca8a3d8..b60e3c3 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.25 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.26 2008/07/11 01:22:29 dillon Exp $
  */
 /*
  * HAMMER reblocker - This code frees up fragmented physical space
@@ -125,9 +125,6 @@ retry:
                 */
                if ((error = hammer_signal_check(trans->hmp)) != 0)
                        break;
-               if (trans->hmp->sync_lock.wanted) {
-                       tsleep(trans, 0, "hmrslo", hz / 10);
-               }
 
                /*
                 * If we build up too much meta-data we have to wait for
index 83f78f6..573c8e7 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.33 2008/07/10 04:44:33 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.34 2008/07/11 01:22:29 dillon Exp $
  */
 /*
  * HAMMER structural locking
@@ -93,6 +93,9 @@ hammer_lock_ex_try(struct hammer_lock *lock)
 
 /*
  * Obtain a shared lock
+ *
+ * We do not give pending exclusive locks priority over shared locks as
+ * doing so could lead to a deadlock.
  */
 void
 hammer_lock_sh(struct hammer_lock *lock)
@@ -114,31 +117,6 @@ hammer_lock_sh(struct hammer_lock *lock)
        crit_exit();
 }
 
-/*
- * Obtain a shared lock at a lower priority then thread waiting for an
- * exclusive lock.  To avoid a deadlock this may only be done if no other
- * shared locks are being held by the caller.
- */
-void
-hammer_lock_sh_lowpri(struct hammer_lock *lock)
-{
-       KKASSERT(lock->refs > 0);
-       crit_enter();
-       while (lock->locktd != NULL || lock->exwanted) {
-               if (lock->locktd == curthread) {
-                       Debugger("hammer_lock_sh: lock_sh on exclusive");
-                       ++lock->lockcount;
-                       crit_exit();
-                       return;
-               }
-               lock->wanted = 1;
-               tsleep(lock, 0, "hmrlck", 0);
-       }
-       KKASSERT(lock->lockcount <= 0);
-       --lock->lockcount;
-       crit_exit();
-}
-
 int
 hammer_lock_sh_try(struct hammer_lock *lock)
 {
@@ -263,11 +241,16 @@ hammer_unref(struct hammer_lock *lock)
 
 /*
  * The sync_lock must be held when doing any modifying operations on
- * meta-data.  The flusher holds the lock exclusively while the reblocker
- * and pruner use a shared lock.
+ * meta-data.  It does not have to be held when modifying non-meta-data buffers
+ * (backend or frontend).
+ *
+ * The flusher holds the lock exclusively while all other consumers hold it
+ * shared.  All modifying operations made while holding the lock are atomic
+ * in that they will be made part of the same flush group.
  *
- * Modifying operations can run in parallel until the flusher needs to
- * sync the disk media.
+ * Due to the atomicy requirement deadlock recovery code CANNOT release the
+ * sync lock, nor can we give pending exclusive sync locks priority over
+ * a shared sync lock as this could lead to a 3-way deadlock.
  */
 void
 hammer_sync_lock_ex(hammer_transaction_t trans)