HAMMER VFS - Reorganize the use of hammer_del_buffers(), fix rare deadlocks
author     Matthew Dillon <dillon@apollo.backplane.com>
Mon, 26 Jan 2009 06:34:42 +0000 (22:34 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
Mon, 26 Jan 2009 06:34:42 +0000 (22:34 -0800)
The previous attempt at fixing a deadlock due to a hammer_buffer being
held cached across long-blocking procedures failed to catch all cases.
Implement a more complete and less invasive fix.  These buffers are
typically held in the hammer_cursor structure.

Instead of trying to release such cached data buffers proactively, we
now allow hammer_del_buffers() to return a temporary failure when it is
called from the reservation code.  If hammer_del_buffers() is unable to
completely eradicate the underlying buffers because they are still
referenced via the above cache entities, the related hammer_reserve
structure is simply re-added to the delayed-release queue.

The related big-block is thus not reused until all such cached entities
have been dereferenced.  The relaxed requirements on the cached entities
remove the possibility of a deadlock in the invalidation code and also
simplify hammer_io_inval()'s handling of the hammer_buffer<->buffer_cache
coupling case.
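
In outline, the new retry path in hammer_blockmap_reserve_complete()
looks like the condensed sketch below (a paraphrase of the
hammer_blockmap.c hunk further down, with the surrounding reference
accounting omitted):

    /*
     * Condensed sketch, not the verbatim patch.  hammer_del_buffers()
     * now returns EAGAIN if any underlying hammer_buffer is still
     * referenced (e.g. cached by a cursor); in that case the
     * reservation is simply requeued on the delay list instead of
     * being torn down immediately.
     */
    if (resv->refs == 1 && (resv->flags & HAMMER_RESF_LAYER2FREE)) {
            resv->append_off = HAMMER_LARGEBLOCK_SIZE; /* block new allocs */
            resv->flags &= ~HAMMER_RESF_LAYER2FREE;
            error = hammer_del_buffers(hmp, base_offset,
                                       resv->zone_offset,
                                       HAMMER_LARGEBLOCK_SIZE, 0);
            if (error)
                    hammer_reserve_setdelay(hmp, resv); /* try again later */
    }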

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_blockmap.c
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_ondisk.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index 83360b1..7561500 100644
@@ -991,9 +991,10 @@ hammer_buffer_t    hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
                        int bytes, int isnew, int *errorp);
 void           hammer_sync_buffers(hammer_mount_t hmp,
                        hammer_off_t base_offset, int bytes);
-void           hammer_del_buffers(hammer_mount_t hmp,
+int            hammer_del_buffers(hammer_mount_t hmp,
                        hammer_off_t base_offset,
-                       hammer_off_t zone2_offset, int bytes);
+                       hammer_off_t zone2_offset, int bytes,
+                       int report_conflicts);
 
 int            hammer_ref_volume(hammer_volume_t volume);
 int            hammer_ref_buffer(hammer_buffer_t buffer);
@@ -1120,7 +1121,7 @@ void hammer_io_init(hammer_io_t io, hammer_volume_t volume,
 int hammer_io_read(struct vnode *devvp, struct hammer_io *io,
                        hammer_off_t limit);
 int hammer_io_new(struct vnode *devvp, struct hammer_io *io);
-void hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset);
+int hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset);
 struct buf *hammer_io_release(struct hammer_io *io, int flush);
 void hammer_io_flush(struct hammer_io *io);
 void hammer_io_wait(struct hammer_io *io);
diff --git a/sys/vfs/hammer/hammer_blockmap.c b/sys/vfs/hammer/hammer_blockmap.c
index b3d9fbe..216b0d8 100644
 #include "hammer.h"
 
 static int hammer_res_rb_compare(hammer_reserve_t res1, hammer_reserve_t res2);
-static void hammer_reserve_setdelay(hammer_mount_t hmp,
+static void hammer_reserve_setdelay_offset(hammer_mount_t hmp,
                                    hammer_off_t base_offset,
                                    struct hammer_blockmap_layer2 *layer2);
-
+static void hammer_reserve_setdelay(hammer_mount_t hmp, hammer_reserve_t resv);
 
 /*
  * Reserved big-blocks red-black tree support
@@ -580,6 +580,7 @@ void
 hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv)
 {
        hammer_off_t base_offset;
+       int error;
 
        KKASSERT(resv->refs > 0);
        KKASSERT((resv->zone_offset & HAMMER_OFF_ZONE_MASK) ==
@@ -589,14 +590,21 @@ hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv)
         * Setting append_off to the max prevents any new allocations
         * from occuring while we are trying to dispose of the reservation,
         * allowing us to safely delete any related HAMMER buffers.
+        *
+        * If we are unable to clean out all related HAMMER buffers we
+        * requeue the delay.
         */
        if (resv->refs == 1 && (resv->flags & HAMMER_RESF_LAYER2FREE)) {
                resv->append_off = HAMMER_LARGEBLOCK_SIZE;
                resv->flags &= ~HAMMER_RESF_LAYER2FREE;
                base_offset = resv->zone_offset & ~HAMMER_ZONE_RAW_BUFFER;
                base_offset = HAMMER_ZONE_ENCODE(base_offset, resv->zone);
-               hammer_del_buffers(hmp, base_offset, resv->zone_offset,
-                                  HAMMER_LARGEBLOCK_SIZE);
+               error = hammer_del_buffers(hmp, base_offset,
+                                          resv->zone_offset,
+                                          HAMMER_LARGEBLOCK_SIZE,
+                                          0);
+               if (error)
+                       hammer_reserve_setdelay(hmp, resv);
        }
        if (--resv->refs == 0) {
                KKASSERT((resv->flags & HAMMER_RESF_ONDELAY) == 0);
@@ -615,7 +623,7 @@ hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv)
  * reservation has been allocated.
  */
 static void
-hammer_reserve_setdelay(hammer_mount_t hmp, hammer_off_t base_offset,
+hammer_reserve_setdelay_offset(hammer_mount_t hmp, hammer_off_t base_offset,
                        struct hammer_blockmap_layer2 *layer2)
 {
        hammer_reserve_t resv;
@@ -639,11 +647,15 @@ again:
                }
                ++hammer_count_reservations;
        }
+}
 
-       /*
-        * Enter the reservation on the on-delay list, or move it if it
-        * is already on the list.
-        */
+/*
+ * Enter the reservation on the on-delay list, or move it if it
+ * is already on the list.
+ */
+static void
+hammer_reserve_setdelay(hammer_mount_t hmp, hammer_reserve_t resv)
+{
        if (resv->flags & HAMMER_RESF_ONDELAY) {
                TAILQ_REMOVE(&hmp->delay_list, resv, delay_entry);
                resv->flush_group = hmp->flusher.next + 1;
@@ -776,7 +788,7 @@ hammer_blockmap_free(hammer_transaction_t trans,
        if (layer2->bytes_free == HAMMER_LARGEBLOCK_SIZE) {
                base_off = (zone_offset & (~HAMMER_LARGEBLOCK_MASK64 & ~HAMMER_OFF_ZONE_MASK)) | HAMMER_ZONE_RAW_BUFFER;
 
-               hammer_reserve_setdelay(hmp, base_off, layer2);
+               hammer_reserve_setdelay_offset(hmp, base_off, layer2);
                if (layer2->bytes_free == HAMMER_LARGEBLOCK_SIZE) {
                        layer2->zone = 0;
                        layer2->append_off = 0;
diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c
index dc048dc..d6ce439 100644
@@ -489,24 +489,11 @@ hammer_cursor_down(hammer_cursor_t cursor)
 void
 hammer_unlock_cursor(hammer_cursor_t cursor)
 {
-       hammer_buffer_t data_buffer;
        hammer_node_t node;
 
        KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
        KKASSERT(cursor->node);
 
-       /*
-        * We must release any cached data buffer held by the cursor,
-        * otherwise we can deadlock against other threads attempting
-        * to invalidate the underlying buffer cache buffers which
-        * become stale due to a pruning or reblocking operation.
-        */
-       if ((data_buffer = cursor->data_buffer) != NULL) {
-               cursor->data_buffer = NULL;
-               cursor->data = NULL;
-               hammer_rel_buffer(data_buffer, 0);
-       }
-
        /*
         * Release the cursor's locks and track B-Tree operations on node.
         * While being tracked our cursor can be modified by other threads
diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c
index 561a567..8d1fef4 100644
@@ -253,18 +253,23 @@ hammer_io_new(struct vnode *devvp, struct hammer_io *io)
 
 /*
  * Remove potential device level aliases against buffers managed by high level
- * vnodes.  Aliases can also be created due to mixed buffer sizes.
+ * vnodes.  Aliases can also be created due to mixed buffer sizes or via
+ * direct access to the backing store device.
  *
  * This is nasty because the buffers are also VMIO-backed.  Even if a buffer
  * does not exist its backing VM pages might, and we have to invalidate
  * those as well or a getblk() will reinstate them.
+ *
+ * Buffer cache buffers associated with hammer_buffers cannot be
+ * invalidated.
  */
-void
+int
 hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
 {
        hammer_io_structure_t iou;
        hammer_off_t phys_offset;
        struct buf *bp;
+       int error;
 
        phys_offset = volume->ondisk->vol_buf_beg +
                      (zone2_offset & HAMMER_OFF_SHORT_MASK);
@@ -274,6 +279,7 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
        else
                bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0);
        if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) {
+#if 0
                hammer_ref(&iou->io.lock);
                hammer_io_clear_modify(&iou->io, 1);
                bundirty(bp);
@@ -284,13 +290,17 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
                KKASSERT(iou->io.lock.refs == 1);
                hammer_rel_buffer(&iou->buffer, 0);
                /*hammer_io_deallocate(bp);*/
+#endif
+               error = EAGAIN;
        } else {
                KKASSERT((bp->b_flags & B_LOCKED) == 0);
                bundirty(bp);
                bp->b_flags |= B_NOCACHE|B_RELBUF;
                brelse(bp);
+               error = 0;
        }
        crit_exit();
+       return(error);
 }
 
 /*
@@ -1318,14 +1328,20 @@ hammer_io_direct_wait(hammer_record_t record)
        }
 
        /*
-        * Invalidate any related buffer cache aliases.
+        * Invalidate any related buffer cache aliases associated with the
+        * backing device.  This is needed because the buffer cache buffer
+        * for file data is associated with the file vnode, not the backing
+        * device vnode.
+        *
+        * XXX I do not think this case can occur any more now that
+        * reservations ensure that all such buffers are removed before
+        * an area can be reused.
         */
        if (record->flags & HAMMER_RECF_DIRECT_INVAL) {
                KKASSERT(record->leaf.data_offset);
-               hammer_del_buffers(record->ip->hmp,
-                                  record->leaf.data_offset,
-                                  record->zone2_offset,
-                                  record->leaf.data_len);
+               hammer_del_buffers(record->ip->hmp, record->leaf.data_offset,
+                                  record->zone2_offset, record->leaf.data_len,
+                                  1);
                record->flags &= ~HAMMER_RECF_DIRECT_INVAL;
        }
 }
diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c
index 656b7b9..44e4e00 100644
@@ -60,6 +60,10 @@ hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
        return(0);
 }
 
+/*
+ * hammer_buffer structures are indexed via their zoneX_offset, not
+ * their zone2_offset.
+ */
 static int
 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
 {
@@ -503,8 +507,14 @@ hammer_mountcheck_volumes(struct hammer_mount *hmp)
  *                             BUFFERS                                 *
  ************************************************************************
  *
- * Manage buffers.  Currently all blockmap-backed zones are translated
- * to zone-2 buffer offsets.
+ * Manage buffers.  Currently all blockmap-backed zones are direct-mapped
+ * to zone-2 buffer offsets, without a translation stage.  However, the
+ * hammer_buffer structure is indexed by its zoneX_offset, not its
+ * zone2_offset.
+ *
+ * The proper zone must be maintained throughout the code-base all the way
+ * through to the big-block allocator, or routines like hammer_del_buffers()
+ * will not be able to locate all potentially conflicting buffers.
  */
 hammer_buffer_t
 hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
@@ -701,25 +711,33 @@ hammer_sync_buffers(hammer_mount_t hmp, hammer_off_t base_offset, int bytes)
  *
  * The buffers may be referenced by the caller itself.  Setting reclaim
  * will cause the buffer to be destroyed when it's ref count reaches zero.
+ *
+ * Return 0 on success, EAGAIN if some buffers could not be destroyed due
+ * to additional references held by other threads, or some other (typically
+ * fatal) error.
  */
-void
+int
 hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset,
-                  hammer_off_t zone2_offset, int bytes)
+                  hammer_off_t zone2_offset, int bytes,
+                  int report_conflicts)
 {
        hammer_buffer_t buffer;
        hammer_volume_t volume;
        int vol_no;
        int error;
+       int ret_error;
 
        vol_no = HAMMER_VOL_DECODE(zone2_offset);
-       volume = hammer_get_volume(hmp, vol_no, &error);
-       KKASSERT(error == 0);
+       volume = hammer_get_volume(hmp, vol_no, &ret_error);
+       KKASSERT(ret_error == 0);
 
        while (bytes > 0) {
                buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
                                   base_offset);
                if (buffer) {
                        error = hammer_ref_buffer(buffer);
+                       if (error == 0 && buffer->io.lock.refs != 1)
+                               error = EAGAIN;
                        if (error == 0) {
                                KKASSERT(buffer->zone2_offset == zone2_offset);
                                hammer_io_clear_modify(&buffer->io, 1);
@@ -729,13 +747,19 @@ hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset,
                                hammer_rel_buffer(buffer, 0);
                        }
                } else {
-                       hammer_io_inval(volume, zone2_offset);
+                       error = hammer_io_inval(volume, zone2_offset);
+               }
+               if (error) {
+                       ret_error = error;
+                       if (report_conflicts || (hammer_debug_general & 0x8000))
+                               kprintf("hammer_del_buffers: unable to invalidate %016llx rep=%d\n", base_offset, report_conflicts);
                }
                base_offset += HAMMER_BUFSIZE;
                zone2_offset += HAMMER_BUFSIZE;
                bytes -= HAMMER_BUFSIZE;
        }
        hammer_rel_volume(volume, 0);
+       return (ret_error);
 }
 
 static int
@@ -920,6 +944,9 @@ hammer_rel_buffer(hammer_buffer_t buffer, int flush)
  *
  * Any prior buffer in *bufferp will be released and replaced by the
  * requested buffer.
+ *
+ * NOTE: The buffer is indexed via its zoneX_offset but we allow the
+ * passed cached *bufferp to match against either zoneX or zone2.
  */
 static __inline
 void *