HAMMER 54D/Many: Performance tuning.
authorMatthew Dillon <dillon@dragonflybsd.org>
Fri, 13 Jun 2008 00:25:33 +0000 (00:25 +0000)
committerMatthew Dillon <dillon@dragonflybsd.org>
Fri, 13 Jun 2008 00:25:33 +0000 (00:25 +0000)
* Remove major barriers to write performance and fix hiccups revealed by
  blogbench.

  Change the HAMMER reclaim-delay algorithm to operate like a FIFO instead
  of as a free-for-all.  The idea of introducing a dynamic delay helped some,
  but the addition of the wakeup FIFO allows burst completions by the flusher
  to immediately wakeup processes that were waiting for the reclaim count to
  drain.  The result is far, far smoother operation.

* Remove a major blocking conflict between the buffer cache daemon and
  HAMMER.  The buffer cache was getting stuck on trying to overwrite dirty
  records that had already been queued to the flusher.  The flusher might
  not act on the record(s) for a long period of time, causing the buffer
  cache daemon to stall.

  Fix the problem by properly using the HAMMER_RECF_INTERLOCK_BE flag,
  which stays on only for a very short period of time, instead of testing
  the record's flush state (record->flush_state), which can stay in
  the HAMMER_FST_FLUSH state for a very long time.

* The parent B-Tree node does not need to be locked when inserting
  into the child.

* Use the new B_AGE semantics to keep meta-data intact longer.  This results
  in another big improvement in random read and write performance.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

index 7b7e27d..ef1c5f7 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.82 2008/06/12 00:16:10 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.83 2008/06/13 00:25:33 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -210,7 +210,6 @@ struct hammer_inode {
        int                     cursor_ip_refs; /* sanity */
        int                     rsv_databufs;
        int                     rsv_recs;
-       int                     idle_wakeup;
        struct vnode            *vp;
        struct lockf            advlock;
        struct hammer_lock      lock;           /* sync copy interlock */
@@ -271,9 +270,17 @@ typedef struct hammer_inode *hammer_inode_t;
 #define HAMMER_FLUSH_SIGNAL    0x0001
 #define HAMMER_FLUSH_RECURSION 0x0002
 
-#define HAMMER_RECLAIM_MIN     2000    /* absolute value */
-#define HAMMER_RECLAIM_MID     4000    /* absolute value */
-#define HAMMER_RECLAIM_MAX     6000    /* absolute value */
+/*
+ * Used by the inode reclaim code to pipeline reclaims and avoid
+ * blowing out kernel memory or letting the flusher get too far
+ * behind.
+ */
+struct hammer_reclaim {
+       TAILQ_ENTRY(hammer_reclaim) entry;
+       int     okydoky;
+};
+
+#define HAMMER_RECLAIM_PIPESIZE        1000
 
 /*
  * Structure used to represent an unsynchronized record in-memory.  These
@@ -328,7 +335,6 @@ typedef struct hammer_record *hammer_record_t;
 #define HAMMER_RECF_UNUSED0010         0x0010
 #define HAMMER_RECF_INTERLOCK_BE       0x0020  /* backend interlock */
 #define HAMMER_RECF_WANTED             0x0040  /* wanted by the frontend */
-#define HAMMER_RECF_WANTIDLE           0x0080  /* wanted when idle */
 #define HAMMER_RECF_CONVERT_DELETE     0x0100 /* special case */
 
 /*
@@ -657,11 +663,12 @@ struct hammer_mount {
        TAILQ_HEAD(, hammer_inode) flush_list;
        TAILQ_HEAD(, hammer_reserve) delay_list;
        TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
+       TAILQ_HEAD(, hammer_reclaim) reclaim_list;
 };
 
 typedef struct hammer_mount    *hammer_mount_t;
 
-#define HAMMER_MOUNT_WAITIMAX  0x0001
+#define HAMMER_MOUNT_UNUSED0001        0x0001
 
 struct hammer_sync_info {
        int error;
@@ -686,7 +693,6 @@ extern int hammer_debug_btree;
 extern int hammer_debug_tid;
 extern int hammer_debug_recover;
 extern int hammer_debug_recover_faults;
-extern int hammer_debug_write_release;
 extern int hammer_debug_cluster_enable;
 extern int hammer_count_inodes;
 extern int hammer_count_iqueued;
@@ -750,6 +756,7 @@ int hammer_cursor_up(hammer_cursor_t cursor);
 int    hammer_cursor_up_locked(hammer_cursor_t cursor);
 int    hammer_cursor_down(hammer_cursor_t cursor);
 int    hammer_cursor_upgrade(hammer_cursor_t cursor);
+int    hammer_cursor_upgrade_node(hammer_cursor_t cursor);
 void   hammer_cursor_downgrade(hammer_cursor_t cursor);
 int    hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node,
                        int index);
@@ -914,7 +921,6 @@ int hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2);
 int hammer_sync_inode(hammer_inode_t ip);
 void hammer_test_inode(hammer_inode_t ip);
 void hammer_inode_unloadable_check(hammer_inode_t ip, int getvp);
-void hammer_inode_waitreclaims(hammer_inode_t ip);
 
 int  hammer_ip_add_directory(struct hammer_transaction *trans,
                        hammer_inode_t dip, struct namecache *ncp,
index e54b11f..664ca7d 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.51 2008/06/10 22:30:21 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.52 2008/06/13 00:25:33 dillon Exp $
  */
 
 /*
@@ -660,7 +660,7 @@ hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
        int i;
        int error;
 
-       if ((error = hammer_cursor_upgrade(cursor)) != 0)
+       if ((error = hammer_cursor_upgrade_node(cursor)) != 0)
                return(error);
 
        /*
@@ -1316,11 +1316,11 @@ btree_split_internal(hammer_cursor_t cursor)
        int i;
        const int esize = sizeof(*elm);
 
-       if ((error = hammer_cursor_upgrade(cursor)) != 0)
-               return(error);
        error = hammer_btree_lock_children(cursor, &locklist);
        if (error)
                goto done;
+       if ((error = hammer_cursor_upgrade(cursor)) != 0)
+               goto done;
 
        /* 
         * We are splitting but elms[split] will be promoted to the parent,
@@ -2116,6 +2116,27 @@ hammer_btree_lock_children(hammer_cursor_t cursor,
        node = cursor->node;
        ondisk = node->ondisk;
        error = 0;
+
+       /*
+        * We really do not want to block on I/O with exclusive locks held,
+        * pre-get the children before trying to lock the mess.
+        */
+       for (i = 0; i < ondisk->count; ++i) {
+               elm = &ondisk->elms[i];
+               if (elm->base.btype != HAMMER_BTREE_TYPE_LEAF &&
+                   elm->base.btype != HAMMER_BTREE_TYPE_INTERNAL) {
+                       continue;
+               }
+               child = hammer_get_node(node->hmp,
+                                       elm->internal.subtree_offset,
+                                       0, &error);
+               if (child)
+                       hammer_rel_node(child);
+       }
+
+       /*
+        * Do it for real
+        */
        for (i = 0; error == 0 && i < ondisk->count; ++i) {
                elm = &ondisk->elms[i];
 
index 5dfeef6..2252a4f 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.28 2008/06/11 22:33:21 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.29 2008/06/13 00:25:33 dillon Exp $
  */
 
 /*
@@ -206,13 +206,11 @@ hammer_done_cursor(hammer_cursor_t cursor)
                hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
                hammer_unlock(&cursor->deadlk_node->lock);
                hammer_rel_node(cursor->deadlk_node);
-               tsleep(&cursor->deadlk_node, 0, "hmrdel", 1);
                cursor->deadlk_node = NULL;
        }
        if (cursor->deadlk_rec) {
                hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
                hammer_rel_mem_record(cursor->deadlk_rec);
-               tsleep(&cursor->deadlk_rec, 0, "hmrdel", 1);
                cursor->deadlk_rec = NULL;
        }
 
@@ -253,6 +251,19 @@ hammer_cursor_upgrade(hammer_cursor_t cursor)
        return(error);
 }
 
+int
+hammer_cursor_upgrade_node(hammer_cursor_t cursor)
+{
+       int error;
+
+       error = hammer_lock_upgrade(&cursor->node->lock);
+       if (error && cursor->deadlk_node == NULL) {
+               cursor->deadlk_node = cursor->node;
+               hammer_ref_node(cursor->deadlk_node);
+       }
+       return(error);
+}
+
 /*
  * Downgrade cursor->node and cursor->parent to shared locks.  This
  * function can return EDEADLK.
index d117b43..9acbe16 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.25 2008/06/11 22:33:21 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.26 2008/06/13 00:25:33 dillon Exp $
  */
 /*
  * HAMMER dependancy flusher thread
@@ -63,6 +63,10 @@ struct hammer_flusher_info {
 
 typedef struct hammer_flusher_info *hammer_flusher_info_t;
 
+/*
+ * Sync all inodes pending on the flusher.  This routine may have to be
+ * called twice to get them all as some may be queued to a later flush group.
+ */
 void
 hammer_flusher_sync(hammer_mount_t hmp)
 {
@@ -77,6 +81,9 @@ hammer_flusher_sync(hammer_mount_t hmp)
        }
 }
 
+/*
+ * Sync all inodes pending on the flusher - return immediately.
+ */
 void
 hammer_flusher_async(hammer_mount_t hmp)
 {
index 2c02f89..fb74ed5 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.73 2008/06/12 01:55:58 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.74 2008/06/13 00:25:33 dillon Exp $
  */
 
 #include "hammer.h"
@@ -43,7 +43,7 @@ static int    hammer_unload_inode(struct hammer_inode *ip);
 static void    hammer_flush_inode_core(hammer_inode_t ip, int flags);
 static int     hammer_setup_child_callback(hammer_record_t rec, void *data);
 static int     hammer_setup_parent_inodes(hammer_record_t record);
-static void    hammer_inode_wakereclaims(hammer_mount_t hmp);
+static void    hammer_inode_wakereclaims(hammer_inode_t ip);
 
 #ifdef DEBUG_TRUNCATE
 extern struct hammer_inode *HammerTruncIp;
@@ -97,9 +97,11 @@ hammer_vop_inactive(struct vop_inactive_args *ap)
 int
 hammer_vop_reclaim(struct vop_reclaim_args *ap)
 {
-       hammer_mount_t hmp;
+       struct hammer_reclaim reclaim;
        struct hammer_inode *ip;
+       hammer_mount_t hmp;
        struct vnode *vp;
+       int delay;
 
        vp = ap->a_vp;
 
@@ -107,12 +109,56 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap)
                hmp = ip->hmp;
                vp->v_data = NULL;
                ip->vp = NULL;
-               if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
+
+               /*
+                * Setup our reclaim pipeline.  We only let so many detached
+                * (and dirty) inodes build up before we start blocking.  Do
+                * not bother tracking the immediate increment/decrement if
+                * the inode is not actually dirty.
+                *
+                * When we block we don't care *which* inode has finished
+                * reclaiming, as long as one does.
+                */
+               if ((ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
+                   ((ip->flags|ip->sync_flags) & HAMMER_INODE_MODMASK)) {
                        ++hammer_count_reclaiming;
                        ++hmp->inode_reclaims;
                        ip->flags |= HAMMER_INODE_RECLAIM;
+                       if (hmp->inode_reclaims > HAMMER_RECLAIM_PIPESIZE) {
+                               reclaim.okydoky = 0;
+                               TAILQ_INSERT_TAIL(&hmp->reclaim_list,
+                                                 &reclaim, entry);
+                       } else {
+                               reclaim.okydoky = 1;
+                       }
+               } else {
+                       reclaim.okydoky = 1;
                }
                hammer_rel_inode(ip, 1);
+
+               /*
+                * Reclaim pipeline.  We can't let too many reclaimed inodes
+                * build-up in the flusher or the flusher loses its locality
+                * of reference, or worse blows out our memory.  Once we have
+                * exceeded the reclaim pipe size start slowing down.  Our
+                * imposed delay can be cut short if the flusher catches up
+                * to us.
+                */
+               if (reclaim.okydoky == 0) {
+                       delay = (hmp->inode_reclaims -
+                                HAMMER_RECLAIM_PIPESIZE) * hz /
+                               HAMMER_RECLAIM_PIPESIZE;
+                       if (delay <= 0)
+                               delay = 1;
+                       hammer_flusher_async(hmp);
+                       if (reclaim.okydoky == 0) {
+                               tsleep(&reclaim, 0, "hmrrcm", delay);
+                       }
+                       if (reclaim.okydoky == 0) {
+                               TAILQ_REMOVE(&hmp->reclaim_list, &reclaim,
+                                            entry);
+                       }
+               }
        }
        return(0);
 }
@@ -151,13 +197,7 @@ hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
                        vp->v_type =
                                hammer_get_vnode_type(ip->ino_data.obj_type);
 
-                       if (ip->flags & HAMMER_INODE_RECLAIM) {
-                               --hammer_count_reclaiming;
-                               --hmp->inode_reclaims;
-                               ip->flags &= ~HAMMER_INODE_RECLAIM;
-                               if (hmp->flags & HAMMER_MOUNT_WAITIMAX)
-                                       hammer_inode_wakereclaims(hmp);
-                       }
+                       hammer_inode_wakereclaims(ip);
 
                        switch(ip->ino_data.obj_type) {
                        case HAMMER_OBJTYPE_CDEV:
@@ -732,13 +772,7 @@ hammer_unload_inode(struct hammer_inode *ip)
        --hammer_count_inodes;
        --hmp->count_inodes;
 
-       if (ip->flags & HAMMER_INODE_RECLAIM) {
-               --hammer_count_reclaiming;
-               --hmp->inode_reclaims;
-               ip->flags &= ~HAMMER_INODE_RECLAIM;
-               if (hmp->flags & HAMMER_MOUNT_WAITIMAX)
-                       hammer_inode_wakereclaims(hmp);
-       }
+       hammer_inode_wakereclaims(ip);
        kfree(ip, M_HAMMER);
 
        return(0);
@@ -789,12 +823,15 @@ hammer_modify_inode(hammer_inode_t ip, int flags)
 
 /*
  * Request that an inode be flushed.  This whole mess cannot block and may
- * recurse.  Once requested HAMMER will attempt to actively flush it until
- * the flush can be done.
+ * recurse (if not synchronous).  Once requested HAMMER will attempt to
+ * actively flush the inode until the flush can be done.
  *
  * The inode may already be flushing, or may be in a setup state.  We can
  * place the inode in a flushing state if it is currently idle and flag it
  * to reflush if it is currently flushing.
+ *
+ * If the HAMMER_FLUSH_SYNCHRONOUS flag is specified we will attempt to
+ * flush the inode synchronously using the caller's context.
  */
 void
 hammer_flush_inode(hammer_inode_t ip, int flags)
@@ -1114,10 +1151,16 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
        int r;
 
        /*
-        * If the record has been deleted by the backend (it's being held
-        * by the frontend in a race), just ignore it.
+        * Deleted records are ignored.  Note that the flush detects deleted
+        * front-end records at multiple points to deal with races.  This is
+        * just the first line of defense.  The only time DELETED_FE cannot
+        * be set is when HAMMER_RECF_INTERLOCK_BE is set. 
+        *
+        * Don't get confused between record deletion and, say, directory
+        * entry deletion.  The deletion of a directory entry that is on
+        * the media has nothing to do with the record deletion flags.
         */
-       if (rec->flags & HAMMER_RECF_DELETED_BE)
+       if (rec->flags & (HAMMER_RECF_DELETED_FE|HAMMER_RECF_DELETED_BE))
                return(0);
 
        /*
@@ -1441,19 +1484,30 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
        }
 
        /*
-        * If DELETED_FE is set we may have already sent dependant pieces
-        * to the disk and we must flush the record as if it hadn't been
-        * deleted.  This creates a bit of a mess because we have to
-        * have ip_sync_record convert the record to MEM_RECORD_DEL before
-        * it inserts the B-Tree record.  Otherwise the media sync might
-        * be visible to the frontend.
+        * If DELETED_FE is set special handling is needed for directory
+        * entries.  Dependent pieces related to the directory entry may
+        * have already been synced to disk.  If this occurs we have to
+        * sync the directory entry and then change the in-memory record
+        * from an ADD to a DELETE to cover the fact that it's been
+        * deleted by the frontend.
+        *
+        * A directory delete covering record (MEM_RECORD_DEL) can never
+        * be deleted by the frontend.
+        *
+        * Any other record type (aka DATA) can be deleted by the frontend.
+        * XXX At the moment the flusher must skip it because there may
+        * be another data record in the flush group for the same block,
+        * meaning that some frontend data changes can leak into the backend's
+        * synchronization point.
         */
        if (record->flags & HAMMER_RECF_DELETED_FE) {
                if (record->type == HAMMER_MEM_RECORD_ADD) {
                        record->flags |= HAMMER_RECF_CONVERT_DELETE;
                } else {
                        KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
-                       return(0);
+                       record->flags |= HAMMER_RECF_DELETED_BE;
+                       error = 0;
+                       goto done;
                }
        }
 
@@ -1829,60 +1883,30 @@ hammer_test_inode(hammer_inode_t ip)
 }
 
 /*
- * We need to slow down user processes if we get too large a backlog of
- * inodes in the flusher.  Even though the frontend can theoretically
- * get way, way ahead of the flusher, if we let it do that the flusher
- * will have no buffer cache locality of reference and will have to re-read
- * everything a second time, causing performance to drop precipitously.
+ * Clear the RECLAIM flag on an inode.  This occurs when the inode is
+ * reassociated with a vp or just before it gets freed.
  *
- * Reclaims are especially senssitive to this effect because the kernel has
- * already abandoned the related vnode.
+ * Wakeup one thread blocked waiting on reclaims to complete.  Note that
+ * the inode the thread is waiting on behalf of is a different inode than
+ * the inode we are called with.  This is to create a pipeline.
  */
-
-void
-hammer_inode_waitreclaims(hammer_inode_t ip)
+static void
+hammer_inode_wakereclaims(hammer_inode_t ip)
 {
+       struct hammer_reclaim *reclaim;
        hammer_mount_t hmp = ip->hmp;
-       int delay;
-       int factor;
-       int flags = (ip->flags | ip->sync_flags);
 
-       if ((flags & HAMMER_INODE_MODMASK) == 0)
+       if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
                return;
-       if ((flags & (HAMMER_INODE_MODMASK & ~HAMMER_INODE_MODEASY)) == 0) {
-               factor = 2;
-       } else {
-               factor = 1;
-       }
 
-       while (hmp->inode_reclaims > HAMMER_RECLAIM_MIN) {
-               if (hmp->inode_reclaims < HAMMER_RECLAIM_MID) {
-                       hammer_flusher_async(hmp);
-                       break;
-               }
-               if (hmp->inode_reclaims < HAMMER_RECLAIM_MAX) {
-                       delay = (hmp->inode_reclaims - HAMMER_RECLAIM_MID) *
-                               hz / (HAMMER_RECLAIM_MAX - HAMMER_RECLAIM_MID);
-                       delay = delay / factor;
-                       if (delay == 0)
-                               delay = 1;
-                       hammer_flusher_async(hmp);
-                       tsleep(&delay, 0, "hmitik", delay);
-                       break;
-               }
-               hmp->flags |= HAMMER_MOUNT_WAITIMAX;
-               hammer_flusher_async(hmp);
-               tsleep(&hmp->inode_reclaims, 0, "hmimax", hz / 10);
-       }
-}
+       --hammer_count_reclaiming;
+       --hmp->inode_reclaims;
+       ip->flags &= ~HAMMER_INODE_RECLAIM;
 
-void
-hammer_inode_wakereclaims(hammer_mount_t hmp)
-{
-       if ((hmp->flags & HAMMER_MOUNT_WAITIMAX) &&
-           hmp->inode_reclaims < HAMMER_RECLAIM_MAX) {
-               hmp->flags &= ~HAMMER_MOUNT_WAITIMAX;
-               wakeup(&hmp->inode_reclaims);
+       if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
+               TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
+               reclaim->okydoky = 1;
+               wakeup(reclaim);
        }
 }
 
index 3bbe1ff..5baeef1 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.39 2008/06/11 22:33:21 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.40 2008/06/13 00:25:33 dillon Exp $
  */
 /*
  * IO Primitives and buffer cache management
@@ -671,7 +671,7 @@ hammer_io_start(struct buf *bp)
 }
 
 /*
- * Post-IO completion kernel callback
+ * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
  *
  * NOTE: HAMMER may modify a buffer after initiating I/O.  The modified bit
  * may also be set if we were marking a cluster header open.  Only remove
@@ -718,7 +718,7 @@ hammer_io_complete(struct buf *bp)
  * Callback from kernel when it wishes to deallocate a passively
  * associated structure.  This mostly occurs with clean buffers
  * but it may be possible for a holding structure to be marked dirty
- * while its buffer is passively associated.
+ * while its buffer is passively associated.  The caller owns the bp.
  *
  * If we cannot disassociate we set B_LOCKED to prevent the buffer
  * from getting reused.
@@ -726,6 +726,8 @@ hammer_io_complete(struct buf *bp)
  * WARNING: Because this can be called directly by getnewbuf we cannot
  * recurse into the tree.  If a bp cannot be immediately disassociated
  * our only recourse is to set B_LOCKED.
+ *
+ * WARNING: This may be called from an interrupt via hammer_io_complete()
  */
 static void
 hammer_io_deallocate(struct buf *bp)
@@ -960,8 +962,6 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
                        nbio = push_bio(bio);
                        nbio->bio_offset = volume->ondisk->vol_buf_beg +
                                           zone2_offset;
-                       if (hammer_debug_write_release & 1)
-                               nbio->bio_buf->b_flags |= B_RELBUF|B_NOCACHE;
                        vn_strategy(volume->devvp, nbio);
                }
                hammer_rel_volume(volume, 0);
@@ -971,10 +971,11 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
                ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
                if (error == 0) {
                        bp = bio->bio_buf;
+                       bp->b_flags |= B_AGE;
                        hammer_io_modify(&buffer->io, 1);
                        bcopy(bp->b_data, ptr, leaf->data_len);
                        hammer_io_modify_done(&buffer->io);
-                       hammer_rel_buffer(buffer, (hammer_debug_write_release & 2));
+                       hammer_rel_buffer(buffer, 0);
                        bp->b_resid = 0;
                        biodone(bio);
                }
index f09c4da..4ea25b2 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.66 2008/06/11 22:33:21 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.67 2008/06/13 00:25:33 dillon Exp $
  */
 
 #include "hammer.h"
@@ -374,11 +374,6 @@ hammer_rel_mem_record(struct hammer_record *record)
                 * loop up and do a relookup.
                 */
                ip = record->ip;
-               if (record->flags & HAMMER_RECF_WANTIDLE) {
-                       record->flags &= ~HAMMER_RECF_WANTIDLE;
-                       ++ip->idle_wakeup;
-                       wakeup(&ip->idle_wakeup);
-               }
 
                /*
                 * Upon release of the last reference a record marked deleted
@@ -813,28 +808,26 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
        hammer_record_t record;
        hammer_record_t conflict;
        int zone;
-       int save_wakeup;
 
        /*
-        * Deal with conflicting in-memory records.
+        * Deal with conflicting in-memory records.  We cannot have multiple
+        * in-memory records for the same offset without seriously confusing
+        * the backend, including but not limited to the backend issuing
+        * delete-create-delete sequences and asserting on the delete_tid
+        * being the same as the create_tid.
         *
-        * We must wait for the record to become idle so we can ensure
-        * its deletion.
+        * If we encounter a record with the backend interlock set we cannot
+        * immediately delete it without confusing the backend.
         */
        while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) !=NULL) {
-               if (conflict->lock.refs != 1) {
-                       conflict->flags |= HAMMER_RECF_WANTIDLE;
-                       save_wakeup = ip->idle_wakeup;
-                       hammer_rel_mem_record(conflict);
-                       hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
-                       if (save_wakeup == ip->idle_wakeup)
-                               tsleep(&ip->idle_wakeup, 0, "hmrrc3", 0);
-               } else {
-                       /* flush state adds a ref, shouldn't be posible */
-                       KKASSERT(conflict->flush_state != HAMMER_FST_FLUSH);
-                       conflict->flags |= HAMMER_RECF_DELETED_FE;
+               if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) {
+                       conflict->flags |= HAMMER_RECF_WANTED;
+                       tsleep(conflict, 0, "hmrrc3", 0);
                        hammer_rel_mem_record(conflict);
+                       continue;
                }
+               conflict->flags |= HAMMER_RECF_DELETED_FE;
+               hammer_rel_mem_record(conflict);
        }
 
        /*
index a972ef1..e232453 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.55 2008/06/12 00:16:10 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.56 2008/06/13 00:25:33 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -514,13 +514,19 @@ again:
                if (buffer->io.lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&buffer->io.lock);
+
+               /*
+                * Once refed the ondisk field will not be cleared by
+                * any other action.
+                */
                if (buffer->ondisk && buffer->io.loading == 0) {
                        *errorp = 0;
                        return(buffer);
                }
 
                /*
-                * The buffer is no longer loose if it has a ref.  Loose
+                * The buffer is no longer loose if it has a ref, and
+                * cannot become loose once it gains a ref.  Loose
                 * buffers will never be in a modified state.  This should
                 * only occur on the 0->1 transition of refs.
                 */
index 7a35864..911490d 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.46 2008/06/12 00:16:10 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.47 2008/06/13 00:25:33 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -56,7 +56,6 @@ int hammer_debug_btree;
 int hammer_debug_tid;
 int hammer_debug_recover;              /* -1 will disable, +1 will force */
 int hammer_debug_recover_faults;
-int hammer_debug_write_release;                /* if 1 release buffer on strategy */
 int hammer_debug_cluster_enable = 1;   /* enable read clustering by default */
 int hammer_count_inodes;
 int hammer_count_iqueued;
@@ -101,8 +100,6 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_recover, CTLFLAG_RW,
           &hammer_debug_recover, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_recover_faults, CTLFLAG_RW,
           &hammer_debug_recover_faults, 0, "");
-SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_write_release, CTLFLAG_RW,
-          &hammer_debug_write_release, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, debug_cluster_enable, CTLFLAG_RW,
           &hammer_debug_cluster_enable, 0, "");
 
@@ -267,6 +264,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                TAILQ_INIT(&hmp->delay_list);
                TAILQ_INIT(&hmp->objid_cache_list);
                TAILQ_INIT(&hmp->undo_lru_list);
+               TAILQ_INIT(&hmp->reclaim_list);
 
                /*
                 * Set default zone limits.  This value can be reduced
index b511f5a..37a6d4f 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.68 2008/06/12 01:55:58 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.69 2008/06/13 00:25:33 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -230,6 +230,7 @@ hammer_vop_read(struct vop_read_args *ap)
                        brelse(bp);
                        break;
                }
+
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
@@ -237,6 +238,9 @@ hammer_vop_read(struct vop_read_args *ap)
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);
+
+               /* data has a lower priority than meta-data */
+               bp->b_flags |= B_AGE;
                bqrelse(bp);
                if (error)
                        break;
@@ -497,11 +501,6 @@ static
 int
 hammer_vop_close(struct vop_close_args *ap)
 {
-       struct hammer_inode *ip = VTOI(ap->a_vp);
-
-       if (ap->a_vp->v_opencount == 1)
-               hammer_inode_waitreclaims(ip);
-
        return (vop_stdclose(ap));
 }