HAMMER VFS - Improve saturated write performance.
author     Matthew Dillon <dillon@apollo.backplane.com>
           Tue, 11 Jan 2011 02:07:20 +0000 (18:07 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Tue, 11 Jan 2011 06:17:39 +0000 (22:17 -0800)
* Change vfs.hammer.limit_reclaim to auto-scale to 1/10 the maximum
  number of vnodes, instead of using a fixed value of 4000.  On a
  typical i386 box this will be ~10000 and on a typical x86-64
  box this will be ~50000.
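
  As a rough sketch of how the new default is derived (the real hook is
  hammer_vfs_init() in the hammer_vfsops.c hunk below), the auto-scale
  only applies when the sysctl has been left at zero:

        if (hammer_limit_reclaim == 0) {
                /* e.g. desiredvnodes of ~100000 on i386 -> ~10000 */
                hammer_limit_reclaim = desiredvnodes / 10;
        }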

* Greatly reduce the vfs.hammer.autoflush default from 2000 inodes
  to 500 inodes.

  This results in better locality of reference within the flush groups
  and better pipelining when the reclaim limit is approached (when the
  vnode cache is saturated).
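
  For reference, a condensed sketch of where the limit applies (from
  hammer_flush_inode_core() in the hammer_inode.c hunk below): once a
  flush group reaches the autoflush limit, or 1/4 of the reclaim limit,
  it is closed out and handed to the flusher so the pipeline keeps
  moving:

        if (flg->total_count >= hammer_autoflush ||
            flg->total_count >= hammer_limit_reclaim / 4) {
                if (hmp->fill_flush_group == flg)
                        hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
                hammer_flusher_async(hmp, flg);
        }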

* Refactor the sequencing of the hammer_flush_group structure, fixing
  a number of issues which caused the structures to hold an inconsistent
  number of inodes.
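
  In condensed form (see hammer_flush_inode() in the hammer_inode.c hunk
  below), a flush group is now stamped with its sequence number when it
  is allocated instead of the flusher deriving it later:

        flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
        flg->seq = hmp->flusher.next++;         /* seq owned by this flg */
        if (hmp->next_flush_group == NULL)
                hmp->next_flush_group = flg;
        if (hmp->fill_flush_group == NULL)
                hmp->fill_flush_group = flg;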

* Refactor the flusher's sequence space management.
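
  Sequence numbers are compared as signed deltas so the sequence space
  can wrap; hammer_flusher_wait() (hammer_flusher.c hunk below) now uses
  the same idiom against the actively flushing group as it does against
  the completion count:

        while ((int)(seq - hmp->flusher.done) > 0) {
                if ((int)(seq - hmp->flusher.act) > 0) {
                        /* seq not yet being flushed, poke the flusher */
                        if (hmp->flusher.signal++ == 0)
                                wakeup(&hmp->flusher.signal);
                }
                /* (sleeps here until the flusher advances ->done) */
        }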

* Blogbench performance is greatly improved, as is mixed random read/write
  I/O in general.

  Note, however, that typical production operations do not involve
  hundreds of thousands of files and will not be impacted as much.

sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_vfsops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index fa4d62c..c76de4b 100644
@@ -312,7 +312,7 @@ RB_PROTOTYPE(hammer_fls_rb_tree, hammer_inode, rb_flsnode,
 struct hammer_flush_group {
        TAILQ_ENTRY(hammer_flush_group) flush_entry;
        struct hammer_fls_rb_tree       flush_tree;
-       int                             unused01;
+       int                             seq;            /* our seq no */
        int                             total_count;    /* record load */
        int                             running;        /* group is running */
        int                             closed;
@@ -483,8 +483,6 @@ struct hammer_reclaim {
        int     count;
 };
 
-#define HAMMER_RECLAIM_WAIT    4000    /* default vfs.hammer.limit_reclaim */
-
 /*
  * Track who is creating the greatest burden on the
  * inode cache.
@@ -840,7 +838,7 @@ struct hammer_flusher {
        int             signal;         /* flusher thread sequencer */
        int             act;            /* currently active flush group */
        int             done;           /* set to act when complete */
-       int             next;           /* next flush group */
+       int             next;           /* next unallocated flg seqno */
        int             group_lock;     /* lock sequencing of the next flush */
        int             exiting;        /* request master exit */
        thread_t        td;             /* master flusher thread */
@@ -941,6 +939,7 @@ struct hammer_mount {
        TAILQ_HEAD(, hammer_undo)  undo_lru_list;
        TAILQ_HEAD(, hammer_reserve) delay_list;
        struct hammer_flush_group_list  flush_group_list;
+       hammer_flush_group_t    fill_flush_group;
        hammer_flush_group_t    next_flush_group;
        TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
        TAILQ_HEAD(, hammer_dedup_cache) dedup_lru_list;
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index f86f783..5e1b0b3 100644
@@ -90,41 +90,71 @@ hammer_flusher_sync(hammer_mount_t hmp)
 }
 
 /*
- * Sync all inodes pending on the flusher - return immediately.
+ * Sync all flush groups through to close_flg - return immediately.
+ * If close_flg is NULL all flush groups are synced.
  *
- * All flush groups will be flushed.
+ * Returns the sequence number of the last closed flush group,
+ * which may be close_flg.  When syncing to the end, if there
+ * are no flush groups pending, we still cycle the flusher, so
+ * we return the next seq number not yet allocated.
  */
 int
 hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t close_flg)
 {
        hammer_flush_group_t flg;
-       int seq = hmp->flusher.next;
+       int seq;
+
+       /*
+        * Already closed
+        */
+       if (close_flg && close_flg->closed)
+               return(close_flg->seq);
 
-       TAILQ_FOREACH(flg, &hmp->flush_group_list, flush_entry) {
-               if (flg->running == 0)
-                       ++seq;
+       /*
+        * Close flush groups until we hit the end of the list
+        * or close_flg.
+        */
+       while ((flg = hmp->next_flush_group) != NULL) {
+               KKASSERT(flg->closed == 0 && flg->running == 0);
                flg->closed = 1;
+               hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
                if (flg == close_flg)
                        break;
        }
+
        if (hmp->flusher.td) {
                if (hmp->flusher.signal++ == 0)
                        wakeup(&hmp->flusher.signal);
+               seq = flg ? flg->seq : hmp->flusher.next;
        } else {
                seq = hmp->flusher.done;
        }
        return(seq);
 }
 
+/*
+ * Flush the current/next flushable flg.  This function is typically called
+ * in a loop along with hammer_flusher_wait(hmp, returned_seq) to iterate
+ * flush groups until specific conditions are met.
+ *
+ * If a flush is currently in progress its seq is returned.
+ *
+ * If no flush is currently in progress the next available flush group
+ * will be flushed and its seq returned.
+ *
+ * If no flush groups are present a dummy seq will be allocated and
+ * returned and the flusher will be activated (e.g. to flush the
+ * undo/redo and the volume header).
+ */
 int
 hammer_flusher_async_one(hammer_mount_t hmp)
 {
+       hammer_flush_group_t flg;
        int seq;
 
        if (hmp->flusher.td) {
-               seq = hmp->flusher.next;
-               if (hmp->flusher.signal++ == 0)
-                       wakeup(&hmp->flusher.signal);
+               flg = TAILQ_FIRST(&hmp->flush_group_list);
+               seq = hammer_flusher_async(hmp, flg);
        } else {
                seq = hmp->flusher.done;
        }
@@ -139,7 +169,7 @@ void
 hammer_flusher_wait(hammer_mount_t hmp, int seq)
 {
        while ((int)(seq - hmp->flusher.done) > 0) {
-               if (hmp->flusher.act != seq) {
+               if ((int)(seq - hmp->flusher.act) > 0) {
                        if (hmp->flusher.signal++ == 0)
                                wakeup(&hmp->flusher.signal);
                }
@@ -226,14 +256,13 @@ hammer_flusher_master_thread(void *arg)
 
        for (;;) {
                /*
-                * Do at least one flush cycle.  We may have to update the
-                * UNDO FIFO even if no inodes are queued.
+                * Flush all closed flgs.  If no flg's are closed we still
+                * do at least one flush cycle as we may have to update
+                * the UNDO FIFO even if no inodes are queued.
                 */
                for (;;) {
                        while (hmp->flusher.group_lock)
                                tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
-                       hmp->flusher.act = hmp->flusher.next;
-                       ++hmp->flusher.next;
                        hammer_flusher_clean_loose_ios(hmp);
                        hammer_flusher_flush(hmp);
                        hmp->flusher.done = hmp->flusher.act;
@@ -287,19 +316,41 @@ hammer_flusher_flush(hammer_mount_t hmp)
        /*
         * Just in-case there's a flush race on mount
         */
-       if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL)
+       if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL) {
                return;
+       }
+
+       /*
+        * Set the actively flushing sequence number.  If no flushable
+        * groups are present allocate a dummy sequence number for the
+        * operation.
+        */
+       flg = TAILQ_FIRST(&hmp->flush_group_list);
+       if (flg == NULL) {
+               hmp->flusher.act = hmp->flusher.next;
+               ++hmp->flusher.next;
+       } else if (flg->closed) {
+               KKASSERT(flg->running == 0);
+               flg->running = 1;
+               hmp->flusher.act = flg->seq;
+               if (hmp->fill_flush_group == flg)
+                       hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
+       }
 
        /*
         * We only do one flg but we may have to loop/retry.
+        *
+        * Due to various races it is possible to come across a flush
+        * group which has not yet been closed.
         */
        count = 0;
-       while ((flg = TAILQ_FIRST(&hmp->flush_group_list)) != NULL) {
+       while (flg && flg->running) {
                ++count;
                if (hammer_debug_general & 0x0001) {
                        kprintf("hammer_flush %d ttl=%d recs=%d\n",
                                hmp->flusher.act,
-                               flg->total_count, flg->refs);
+                               flg->total_count,
+                               flg->refs);
                }
                if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
                        break;
@@ -314,13 +365,7 @@ hammer_flusher_flush(hammer_mount_t hmp)
                if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
                        hammer_flusher_finalize(&hmp->flusher.trans, 0);
 
-               /*
-                * Ok, we are running this flush group now (this prevents new
-                * additions to it).
-                */
-               flg->running = 1;
-               if (hmp->next_flush_group == flg)
-                       hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
+               KKASSERT(hmp->next_flush_group != flg);
 
                /*
                 * Iterate the inodes in the flg's flush_tree and assign
@@ -400,6 +445,7 @@ hammer_flusher_flush(hammer_mount_t hmp)
                        kfree(flg, hmp->m_misc);
                        break;
                }
+               KKASSERT(TAILQ_FIRST(&hmp->flush_group_list) == flg);
        }
 
        /*
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index ee841b4..231883e 100644
@@ -1655,21 +1655,29 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
        int good;
 
        /*
-        * next_flush_group is the first flush group we can place the inode
-        * in.  It may be NULL.  If it becomes full we append a new flush
-        * group and make that the next_flush_group.
+        * fill_flush_group is the first flush group we may be able to
+        * continue filling.  It may be open or closed, but it will always
+        * be past the currently flushing (running) flg.
+        *
+        * next_flush_group is the next open flush group.
         */
        hmp = ip->hmp;
-       while ((flg = hmp->next_flush_group) != NULL) {
+       while ((flg = hmp->fill_flush_group) != NULL) {
                KKASSERT(flg->running == 0);
-               if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
+               if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
+                   flg->total_count <= hammer_autoflush) {
                        break;
-               hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
+               }
+               hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
                hammer_flusher_async(ip->hmp, flg);
        }
        if (flg == NULL) {
                flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
-               hmp->next_flush_group = flg;
+               flg->seq = hmp->flusher.next++;
+               if (hmp->next_flush_group == NULL)
+                       hmp->next_flush_group = flg;
+               if (hmp->fill_flush_group == NULL)
+                       hmp->fill_flush_group = flg;
                RB_INIT(&flg->flush_tree);
                TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
        }
@@ -1973,6 +1981,7 @@ hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
 static void
 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
 {
+       hammer_mount_t hmp = ip->hmp;
        int go_count;
 
        /*
@@ -1985,24 +1994,12 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
                hammer_ref(&ip->lock);
        ip->flush_state = HAMMER_FST_FLUSH;
        ip->flush_group = flg;
-       ++ip->hmp->flusher.group_lock;
-       ++ip->hmp->count_iqueued;
+       ++hmp->flusher.group_lock;
+       ++hmp->count_iqueued;
        ++hammer_count_iqueued;
        ++flg->total_count;
        hammer_redo_fifo_start_flush(ip);
 
-       /*
-        * If the flush group reaches the autoflush limit we want to signal
-        * the flusher.  This is particularly important for remove()s.
-        *
-        * If the default hammer_limit_reclaim is changed via sysctl
-        * make sure we don't hit a degenerate case where we don't start
-        * a flush but blocked on further inode ops.
-        */
-       if (flg->total_count == hammer_autoflush ||
-           flg->total_count >= hammer_limit_reclaim / 4)
-               flags |= HAMMER_FLUSH_SIGNAL;
-
 #if 0
        /*
         * We need to be able to vfsync/truncate from the backend.
@@ -2064,12 +2061,18 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
         */
        if (go_count == 0) {
                if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
-                       --ip->hmp->count_iqueued;
+                       --hmp->count_iqueued;
                        --hammer_count_iqueued;
 
                        --flg->total_count;
                        ip->flush_state = HAMMER_FST_SETUP;
                        ip->flush_group = NULL;
+                       if (flags & HAMMER_FLUSH_SIGNAL) {
+                               ip->flags |= HAMMER_INODE_REFLUSH |
+                                            HAMMER_INODE_RESIGNAL;
+                       } else {
+                               ip->flags |= HAMMER_INODE_REFLUSH;
+                       }
 #if 0
                        if (ip->flags & HAMMER_INODE_VHELD) {
                                ip->flags &= ~HAMMER_INODE_VHELD;
@@ -2082,12 +2085,8 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
                         * when an inode is in SETUP.
                         */
                        ip->flags |= HAMMER_INODE_REFLUSH;
-                       if (flags & HAMMER_FLUSH_SIGNAL) {
-                               ip->flags |= HAMMER_INODE_RESIGNAL;
-                               hammer_flusher_async(ip->hmp, flg);
-                       }
-                       if (--ip->hmp->flusher.group_lock == 0)
-                               wakeup(&ip->hmp->flusher.group_lock);
+                       if (--hmp->flusher.group_lock == 0)
+                               wakeup(&hmp->flusher.group_lock);
                        return;
                }
        }
@@ -2134,11 +2133,18 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
         */
        KKASSERT(flg->running == 0);
        RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
-       if (--ip->hmp->flusher.group_lock == 0)
-               wakeup(&ip->hmp->flusher.group_lock);
+       if (--hmp->flusher.group_lock == 0)
+               wakeup(&hmp->flusher.group_lock);
 
-       if (flags & HAMMER_FLUSH_SIGNAL) {
-               hammer_flusher_async(ip->hmp, flg);
+       /*
+        * Auto-flush the group if it grows too large.  Make sure the
+        * inode reclaim wait pipeline continues to work.
+        */
+       if (flg->total_count >= hammer_autoflush ||
+           flg->total_count >= hammer_limit_reclaim / 4) {
+               if (hmp->fill_flush_group == flg)
+                       hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
+               hammer_flusher_async(hmp, flg);
        }
 }
 
@@ -3156,13 +3162,12 @@ hammer_inode_wakereclaims(hammer_inode_t ip)
        --hmp->inode_reclaims;
        ip->flags &= ~HAMMER_INODE_RECLAIM;
 
-       while ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
-               if (reclaim->count > 0 && --reclaim->count == 0) {
+       if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
+               KKASSERT(reclaim->count > 0);
+               if (--reclaim->count == 0) {
                        TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
                        wakeup(reclaim);
                }
-               if (hmp->inode_reclaims > hammer_limit_reclaim / 2)
-                       break;
        }
 }
 
@@ -3173,19 +3178,25 @@ hammer_inode_wakereclaims(hammer_inode_t ip)
  *
  * When we block we don't care *which* inode has finished reclaiming,
 * as long as one does.
+ *
+ * The reclaim pipeline is primarily governed by the auto-flush, which is
+ * 1/4 hammer_limit_reclaim.  We don't want to block if the count is
+ * less than 1/2 hammer_limit_reclaim.  From 1/2 to full count is
+ * dynamically governed.
  */
 void
 hammer_inode_waitreclaims(hammer_transaction_t trans)
 {
        hammer_mount_t hmp = trans->hmp;
        struct hammer_reclaim reclaim;
+       int lower_limit;
 
        /*
-        * Track inode load
+        * Track inode load.  Delay if the number of reclaiming inodes is
+        * between 2/4 and 4/4 hammer_limit_reclaim, depending on pid load.
         */
        if (curthread->td_proc) {
                struct hammer_inostats *stats;
-               int lower_limit;
 
                stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
                ++stats->count;
@@ -3193,23 +3204,20 @@ hammer_inode_waitreclaims(hammer_transaction_t trans)
                if (stats->count > hammer_limit_reclaim / 2)
                        stats->count = hammer_limit_reclaim / 2;
                lower_limit = hammer_limit_reclaim - stats->count;
-               if (hammer_debug_general & 0x10000)
-                       kprintf("pid %5d limit %d\n", (int)curthread->td_proc->p_pid, lower_limit);
-
-               if (hmp->inode_reclaims < lower_limit)
-                       return;
+               if (hammer_debug_general & 0x10000) {
+                       kprintf("pid %5d limit %d\n",
+                               (int)curthread->td_proc->p_pid, lower_limit);
+               }
        } else {
-               /*
-                * Default mode
-                */
-               if (hmp->inode_reclaims < hammer_limit_reclaim)
-                       return;
+               lower_limit = hammer_limit_reclaim * 3 / 4;
+       }
+       if (hmp->inode_reclaims >= lower_limit) {
+               reclaim.count = 1;
+               TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
+               tsleep(&reclaim, 0, "hmrrcm", hz);
+               if (reclaim.count > 0)
+                       TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
        }
-       reclaim.count = 1;
-       TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
-       tsleep(&reclaim, 0, "hmrrcm", hz);
-       if (reclaim.count > 0)
-               TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
 }
 
 /*
diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c
index ec01916..328a9d9 100644
@@ -105,10 +105,10 @@ int hammer_limit_dirtybufspace;           /* per-mount */
 int hammer_limit_running_io;           /* per-mount */
 int hammer_limit_recs;                 /* as a whole XXX */
 int hammer_limit_inode_recs = 1024;    /* per inode */
-int hammer_limit_reclaim = HAMMER_RECLAIM_WAIT;
+int hammer_limit_reclaim;
 int hammer_live_dedup_cache_size = DEDUP_CACHE_SIZE;
 int hammer_limit_redo = 4096 * 1024;   /* per inode */
-int hammer_autoflush = 2000;           /* auto flush */
+int hammer_autoflush = 500;            /* auto flush (typ on reclaim) */
 int hammer_bio_count;
 int hammer_verify_zone;
 int hammer_verify_data = 1;
@@ -355,6 +355,15 @@ hammer_vfs_init(struct vfsconf *conf)
                hammer_limit_running_io = hammer_limit_dirtybufspace;
        if (hammer_limit_running_io > 10 * 1024 * 1024)
                hammer_limit_running_io = 10 * 1024 * 1024;
+
+       /*
+        * The hammer_inode structure detaches from the vnode on reclaim.
+        * This limits the number of inodes in this state to prevent a
+        * memory pool blowout.
+        */
+       if (hammer_limit_reclaim == 0)
+               hammer_limit_reclaim = desiredvnodes / 10;
+
        return(0);
 }