HAMMER 61E/Many: Stabilization, Performance
author     Matthew Dillon <dillon@dragonflybsd.org>
           Sat, 12 Jul 2008 23:04:50 +0000 (23:04 +0000)
committer  Matthew Dillon <dillon@dragonflybsd.org>
           Sat, 12 Jul 2008 23:04:50 +0000 (23:04 +0000)
* The UNDO FIFO could get over-full, causing an assertion.  This was because
  certain load tests could cause directories to end up with tens of
  thousands of directory entries, all of which were being flushed in
  a single flush group.

  Rewrite the flush group infrastructure to fix the issue.  Instead of
  having a two-stage flush we now have flush compartmentalization with
  the new hammer_flush_group structure.  Flushes can thus be broken down
  into transactions that don't blow out the UNDO FIFO (see the sketch
  below).

* Improve flush performance considerably.  The new infrastructure
  generates fewer B-Tree conflicts and appears to flush the work more
  smoothly, probably because the size of each flush group is now
  bounded.
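
  For reference, a condensed sketch of the new flush-group selection
  logic, distilled from the hammer_flush_inode() changes in the diff
  below.  The helper name get_current_flush_group() is illustrative only
  and does not appear in the source; the fields and the limit check come
  from the new hammer_flush_group structure and the new undo_rec_limit
  mount field.

	static hammer_flush_group_t
	get_current_flush_group(hammer_mount_t hmp)
	{
		hammer_flush_group_t flg;

		/*
		 * Reuse the most recent flush group if it is still open
		 * and under the UNDO-derived record limit.
		 */
		flg = TAILQ_LAST(&hmp->flush_group_list,
				 hammer_flush_group_list);
		if (flg) {
			if (flg->running) {
				/* already being flushed, cannot append */
				flg = NULL;
			} else if (flg->total_count + flg->refs >
				   hmp->undo_rec_limit) {
				/* full - close it and kick the flusher */
				hammer_flusher_async(hmp, flg);
				flg = NULL;
			}
		}
		if (flg == NULL) {
			/* start a fresh, bounded flush group */
			flg = kmalloc(sizeof(*flg), M_HAMMER,
				      M_WAITOK | M_ZERO);
			TAILQ_INIT(&flg->flush_list);
			TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg,
					  flush_entry);
		}
		return (flg);
	}

  Each inode queued for flushing lands on the flush_list of the group
  returned here, so a single 'sync' is split across multiple bounded
  groups instead of one huge transaction.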

12 files changed:
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_flusher.c
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_ioctl.c
sys/vfs/hammer/hammer_ioctl.h
sys/vfs/hammer/hammer_mirror.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_prune.c
sys/vfs/hammer/hammer_reblock.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h
index d29ab91..6f3fff1 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.113 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.114 2008/07/12 23:04:50 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -210,7 +210,7 @@ typedef struct hammer_objid_cache {
  * Associate an inode with a B-Tree node to cache search start positions
  */
 typedef struct hammer_node_cache {
-       TAILQ_ENTRY(hammer_node_cache) entry;
+       TAILQ_ENTRY(hammer_node_cache)  entry;
        struct hammer_node              *node;
        struct hammer_inode             *ip;
 } *hammer_node_cache_t;
@@ -218,6 +218,26 @@ typedef struct hammer_node_cache {
 TAILQ_HEAD(hammer_node_cache_list, hammer_node_cache);
 
 /*
+ * Structure used to organize flush groups.  Flush groups must be
+ * organized into chunks in order to avoid blowing out the UNDO FIFO.
+ * Without this a 'sync' could end up flushing 50,000 inodes in a single
+ * transaction.
+ */
+struct hammer_flush_group {
+       TAILQ_ENTRY(hammer_flush_group) flush_entry;
+       TAILQ_HEAD(, hammer_inode)      flush_list;
+       int                             inode_count;    /* inode load */
+       int                             total_count;    /* record load */
+       int                             running;        /* group is running */
+       int                             closed;
+       int                             refs;
+};
+
+typedef struct hammer_flush_group *hammer_flush_group_t;
+
+TAILQ_HEAD(hammer_flush_group_list, hammer_flush_group);
+
+/*
  * Structure used to represent an inode in-memory.
  *
  * The record and data associated with an inode may be out of sync with
@@ -250,7 +270,7 @@ TAILQ_HEAD(hammer_node_list, hammer_node);
 struct hammer_inode {
        RB_ENTRY(hammer_inode)  rb_node;
        hammer_inode_state_t    flush_state;
-       int                     flush_group;
+       hammer_flush_group_t    flush_group;
        TAILQ_ENTRY(hammer_inode) flush_entry;
        struct hammer_record_list target_list;  /* target of dependant recs */
        int64_t                 obj_id;         /* (key) object identifier */
@@ -368,7 +388,7 @@ struct hammer_record {
        RB_ENTRY(hammer_record)         rb_node;
        TAILQ_ENTRY(hammer_record)      target_entry;
        hammer_inode_state_t            flush_state;
-       int                             flush_group;
+       hammer_flush_group_t            flush_group;
        hammer_record_type_t            type;
        struct hammer_lock              lock;
        struct hammer_reserve           *resv;
@@ -614,8 +634,8 @@ typedef struct hammer_reserve *hammer_reserve_t;
  *
  * This is strictly a heuristic.
  */
-#define HAMMER_MAX_UNDOS       1024
-#define HAMMER_MAX_FLUSHERS    4
+#define HAMMER_MAX_UNDOS               1024
+#define HAMMER_MAX_FLUSHERS            4
 
 struct hammer_undo {
        RB_ENTRY(hammer_undo)   rb_node;
@@ -627,6 +647,7 @@ struct hammer_undo {
 typedef struct hammer_undo *hammer_undo_t;
 
 struct hammer_flusher_info;
+TAILQ_HEAD(hammer_flusher_info_list, hammer_flusher_info);
 
 struct hammer_flusher {
        int             signal;         /* flusher thread sequencer */
@@ -635,14 +656,13 @@ struct hammer_flusher {
        int             next;           /* next flush group */
        int             group_lock;     /* lock sequencing of the next flush */
        int             exiting;        /* request master exit */
-       int             count;          /* number of slave flushers */
-       int             running;        /* number of slave flushers running */
        thread_t        td;             /* master flusher thread */
        hammer_tid_t    tid;            /* last flushed transaction id */
        int             finalize_want;          /* serialize finalization */
        struct hammer_lock finalize_lock;       /* serialize finalization */
        struct hammer_transaction trans;        /* shared transaction */
-       struct hammer_flusher_info *info[HAMMER_MAX_FLUSHERS];
+       struct hammer_flusher_info_list run_list;
+       struct hammer_flusher_info_list ready_list;
 };
 
 /*
@@ -671,6 +691,7 @@ struct hammer_mount {
        int64_t rsv_databytes;  /* reserved space due to record data */
        int     rsv_recs;       /* reserved space due to dirty records */
        int     rsv_fromdelay;  /* bigblocks reserved due to flush delay */
+       int     undo_rec_limit; /* based on size of undo area */
        int     last_newrecords;
        int     count_newrecords;
 
@@ -705,8 +726,8 @@ struct hammer_mount {
        struct hammer_undo      undos[HAMMER_MAX_UNDOS];
        int                     undo_alloc;
        TAILQ_HEAD(, hammer_undo)  undo_lru_list;
-       TAILQ_HEAD(, hammer_inode) flush_list;
        TAILQ_HEAD(, hammer_reserve) delay_list;
+       struct hammer_flush_group_list  flush_group_list;
        TAILQ_HEAD(, hammer_objid_cache) objid_cache_list;
        TAILQ_HEAD(, hammer_reclaim) reclaim_list;
 };
@@ -748,6 +769,7 @@ extern int hammer_debug_tid;
 extern int hammer_debug_recover;
 extern int hammer_debug_recover_faults;
 extern int hammer_debug_cluster_enable;
+extern int hammer_count_fsyncs;
 extern int hammer_count_inodes;
 extern int hammer_count_iqueued;
 extern int hammer_count_reclaiming;
@@ -1093,7 +1115,7 @@ int hammer_signal_check(hammer_mount_t hmp);
 void hammer_flusher_create(hammer_mount_t hmp);
 void hammer_flusher_destroy(hammer_mount_t hmp);
 void hammer_flusher_sync(hammer_mount_t hmp);
-int  hammer_flusher_async(hammer_mount_t hmp);
+int  hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t flg);
 void hammer_flusher_wait(hammer_mount_t hmp, int seq);
 int  hammer_flusher_meta_limit(hammer_mount_t hmp);
 int  hammer_flusher_meta_halflimit(hammer_mount_t hmp);
diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c
index 53513a9..c265adc 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.36 2008/07/11 05:44:23 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.37 2008/07/12 23:04:50 dillon Exp $
  */
 /*
  * HAMMER dependancy flusher thread
@@ -52,42 +52,52 @@ static void hammer_flusher_flush_inode(hammer_inode_t ip,
  * Support structures for the flusher threads.
  */
 struct hammer_flusher_info {
+       TAILQ_ENTRY(hammer_flusher_info) entry;
        struct hammer_mount *hmp;
        thread_t        td;
-       int             startit;
+       int             runstate;
+       int             count;
+       hammer_flush_group_t flg;
        hammer_inode_t  work_array[HAMMER_FLUSH_GROUP_SIZE];
 };
 
 typedef struct hammer_flusher_info *hammer_flusher_info_t;
 
 /*
- * Sync all inodes pending on the flusher.  This routine may have to be
- * called twice to get them all as some may be queued to a later flush group.
+ * Sync all inodes pending on the flusher.
+ *
+ * All flush groups will be flushed.  This does not queue dirty inodes
+ * to the flush groups, it just flushes out what has already been queued!
  */
 void
 hammer_flusher_sync(hammer_mount_t hmp)
 {
        int seq;
 
-       if (hmp->flusher.td) {
-               seq = hmp->flusher.next;
-               if (hmp->flusher.signal++ == 0)
-                       wakeup(&hmp->flusher.signal);
-               while ((int)(seq - hmp->flusher.done) > 0)
-                       tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
-       }
+       seq = hammer_flusher_async(hmp, NULL);
+       while ((int)(seq - hmp->flusher.done) > 0)
+               tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
 }
 
 /*
  * Sync all inodes pending on the flusher - return immediately.
+ *
+ * All flush groups will be flushed.
  */
 int
-hammer_flusher_async(hammer_mount_t hmp)
+hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t close_flg)
 {
-       int seq;
-
+       hammer_flush_group_t flg;
+       int seq = hmp->flusher.next;
+
+       TAILQ_FOREACH(flg, &hmp->flush_group_list, flush_entry) {
+               if (flg->running == 0)
+                       ++seq;
+               flg->closed = 1;
+               if (flg == close_flg)
+                       break;
+       }
        if (hmp->flusher.td) {
-               seq = hmp->flusher.next;
                if (hmp->flusher.signal++ == 0)
                        wakeup(&hmp->flusher.signal);
        } else {
@@ -113,16 +123,16 @@ hammer_flusher_create(hammer_mount_t hmp)
        hmp->flusher.act = 0;
        hmp->flusher.done = 0;
        hmp->flusher.next = 1;
-       hmp->flusher.count = 0;
        hammer_ref(&hmp->flusher.finalize_lock);
+       TAILQ_INIT(&hmp->flusher.run_list);
+       TAILQ_INIT(&hmp->flusher.ready_list);
 
        lwkt_create(hammer_flusher_master_thread, hmp,
                    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
        for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
                info = kmalloc(sizeof(*info), M_HAMMER, M_WAITOK|M_ZERO);
                info->hmp = hmp;
-               ++hmp->flusher.count;
-               hmp->flusher.info[i] = info;
+               TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
                lwkt_create(hammer_flusher_slave_thread, info,
                            &info->td, NULL, 0, -1, "hammer-S%d", i);
        }
@@ -132,7 +142,6 @@ void
 hammer_flusher_destroy(hammer_mount_t hmp)
 {
        hammer_flusher_info_t info;
-       int i;
 
        /*
         * Kill the master
@@ -147,20 +156,16 @@ hammer_flusher_destroy(hammer_mount_t hmp)
        /*
         * Kill the slaves
         */
-       for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
-               if ((info = hmp->flusher.info[i]) != NULL) {
-                       KKASSERT(info->startit == 0);
-                       info->startit = -1;
-                       wakeup(&info->startit);
-                       while (info->td) {
-                               tsleep(&info->td, 0, "hmrwwc", 0);
-                       }
-                       hmp->flusher.info[i] = NULL;
-                       kfree(info, M_HAMMER);
-                       --hmp->flusher.count;
-               }
+       while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) != NULL) {
+               KKASSERT(info->runstate == 0);
+               TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
+               info->runstate = -1;
+               wakeup(&info->runstate);
+               while (info->td)
+                       tsleep(&info->td, 0, "hmrwwc", 0);
+               TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
+               kfree(info, M_HAMMER);
        }
-       KKASSERT(hmp->flusher.count == 0);
 }
 
 /*
@@ -170,28 +175,35 @@ hammer_flusher_destroy(hammer_mount_t hmp)
 static void
 hammer_flusher_master_thread(void *arg)
 {
-       hammer_mount_t hmp = arg;
+       hammer_flush_group_t flg;
+       hammer_mount_t hmp;
 
-       for (;;) {
-               while (hmp->flusher.group_lock)
-                       tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
-               hmp->flusher.act = hmp->flusher.next;
-               ++hmp->flusher.next;
-               hammer_flusher_clean_loose_ios(hmp);
-               hammer_flusher_flush(hmp);
-               hmp->flusher.done = hmp->flusher.act;
-               wakeup(&hmp->flusher.done);
+       hmp = arg;
 
+       for (;;) {
                /*
-                * Wait for activity.
+                * Do at least one flush cycle.  We may have to update the
+                * UNDO FIFO even if no inodes are queued.
                 */
-               if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_list))
-                       break;
+               for (;;) {
+                       while (hmp->flusher.group_lock)
+                               tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
+                       hmp->flusher.act = hmp->flusher.next;
+                       ++hmp->flusher.next;
+                       hammer_flusher_clean_loose_ios(hmp);
+                       hammer_flusher_flush(hmp);
+                       hmp->flusher.done = hmp->flusher.act;
+                       wakeup(&hmp->flusher.done);
+                       flg = TAILQ_FIRST(&hmp->flush_group_list);
+                       if (flg == NULL || flg->closed == 0)
+                               break;
+               }
 
                /*
-                * This is a hack until we can dispose of frontend buffer
-                * cache buffers on the frontend.
+                * Wait for activity.
                 */
+               if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_group_list))
+                       break;
                while (hmp->flusher.signal == 0)
                        tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
                hmp->flusher.signal = 0;
@@ -206,56 +218,166 @@ hammer_flusher_master_thread(void *arg)
 }
 
 /*
+ * Flush all inodes in the current flush group.
+ */
+static void
+hammer_flusher_flush(hammer_mount_t hmp)
+{
+       hammer_flusher_info_t info;
+       hammer_flush_group_t flg;
+       hammer_reserve_t resv;
+       hammer_inode_t ip;
+       hammer_inode_t next_ip;
+       int slave_index;
+
+       /*
+        * Just in-case there's a flush race on mount
+        */
+       if (TAILQ_FIRST(&hmp->flusher.ready_list) == NULL)
+               return;
+
+       /*
+        * We only do one flg but we may have to loop/retry.
+        */
+       while ((flg = TAILQ_FIRST(&hmp->flush_group_list)) != NULL) {
+               if (hammer_debug_general & 0x0001) {
+                       kprintf("hammer_flush %d ttl=%d recs=%d\n",
+                               hmp->flusher.act,
+                               flg->total_count, flg->refs);
+               }
+               hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
+
+               /*
+                * If the previous flush cycle just about exhausted our
+                * UNDO space we may have to do a dummy cycle to move the
+                * first_offset up before actually digging into a new cycle,
+                * or the new cycle will not have sufficient undo space.
+                */
+               if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
+                       hammer_flusher_finalize(&hmp->flusher.trans, 0);
+
+               /*
+                * Iterate the inodes in the flg's flush_list and assign
+                * them to slaves.
+                */
+               flg->running = 1;
+               slave_index = 0;
+               info = TAILQ_FIRST(&hmp->flusher.ready_list);
+               next_ip = TAILQ_FIRST(&flg->flush_list);
+
+               while ((ip = next_ip) != NULL) {
+                       next_ip = TAILQ_NEXT(ip, flush_entry);
+
+                       /*
+                        * Add ip to the slave's work array.  The slave is
+                        * not currently running.
+                        */
+                       info->work_array[info->count++] = ip;
+                       if (info->count != HAMMER_FLUSH_GROUP_SIZE)
+                               continue;
+
+                       /*
+                        * Get the slave running
+                        */
+                       TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
+                       TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
+                       info->flg = flg;
+                       info->runstate = 1;
+                       wakeup(&info->runstate);
+
+                       /*
+                        * Get a new slave.  We may have to wait for one to
+                        * finish running.
+                        */
+                       while ((info = TAILQ_FIRST(&hmp->flusher.ready_list)) == NULL) {
+                               tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);
+                       }
+               }
+
+               /*
+                * Run the current slave if necessary
+                */
+               if (info->count) {
+                       TAILQ_REMOVE(&hmp->flusher.ready_list, info, entry);
+                       TAILQ_INSERT_TAIL(&hmp->flusher.run_list, info, entry);
+                       info->flg = flg;
+                       info->runstate = 1;
+                       wakeup(&info->runstate);
+               }
+
+               /*
+                * Wait for all slaves to finish running
+                */
+               while (TAILQ_FIRST(&hmp->flusher.run_list) != NULL)
+                       tsleep(&hmp->flusher.ready_list, 0, "hmrfcc", 0);
+
+               /*
+                * Do the final finalization, clean up
+                */
+               hammer_flusher_finalize(&hmp->flusher.trans, 1);
+               hmp->flusher.tid = hmp->flusher.trans.tid;
+
+               hammer_done_transaction(&hmp->flusher.trans);
+
+               /*
+                * Loop up on the same flg.  If the flg is done clean it up
+                * and break out.  We only flush one flg.
+                */
+               if (TAILQ_FIRST(&flg->flush_list) == NULL) {
+                       KKASSERT(TAILQ_EMPTY(&flg->flush_list));
+                       KKASSERT(flg->refs == 0);
+                       TAILQ_REMOVE(&hmp->flush_group_list, flg, flush_entry);
+                       kfree(flg, M_HAMMER);
+                       break;
+               }
+       }
+
+       /*
+        * Clean up any freed big-blocks (typically zone-2). 
+        * resv->flush_group is typically set several flush groups ahead
+        * of the free to ensure that the freed block is not reused until
+        * it can no longer be reused.
+        */
+       while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
+               if (resv->flush_group != hmp->flusher.act)
+                       break;
+               hammer_reserve_clrdelay(hmp, resv);
+       }
+}
+
+
+/*
  * The slave flusher thread pulls work off the master flush_list until no
  * work is left.
  */
 static void
 hammer_flusher_slave_thread(void *arg)
 {
+       hammer_flush_group_t flg;
        hammer_flusher_info_t info;
        hammer_mount_t hmp;
        hammer_inode_t ip;
-       int c;
        int i;
-       int n;
 
        info = arg;
        hmp = info->hmp;
 
        for (;;) {
-               while (info->startit == 0)
-                       tsleep(&info->startit, 0, "hmrssw", 0);
-               if (info->startit < 0)
+               while (info->runstate == 0)
+                       tsleep(&info->runstate, 0, "hmrssw", 0);
+               if (info->runstate < 0)
                        break;
-               info->startit = 0;
+               flg = info->flg;
 
-               /*
-                * Try to pull out around ~64 inodes at a time to flush.
-                * The idea is to try to avoid deadlocks between the slaves.
-                */
-               n = c = 0;
-               while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) {
-                       if (ip->flush_group != hmp->flusher.act)
-                               break;
-                       TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry);
-                       info->work_array[n++] = ip;
-                       c += ip->rsv_recs;
-                       if (n < HAMMER_FLUSH_GROUP_SIZE &&
-                           c < HAMMER_FLUSH_GROUP_SIZE * 8) {
-                               continue;
-                       }
-                       for (i = 0; i < n; ++i){
-                               hammer_flusher_flush_inode(info->work_array[i],
-                                                       &hmp->flusher.trans);
-                       }
-                       n = c = 0;
+               for (i = 0; i < info->count; ++i) {
+                       ip = info->work_array[i];
+                       hammer_flusher_flush_inode(ip, &hmp->flusher.trans);
                }
-               for (i = 0; i < n; ++i) {
-                       hammer_flusher_flush_inode(info->work_array[i],
-                                                  &hmp->flusher.trans);
-               }
-               if (--hmp->flusher.running == 0)
-                       wakeup(&hmp->flusher.running);
+               info->count = 0;
+               info->runstate = 0;
+               TAILQ_REMOVE(&hmp->flusher.run_list, info, entry);
+               TAILQ_INSERT_TAIL(&hmp->flusher.ready_list, info, entry);
+               wakeup(&hmp->flusher.ready_list);
        }
        info->td = NULL;
        wakeup(&info->td);
@@ -290,67 +412,6 @@ hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
 }
 
 /*
- * Flush all inodes in the current flush group.
- */
-static void
-hammer_flusher_flush(hammer_mount_t hmp)
-{
-       hammer_flusher_info_t info;
-       hammer_reserve_t resv;
-       int i;
-       int n;
-
-       hammer_start_transaction_fls(&hmp->flusher.trans, hmp);
-
-       /*
-        * If the previous flush cycle just about exhausted our UNDO space
-        * we may have to do a dummy cycle to move the first_offset up
-        * before actually digging into a new cycle, or the new cycle will
-        * not have sufficient undo space.
-        */
-       if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3))
-               hammer_flusher_finalize(&hmp->flusher.trans, 0);
-
-       /*
-        * Start work threads.
-        */
-       i = 0;
-       n = hmp->count_iqueued / HAMMER_FLUSH_GROUP_SIZE;
-       if (TAILQ_FIRST(&hmp->flush_list)) {
-               for (i = 0; i <= n; ++i) {
-                       if (i == HAMMER_MAX_FLUSHERS ||
-                           hmp->flusher.info[i] == NULL) {
-                               break;
-                       }
-                       info = hmp->flusher.info[i];
-                       if (info->startit == 0) {
-                               ++hmp->flusher.running;
-                               info->startit = 1;
-                               wakeup(&info->startit);
-                       }
-               }
-       }
-       while (hmp->flusher.running)
-               tsleep(&hmp->flusher.running, 0, "hmrfcc", 0);
-
-       hammer_flusher_finalize(&hmp->flusher.trans, 1);
-       hmp->flusher.tid = hmp->flusher.trans.tid;
-
-       /*
-        * Clean up any freed big-blocks (typically zone-2). 
-        * resv->flush_group is typically set several flush groups ahead
-        * of the free to ensure that the freed block is not reused until
-        * it can no longer be reused.
-        */
-       while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
-               if (resv->flush_group != hmp->flusher.act)
-                       break;
-               hammer_reserve_clrdelay(hmp, resv);
-       }
-       hammer_done_transaction(&hmp->flusher.trans);
-}
-
-/*
  * Flush a single inode that is part of a flush group.
  *
  * NOTE!  The sync code can return EWOULDBLOCK if the flush operation
diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index 41ed72b..cb5452a 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.100 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.101 2008/07/12 23:04:50 dillon Exp $
  */
 
 #include "hammer.h"
 
 static int     hammer_unload_inode(struct hammer_inode *ip);
 static void    hammer_free_inode(hammer_inode_t ip);
-static void    hammer_flush_inode_core(hammer_inode_t ip, int flags);
+static void    hammer_flush_inode_core(hammer_inode_t ip,
+                                       hammer_flush_group_t flg, int flags);
 static int     hammer_setup_child_callback(hammer_record_t rec, void *data);
+#if 0
 static int     hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
-static int     hammer_setup_parent_inodes(hammer_inode_t ip);
-static int     hammer_setup_parent_inodes_helper(hammer_record_t record);
+#endif
+static int     hammer_setup_parent_inodes(hammer_inode_t ip,
+                                       hammer_flush_group_t flg);
+static int     hammer_setup_parent_inodes_helper(hammer_record_t record,
+                                       hammer_flush_group_t flg);
 static void    hammer_inode_wakereclaims(hammer_inode_t ip);
 
 #ifdef DEBUG_TRUNCATE
@@ -215,9 +220,14 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap)
                        ++hammer_count_reclaiming;
                        ++hmp->inode_reclaims;
                        ip->flags |= HAMMER_INODE_RECLAIM;
+
+                       /*
+                        * Poke the flusher.  If we don't do this programs
+                        * will start to stall on the reclaiming count.
+                        */
                        if (hmp->inode_reclaims > HAMMER_RECLAIM_FLUSH &&
-                           (hmp->inode_reclaims & 255) == 0) {
-                               hammer_flusher_async(hmp);
+                          (hmp->inode_reclaims & 255) == 0) {
+                              hammer_flusher_async(hmp, NULL);
                        }
                }
                hammer_rel_inode(ip, 1);
@@ -978,7 +988,9 @@ retry:
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
                /*
-                * Generate a record and write it to the media
+                * Generate a record and write it to the media.  We clean-up
+                * the state before releasing so we do not have to set-up
+                * a flush_group.
                 */
                record = hammer_alloc_mem_record(ip, 0);
                record->type = HAMMER_MEM_RECORD_INODE;
@@ -1294,19 +1306,48 @@ hammer_modify_inode(hammer_inode_t ip, int flags)
 void
 hammer_flush_inode(hammer_inode_t ip, int flags)
 {
+       hammer_mount_t hmp;
+       hammer_flush_group_t flg;
        int good;
 
        /*
-        * Trivial 'nothing to flush' case.  If the inode is ina SETUP
+        * Setup a flush group.  It remains cached so it is ok if we
+        * wind up not flushing the inode.
+        */
+       hmp = ip->hmp;
+       flg = TAILQ_LAST(&ip->hmp->flush_group_list, hammer_flush_group_list);
+
+       if (flg) {
+               if (flg->running) {
+                       flg = NULL;
+               } else if (flg->total_count + flg->refs >
+                          ip->hmp->undo_rec_limit) {
+                       hammer_flusher_async(ip->hmp, flg);
+                       flg = NULL;
+               }
+       }
+       if (flg == NULL) {
+               flg = kmalloc(sizeof(*flg), M_HAMMER, M_WAITOK|M_ZERO);
+               TAILQ_INIT(&flg->flush_list);
+               TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
+       }
+
+       /*
+        * Trivial 'nothing to flush' case.  If the inode is in a SETUP
         * state we have to put it back into an IDLE state so we can
         * drop the extra ref.
+        *
+        * If we have a parent dependancy we must still fall through
+        * so we can run it.
         */
        if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
-               if (ip->flush_state == HAMMER_FST_SETUP) {
+               if (ip->flush_state == HAMMER_FST_SETUP &&
+                   TAILQ_EMPTY(&ip->target_list)) {
                        ip->flush_state = HAMMER_FST_IDLE;
                        hammer_rel_inode(ip, 0);
                }
-               return;
+               if (ip->flush_state == HAMMER_FST_IDLE)
+                       return;
        }
 
        /*
@@ -1319,7 +1360,7 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
                 * our children may not be flushable so we have to re-test
                 * with that additional knowledge.
                 */
-               hammer_flush_inode_core(ip, flags);
+               hammer_flush_inode_core(ip, flg, flags);
                break;
        case HAMMER_FST_SETUP:
                /*
@@ -1330,19 +1371,19 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
                 * can't flush, 0 means there weren't any dependancies, and
                 * 1 means we have good connectivity.
                 */
-               good = hammer_setup_parent_inodes(ip);
+               good = hammer_setup_parent_inodes(ip, flg);
 
                /*
                 * We can continue if good >= 0.  Determine how many records
                 * under our inode can be flushed (and mark them).
                 */
                if (good >= 0) {
-                       hammer_flush_inode_core(ip, flags);
+                       hammer_flush_inode_core(ip, flg, flags);
                } else {
                        ip->flags |= HAMMER_INODE_REFLUSH;
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
-                               hammer_flusher_async(ip->hmp);
+                               hammer_flusher_async(ip->hmp, flg);
                        }
                }
                break;
@@ -1355,7 +1396,7 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
                        ip->flags |= HAMMER_INODE_REFLUSH;
                if (flags & HAMMER_FLUSH_SIGNAL) {
                        ip->flags |= HAMMER_INODE_RESIGNAL;
-                       hammer_flusher_async(ip->hmp);
+                       hammer_flusher_async(ip->hmp, flg);
                }
                break;
        }
@@ -1370,19 +1411,15 @@ hammer_flush_inode(hammer_inode_t ip, int flags)
  *     ref/rel code later, the rel CAN block.
  */
 static int
-hammer_setup_parent_inodes(hammer_inode_t ip)
+hammer_setup_parent_inodes(hammer_inode_t ip, hammer_flush_group_t flg)
 {
        hammer_record_t depend;
-#if 0
-       hammer_record_t next;
-       hammer_inode_t  pip;
-#endif
        int good;
        int r;
 
        good = 0;
        TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
-               r = hammer_setup_parent_inodes_helper(depend);
+               r = hammer_setup_parent_inodes_helper(depend, flg);
                KKASSERT(depend->target_ip == ip);
                if (r < 0 && good == 0)
                        good = -1;
@@ -1390,39 +1427,6 @@ hammer_setup_parent_inodes(hammer_inode_t ip)
                        good = 1;
        }
        return(good);
-
-#if 0
-retry:
-       good = 0;
-       next = TAILQ_FIRST(&ip->target_list);
-       if (next) {
-               hammer_ref(&next->lock);
-               hammer_ref(&next->ip->lock);
-       }
-       while ((depend = next) != NULL) {
-               if (depend->target_ip == NULL) {
-                       pip = depend->ip;
-                       hammer_rel_mem_record(depend);
-                       hammer_rel_inode(pip, 0);
-                       goto retry;
-               }
-               KKASSERT(depend->target_ip == ip);
-               next = TAILQ_NEXT(depend, target_entry);
-               if (next) {
-                       hammer_ref(&next->lock);
-                       hammer_ref(&next->ip->lock);
-               }
-               r = hammer_setup_parent_inodes_helper(depend);
-               if (r < 0 && good == 0)
-                       good = -1;
-               if (r > 0)
-                       good = 1;
-               pip = depend->ip;
-               hammer_rel_mem_record(depend);
-               hammer_rel_inode(pip, 0);
-       }
-       return(good);
-#endif
 }
 
 /*
@@ -1442,7 +1446,8 @@ retry:
  * Return -1 if we can't resolve the dependancy and there is no connectivity.
  */
 static int
-hammer_setup_parent_inodes_helper(hammer_record_t record)
+hammer_setup_parent_inodes_helper(hammer_record_t record,
+                                 hammer_flush_group_t flg)
 {
        hammer_mount_t hmp;
        hammer_inode_t pip;
@@ -1461,7 +1466,7 @@ hammer_setup_parent_inodes_helper(hammer_record_t record)
         * allow the operation yet anyway (the second return -1).
         */
        if (record->flush_state == HAMMER_FST_FLUSH) {
-               if (record->flush_group != hmp->flusher.next) {
+               if (record->flush_group != flg) {
                        pip->flags |= HAMMER_INODE_REFLUSH;
                        return(-1);
                }
@@ -1477,7 +1482,7 @@ hammer_setup_parent_inodes_helper(hammer_record_t record)
         */
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);
 
-       good = hammer_setup_parent_inodes(pip);
+       good = hammer_setup_parent_inodes(pip, flg);
 
        /*
         * We can't flush ip because it has no connectivity (XXX also check
@@ -1496,7 +1501,7 @@ hammer_setup_parent_inodes_helper(hammer_record_t record)
         * group as the parent.
         */
        if (pip->flush_state != HAMMER_FST_FLUSH)
-               hammer_flush_inode_core(pip, HAMMER_FLUSH_RECURSION);
+               hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
        KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);
 
@@ -1516,7 +1521,7 @@ hammer_setup_parent_inodes_helper(hammer_record_t record)
                return(-1);
        } else
 #endif
-       if (pip->flush_group == pip->hmp->flusher.next) {
+       if (pip->flush_group == flg) {
                /*
                 * This is the record we wanted to synchronize.  If the
                 * record went into a flush state while we blocked it 
@@ -1525,6 +1530,7 @@ hammer_setup_parent_inodes_helper(hammer_record_t record)
                if (record->flush_state != HAMMER_FST_FLUSH) {
                        record->flush_state = HAMMER_FST_FLUSH;
                        record->flush_group = pip->flush_group;
+                       ++record->flush_group->refs;
                        hammer_ref(&record->lock);
                } else {
                        KKASSERT(record->flush_group == pip->flush_group);
@@ -1551,7 +1557,7 @@ hammer_setup_parent_inodes_helper(hammer_record_t record)
  * This is the core routine placing an inode into the FST_FLUSH state.
  */
 static void
-hammer_flush_inode_core(hammer_inode_t ip, int flags)
+hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
 {
        int go_count;
 
@@ -1564,10 +1570,11 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags)
        if (ip->flush_state == HAMMER_FST_IDLE)
                hammer_ref(&ip->lock);
        ip->flush_state = HAMMER_FST_FLUSH;
-       ip->flush_group = ip->hmp->flusher.next;
+       ip->flush_group = flg;
        ++ip->hmp->flusher.group_lock;
        ++ip->hmp->count_iqueued;
        ++hammer_count_iqueued;
+       ++flg->total_count;
 
        /*
         * We need to be able to vfsync/truncate from the backend.
@@ -1581,17 +1588,35 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags)
        /*
         * Figure out how many in-memory records we can actually flush
         * (not including inode meta-data, buffers, etc).
-        *
-        * Do not add new records to the flush if this is a recursion or
-        * if we must still complete a flush from the previous flush cycle.
         */
        if (flags & HAMMER_FLUSH_RECURSION) {
+               /*
+                * If this is a upwards recursion we do not want to
+                * recurse down again!
+                */
                go_count = 1;
        } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
+               /*
+                * No new records are added if we must complete a flush
+                * from a previous cycle, but we do have to move the records
+                * from the previous cycle to the current one.
+                */
+#if 0
                go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
                                   hammer_syncgrp_child_callback, NULL);
+#endif
                go_count = 1;
        } else {
+               /*
+                * Normal flush, scan records and bring them into the flush.
+                * Directory adds and deletes are usually skipped (they are
+                * grouped with the related inode rather then with the
+                * directory).
+                *
+                * go_count can be negative, which means the scan aborted
+                * due to the flush group being over-full and we should
+                * flush what we have.
+                */
                go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
                                   hammer_setup_child_callback, NULL);
        }
@@ -1610,13 +1635,14 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags)
                        --hammer_count_iqueued;
 
                        ip->flush_state = HAMMER_FST_SETUP;
+                       ip->flush_group = NULL;
                        if (ip->flags & HAMMER_INODE_VHELD) {
                                ip->flags &= ~HAMMER_INODE_VHELD;
                                vrele(ip->vp);
                        }
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
-                               hammer_flusher_async(ip->hmp);
+                               hammer_flusher_async(ip->hmp, flg);
                        }
                        if (--ip->hmp->flusher.group_lock == 0)
                                wakeup(&ip->hmp->flusher.group_lock);
@@ -1673,12 +1699,13 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags)
        /*
         * The flusher list inherits our inode and reference.
         */
-       TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
+       KKASSERT(flg->running == 0);
+       TAILQ_INSERT_TAIL(&flg->flush_list, ip, flush_entry);
        if (--ip->hmp->flusher.group_lock == 0)
                wakeup(&ip->hmp->flusher.group_lock);
 
        if (flags & HAMMER_FLUSH_SIGNAL) {
-               hammer_flusher_async(ip->hmp);
+               hammer_flusher_async(ip->hmp, flg);
        }
 }
 
@@ -1696,6 +1723,7 @@ hammer_flush_inode_core(hammer_inode_t ip, int flags)
 static int
 hammer_setup_child_callback(hammer_record_t rec, void *data)
 {
+       hammer_flush_group_t flg;
        hammer_inode_t target_ip;
        hammer_inode_t ip;
        int r;
@@ -1709,15 +1737,10 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
         * Don't get confused between record deletion and, say, directory
         * entry deletion.  The deletion of a directory entry that is on
         * the media has nothing to do with the record deletion flags.
-        *
-        * The flush_group for a record already in a flush state must
-        * be updated.  This case can only occur if the inode deleting
-        * too many records had to be moved to the next flush group.
         */
        if (rec->flags & (HAMMER_RECF_DELETED_FE|HAMMER_RECF_DELETED_BE)) {
                if (rec->flush_state == HAMMER_FST_FLUSH) {
-                       KKASSERT(rec->ip->flags & HAMMER_INODE_WOULDBLOCK);
-                       rec->flush_group = rec->ip->flush_group;
+                       KKASSERT(rec->flush_group == rec->ip->flush_group);
                        r = 1;
                } else {
                        r = 0;
@@ -1730,45 +1753,75 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
         * can be flushed.
         */
        ip = rec->ip;
+       flg = ip->flush_group;
        r = 0;
 
        switch(rec->flush_state) {
        case HAMMER_FST_IDLE:
                /*
-                * Record has no setup dependancy, we can flush it.
+                * The record has no setup dependancy, we can flush it.
                 */
                KKASSERT(rec->target_ip == NULL);
                rec->flush_state = HAMMER_FST_FLUSH;
-               rec->flush_group = ip->flush_group;
+               rec->flush_group = flg;
+               ++flg->refs;
                hammer_ref(&rec->lock);
                r = 1;
                break;
        case HAMMER_FST_SETUP:
                /*
-                * Record has a setup dependancy.  Try to include the
-                * target ip in the flush. 
-                *
-                * We have to be careful here, if we do not do the right
-                * thing we can lose track of dirty inodes and the system
-                * will lockup trying to allocate buffers.
+                * The record has a setup dependancy.  These are typically
+                * directory entry adds and deletes.  Such entries will be
+                * flushed when their inodes are flushed so we do not have
+                * to add them to the flush here.
                 */
                target_ip = rec->target_ip;
                KKASSERT(target_ip != NULL);
                KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
+
+               /*
+                * If the target IP is already flushing in our group
+                * we are golden, otherwise make sure the target
+                * reflushes.
+                */
                if (target_ip->flush_state == HAMMER_FST_FLUSH) {
-                       /*
-                        * If the target IP is already flushing in our group
-                        * we are golden, otherwise make sure the target
-                        * reflushes.
-                        */
-                       if (target_ip->flush_group == ip->flush_group) {
+                       if (target_ip->flush_group == flg) {
                                rec->flush_state = HAMMER_FST_FLUSH;
-                               rec->flush_group = ip->flush_group;
+                               rec->flush_group = flg;
+                               ++flg->refs;
                                hammer_ref(&rec->lock);
                                r = 1;
                        } else {
                                target_ip->flags |= HAMMER_INODE_REFLUSH;
                        }
+                       break;
+               } 
+
+               /*
+                * Target IP is not yet flushing.  This can get complex
+                * because we have to be careful about the recursion.
+                */
+               if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
+                   (target_ip->flags & HAMMER_INODE_REFLUSH) == 0) {
+                       /*
+                        * We aren't reclaiming or trying to flush target_ip.
+                        * Let the record flush with the target.
+                        */
+                       /*r = 0;*/
+               } else if (flg->total_count + flg->refs >
+                          ip->hmp->undo_rec_limit) {
+                       /*
+                        * Our flush group is over-full and we risk blowing
+                        * out the UNDO FIFO.  Stop the scan, flush what we
+                        * have, then reflush the directory.
+                        *
+                        * The directory may be forced through multiple
+                        * flush groups before it can be completely
+                        * flushed.
+                        */
+                       ip->flags |= HAMMER_INODE_REFLUSH;
+                       ip->flags |= HAMMER_INODE_RESIGNAL;
+                       r = -1;
                } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
                        /*
                         * If the target IP is not flushing we can force
@@ -1777,9 +1830,10 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
                         * hand that we CAN deal with.
                         */
                        rec->flush_state = HAMMER_FST_FLUSH;
-                       rec->flush_group = ip->flush_group;
+                       rec->flush_group = flg;
+                       ++flg->refs;
                        hammer_ref(&rec->lock);
-                       hammer_flush_inode_core(target_ip,
+                       hammer_flush_inode_core(target_ip, flg,
                                                HAMMER_FLUSH_RECURSION);
                        r = 1;
                } else {
@@ -1793,9 +1847,10 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
                         * XXX
                         */
                        rec->flush_state = HAMMER_FST_FLUSH;
-                       rec->flush_group = ip->flush_group;
+                       rec->flush_group = flg;
+                       ++flg->refs;
                        hammer_ref(&rec->lock);
-                       hammer_flush_inode_core(target_ip,
+                       hammer_flush_inode_core(target_ip, flg,
                                                HAMMER_FLUSH_RECURSION);
                        r = 1;
                }
@@ -1803,22 +1858,18 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
        case HAMMER_FST_FLUSH:
                /* 
                 * If the WOULDBLOCK flag is set records may have been left
-                * over from a previous flush attempt and should be moved
-                * to the current flush group.  If it is not set then all
-                * such records had better have been flushed already or
-                * already associated with the current flush group.
+                * over from a previous flush attempt.  The flush group will
+                * have been left intact - we are probably reflushing it
+                * now.
                 */
-               if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
-                       rec->flush_group = ip->flush_group;
-               } else {
-                       KKASSERT(rec->flush_group == ip->flush_group);
-               }
+               KKASSERT(rec->flush_group == flg);
                r = 1;
                break;
        }
        return(r);
 }
 
+#if 0
 /*
  * This version just moves records already in a flush state to the new
  * flush group and that is it.
@@ -1830,47 +1881,31 @@ hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
 
        switch(rec->flush_state) {
        case HAMMER_FST_FLUSH:
-               if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
-                       rec->flush_group = ip->flush_group;
-               } else {
-                       KKASSERT(rec->flush_group == ip->flush_group);
-               }
+               KKASSERT(rec->flush_group == ip->flush_group);
                break;
        default:
                break;
        }
        return(0);
 }
+#endif
 
 /*
- * Wait for a previously queued flush to complete.  Not only do we need to
- * wait for the inode to sync out, we also may have to run the flusher again
- * to get it past the UNDO position pertaining to the flush so a crash does
- * not 'undo' our flush.
+ * Wait for a previously queued flush to complete.
  */
 void
 hammer_wait_inode(hammer_inode_t ip)
 {
-       hammer_mount_t hmp = ip->hmp;
-       int sync_group;
-       int waitcount;
-
-       sync_group = ip->flush_group;
-       waitcount = (ip->flags & HAMMER_INODE_REFLUSH) ? 2 : 1;
+       hammer_flush_group_t flg;
 
+       flg = NULL;
        if (ip->flush_state == HAMMER_FST_SETUP) {
                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        }
-       /* XXX can we make this != FST_IDLE ? check SETUP depends */
-       while (ip->flush_state == HAMMER_FST_FLUSH &&
-              (ip->flush_group - sync_group) < waitcount) {
+       while (ip->flush_state != HAMMER_FST_IDLE) {
                ip->flags |= HAMMER_INODE_FLUSHW;
                tsleep(&ip->flags, 0, "hmrwin", 0);
        }
-       while (hmp->flusher.done - sync_group < waitcount) {
-               kprintf("Y");
-               hammer_flusher_sync(hmp);
-       }
 }
 
 /*
@@ -1928,38 +1963,56 @@ hammer_flush_inode_done(hammer_inode_t ip)
                ip->flags |= HAMMER_INODE_REFLUSH;
 
        /*
-        * Clean up the vnode ref
-        */
-       if (ip->flags & HAMMER_INODE_VHELD) {
-               ip->flags &= ~HAMMER_INODE_VHELD;
-               vrele(ip->vp);
-       }
-
-       /*
-        * Adjust flush_state.  The target state (idle or setup) shouldn't
-        * be terribly important since we will reflush if we really need
-        * to do anything.
-        *
-        * If the WOULDBLOCK flag is set we must re-flush immediately
-        * to continue a potentially large deletion.  The flag also causes
-        * the hammer_setup_child_callback() to move records in the old
-        * flush group to the new one.
+        * Adjust the flush state.
         */
        if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
-               ip->flush_state = HAMMER_FST_IDLE;
-               hammer_flush_inode_core(ip, HAMMER_FLUSH_SIGNAL);
+               /*
+                * We were unable to flush out all our records, leave the
+                * inode in a flush state and in the current flush group.
+                *
+                * This occurs if the UNDO block gets too full
+                * or there is too much dirty meta-data and allows the
+                * flusher to finalize the UNDO block and then re-flush.
+                */
                ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
-               dorel = 1;
-       } else if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
-               ip->flush_state = HAMMER_FST_IDLE;
-               dorel = 1;
-       } else {
-               ip->flush_state = HAMMER_FST_SETUP;
                dorel = 0;
-       }
+       } else {
+               /*
+                * Remove from the flush_group
+                */
+               TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
+               ip->flush_group = NULL;
+
+               /*
+                * Clean up the vnode ref and tracking counts.
+                */
+               if (ip->flags & HAMMER_INODE_VHELD) {
+                       ip->flags &= ~HAMMER_INODE_VHELD;
+                       vrele(ip->vp);
+               }
+               --hmp->count_iqueued;
+               --hammer_count_iqueued;
+
+               /*
+                * And adjust the state.
+                */
+               if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
+                       ip->flush_state = HAMMER_FST_IDLE;
+                       dorel = 1;
+               } else {
+                       ip->flush_state = HAMMER_FST_SETUP;
+                       dorel = 0;
+               }
 
-       --hmp->count_iqueued;
-       --hammer_count_iqueued;
+               /*
+                * If the frontend is waiting for a flush to complete,
+                * wake it up.
+                */
+               if (ip->flags & HAMMER_INODE_FLUSHW) {
+                       ip->flags &= ~HAMMER_INODE_FLUSHW;
+                       wakeup(&ip->flags);
+               }
+       }
 
        /*
         * If the frontend made more changes and requested another flush,
@@ -1984,16 +2037,6 @@ hammer_flush_inode_done(hammer_inode_t ip)
                --hmp->rsv_inodes;
        }
 
-       /*
-        * Finally, if the frontend is waiting for a flush to complete,
-        * wake it up.
-        */
-       if (ip->flush_state != HAMMER_FST_FLUSH) {
-               if (ip->flags & HAMMER_INODE_FLUSHW) {
-                       ip->flags &= ~HAMMER_INODE_FLUSHW;
-                       wakeup(&ip->flags);
-               }
-       }
        if (dorel)
                hammer_rel_inode(ip, 0);
 }
@@ -2019,7 +2062,7 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
 
 #if 1
        if (record->flush_group != record->ip->flush_group) {
-               kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group ,record->ip->flush_group);
+               kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group ,record->ip->flush_group);
                Debugger("blah2");
                return(0);
        }
@@ -2197,7 +2240,7 @@ hammer_sync_inode(hammer_inode_t ip)
        while ((depend = next) != NULL) {
                next = TAILQ_NEXT(depend, target_entry);
                if (depend->flush_state == HAMMER_FST_FLUSH &&
-                   depend->flush_group == ip->hmp->flusher.act) {
+                   depend->flush_group == ip->flush_group) {
                        /*
                         * If this is an ADD that was deleted by the frontend
                         * the frontend nlinks count will have already been
diff --git a/sys/vfs/hammer/hammer_ioctl.c b/sys/vfs/hammer/hammer_ioctl.c
index f3b3d4e..6820de0 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.c,v 1.27 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.c,v 1.28 2008/07/12 23:04:50 dillon Exp $
  */
 
 #include "hammer.h"
@@ -364,7 +364,7 @@ hammer_ioc_synctid(hammer_transaction_t trans, hammer_inode_t ip,
        case HAMMER_SYNCTID_ASYNC:
                hammer_queue_inodes_flusher(hmp, MNT_NOWAIT);
                std->tid = hmp->flusher.tid;    /* inaccurate */
-               hammer_flusher_async(hmp);
+               hammer_flusher_async(hmp, NULL);
                break;
        case HAMMER_SYNCTID_SYNC1:
                hammer_queue_inodes_flusher(hmp, MNT_WAIT);
diff --git a/sys/vfs/hammer/hammer_ioctl.h b/sys/vfs/hammer/hammer_ioctl.h
index 6783621..2c31b8e 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.20 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ioctl.h,v 1.21 2008/07/12 23:04:50 dillon Exp $
  */
 /*
  * HAMMER ioctl's.  This file can be #included from userland
@@ -299,6 +299,7 @@ typedef union hammer_ioc_mrecord_any *hammer_ioc_mrecord_any_t;
 #define HAMMER_MREC_TYPE_SYNC          4       /* (userland only) */
 #define HAMMER_MREC_TYPE_SKIP          5       /* skip-range */
 #define HAMMER_MREC_TYPE_PASS          6       /* record for cmp only (pass) */
+#define HAMMER_MREC_TYPE_TERM          7       /* (userland only) */
 
 #define HAMMER_MREC_CRCOFF     (offsetof(struct hammer_ioc_mrecord_head, rec_size))
 #define HAMMER_MREC_HEADSIZE   sizeof(struct hammer_ioc_mrecord_head)
diff --git a/sys/vfs/hammer/hammer_mirror.c b/sys/vfs/hammer/hammer_mirror.c
index cc4c8a7..82d9f8f 100644
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.13 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.14 2008/07/12 23:04:50 dillon Exp $
  */
 /*
  * HAMMER mirroring ioctls - serialize and deserialize modifications made
@@ -355,11 +355,11 @@ hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
                 * cache as well.
                 */
                if (hammer_flusher_meta_halflimit(trans->hmp) ||
-                   hammer_flusher_undo_exhausted(trans, 1)) {
+                   hammer_flusher_undo_exhausted(trans, 2)) {
                        hammer_unlock_cursor(&cursor, 0);
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor, 0);
-                       seq = hammer_flusher_async(trans->hmp);
+                       seq = hammer_flusher_async(trans->hmp, NULL);
                }
 
                /*
@@ -374,7 +374,7 @@ hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
                        hammer_unlock_cursor(&cursor, 0);
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor, 0);
-                       seq = hammer_flusher_async(trans->hmp);
+                       seq = hammer_flusher_async(trans->hmp, NULL);
                }
 
 
index 3a10ae8..a7e733f 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.87 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.88 2008/07/12 23:04:50 dillon Exp $
  */
 
 #include "hammer.h"
@@ -304,6 +304,9 @@ hammer_flush_record_done(hammer_record_t record, int error)
                Debugger("flush_record_done error");
        }
 
+       --record->flush_group->refs;
+       record->flush_group = NULL;
+
        if (record->flags & HAMMER_RECF_DELETED_BE) {
                if ((target_ip = record->target_ip) != NULL) {
                        TAILQ_REMOVE(&target_ip->target_list, record,
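
hammer_flush_record_done() now drops a reference on the record's flush group
and clears the back-pointer.  The matching acquisition is not part of this
hunk, so the pairing below is only a hypothetical sketch of the presumed
lifetime rule: a flush group cannot be retired while queued records still
reference it.

	/*
	 * Hypothetical sketch; 'flg' and the assignment site are not taken
	 * from this hunk.  When a record is queued into a flush group:
	 */
	record->flush_group = flg;
	++flg->refs;			/* record keeps the group alive */

	/* ...and when the backend finishes the record (hunk above): */
	--record->flush_group->refs;
	record->flush_group = NULL;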
index 7e977e2..32d5d9b 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.67 2008/07/09 10:29:20 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.68 2008/07/12 23:04:50 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -1375,7 +1375,7 @@ hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
                 hammer_flusher_sync(hmp);
                 hammer_flusher_sync(hmp);
        } else {
-                hammer_flusher_async(hmp);
+                hammer_flusher_async(hmp, NULL);
        }
        return(info.error);
 }
index 609ab66..f506822 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.15 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.16 2008/07/12 23:04:50 dillon Exp $
  */
 
 #include "hammer.h"
@@ -215,11 +215,11 @@ retry:
                ++prune->stat_scanrecords;
 
                if (hammer_flusher_meta_halflimit(trans->hmp) ||
-                   hammer_flusher_undo_exhausted(trans, 1)) {
+                   hammer_flusher_undo_exhausted(trans, 2)) {
                        hammer_unlock_cursor(&cursor, 0);
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor, 0);
-                       seq = hammer_flusher_async(trans->hmp);
+                       seq = hammer_flusher_async(trans->hmp, NULL);
                }
                hammer_sync_lock_sh(trans);
                error = hammer_btree_iterate_reverse(&cursor);
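
The prune loop above, like the mirror-write loop earlier and the reblock loop
below, now passes 2 instead of 1 as the second argument to
hammer_flusher_undo_exhausted().  Assuming that argument counts quarters of
the UNDO FIFO (an assumption drawn from the call sites, not from code shown
in this diff), the sketch below illustrates the effect: the bulk ioctls now
back off and wait for the flusher while roughly half of the UNDO space is
still free, instead of letting it drain to the last quarter.

	/*
	 * Hypothetical illustration only (all names made up): how a
	 * quarter-count threshold would behave for the two values used
	 * before and after this commit.
	 */
	static int
	undo_exhausted_sketch(int64_t undo_free, int64_t undo_max, int quarter)
	{
		/* true when fewer than quarter/4 of the UNDO bytes remain */
		return (undo_free < undo_max * quarter / 4);
	}
	/* quarter == 1: trigger below 25% free; quarter == 2: below 50% free */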
index f8ddf69..aa2b1e2 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.28 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_reblock.c,v 1.29 2008/07/12 23:04:50 dillon Exp $
  */
 /*
  * HAMMER reblocker - This code frees up fragmented physical space
@@ -142,7 +142,7 @@ retry:
                        hammer_unlock_cursor(&cursor, 0);
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor, 0);
-                       seq = hammer_flusher_async(trans->hmp);
+                       seq = hammer_flusher_async(trans->hmp, NULL);
                }
 
                /*
@@ -156,11 +156,11 @@ retry:
                hammer_sync_unlock(trans);
 
                if (hammer_flusher_meta_halflimit(trans->hmp) ||
-                   hammer_flusher_undo_exhausted(trans, 1)) {
+                   hammer_flusher_undo_exhausted(trans, 2)) {
                        hammer_unlock_cursor(&cursor, 0);
                        hammer_flusher_wait(trans->hmp, seq);
                        hammer_lock_cursor(&cursor, 0);
-                       seq = hammer_flusher_async(trans->hmp);
+                       seq = hammer_flusher_async(trans->hmp, NULL);
                }
                if (error == 0) {
                        cursor.flags |= HAMMER_CURSOR_ATEDISK;
index 1bfbc98..39a765b 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.61 2008/07/10 21:23:58 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.62 2008/07/12 23:04:50 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -57,6 +57,7 @@ int hammer_debug_tid;
 int hammer_debug_recover;              /* -1 will disable, +1 will force */
 int hammer_debug_recover_faults;
 int hammer_debug_cluster_enable = 1;   /* enable read clustering by default */
+int hammer_count_fsyncs;
 int hammer_count_inodes;
 int hammer_count_iqueued;
 int hammer_count_reclaiming;
@@ -118,6 +119,8 @@ SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_recs, CTLFLAG_RW,
 SYSCTL_INT(_vfs_hammer, OID_AUTO, limit_iqueued, CTLFLAG_RW,
           &hammer_limit_iqueued, 0, "");
 
+SYSCTL_INT(_vfs_hammer, OID_AUTO, count_fsyncs, CTLFLAG_RD,
+          &hammer_count_fsyncs, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_inodes, CTLFLAG_RD,
           &hammer_count_inodes, 0, "");
 SYSCTL_INT(_vfs_hammer, OID_AUTO, count_iqueued, CTLFLAG_RD,
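
The new read-only vfs.hammer.count_fsyncs node exports the counter that the
hammer_vnops.c hunk at the end of this diff increments on every VOP_FSYNC.
The sysctl name comes straight from the SYSCTL_INT line above; the small
userland reader below is otherwise illustrative.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int fsyncs;
	size_t len = sizeof(fsyncs);

	/* node added by this commit: vfs.hammer.count_fsyncs (read-only) */
	if (sysctlbyname("vfs.hammer.count_fsyncs", &fsyncs, &len,
			 NULL, 0) < 0) {
		perror("sysctlbyname");
		return (1);
	}
	printf("HAMMER fsync calls since boot: %d\n", fsyncs);
	return (0);
}

The same value is, of course, also visible from the shell via the sysctl(8)
utility.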
@@ -297,8 +300,8 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
                hmp->undo_lock.refs = 1;
                hmp->blkmap_lock.refs = 1;
 
-               TAILQ_INIT(&hmp->flush_list);
                TAILQ_INIT(&hmp->delay_list);
+               TAILQ_INIT(&hmp->flush_group_list);
                TAILQ_INIT(&hmp->objid_cache_list);
                TAILQ_INIT(&hmp->undo_lru_list);
                TAILQ_INIT(&hmp->reclaim_list);
@@ -442,6 +445,16 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
        bcopy(rootvol->ondisk->vol0_blockmap, hmp->blockmap,
              sizeof(hmp->blockmap));
 
+       /*
+        * The undo_rec_limit limits the size of flush groups to avoid
+        * blowing out the UNDO FIFO.  The computed limit is typically in
+        * the tens of thousands; it matters primarily when small HAMMER
+        * filesystems are created.
+        */
+       hmp->undo_rec_limit = hammer_undo_max(hmp) / 8192 + 100;
+       if (hammer_debug_general & 0x0001)
+               kprintf("HAMMER: undo_rec_limit %d\n", hmp->undo_rec_limit);
+
        error = hammer_recover(hmp, rootvol);
        if (error) {
                kprintf("Failed to recover HAMMER filesystem on mount\n");
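
To put concrete numbers on the undo_rec_limit formula added above (the UNDO
FIFO sizes here are made-up examples; hammer_undo_max() simply reports
whatever size the filesystem was created with):

	/*
	 * Illustrative arithmetic only, using hypothetical UNDO FIFO sizes.
	 *
	 *   128MB UNDO FIFO:  134217728 / 8192 + 100 = 16384 + 100 = 16484
	 *    16MB UNDO FIFO:   16777216 / 8192 + 100 =  2048 + 100 =  2148
	 *
	 * so each flush group is capped at a few thousand to a few tens of
	 * thousands of records, which is why the limit matters most on
	 * small HAMMER filesystems with small UNDO FIFOs.
	 */
	hmp->undo_rec_limit = hammer_undo_max(hmp) / 8192 + 100;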
index 8b4eb35..073627a 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.87 2008/07/12 02:47:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.88 2008/07/12 23:04:50 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -182,6 +182,7 @@ hammer_vop_fsync(struct vop_fsync_args *ap)
 {
        hammer_inode_t ip = VTOI(ap->a_vp);
 
+       ++hammer_count_fsyncs;
        vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (ap->a_waitfor == MNT_WAIT)