BIOQ - Create a more sophisticated bursting mechanism for writes.
author     Matthew Dillon <dillon@apollo.backplane.com>
           Thu, 20 Aug 2009 06:27:58 +0000 (23:27 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Thu, 20 Aug 2009 06:27:58 +0000 (23:27 -0700)
* Add sysctls:

  vfs.bioq_reorder_minor_interval
  vfs.bioq_reorder_minor_bytes

  vfs.bioq_reorder_burst_interval
  vfs.bioq_reorder_burst_bytes

* Reads are prioritized over writes.  Every N (minor_interval) reads,
  up to (minor_bytes) worth of writes are allowed through in order to
  avoid complete write starvation (which is just as bad as complete
  read starvation); see the sketch after the next bullet.

* If a large amount of write data is queued to devices, the related
  buffer cache buffers will be locked.  This is not a good place to
  be, as any readers trying to access those buffers will then also
  block.

  When the amount of queued write data is excessive, the
  (burst_interval) and (burst_bytes) come into play.  This primarily
  helps HAMMER flush cycles.
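
  A minimal userland model of that decision (hypothetical names and a
  forced-severe flag; the real logic is in bioqdisksort() and
  bioqwritereorder() in the diff below, and running_severe stands in
  for buf_runningbufspace_severe()):

    #include <stdio.h>

    static int minor_interval = 5;      /* vfs.bioq_reorder_minor_interval */
    static int burst_interval = 60;     /* vfs.bioq_reorder_burst_interval */
    static int minor_bytes = 262144;    /* vfs.bioq_reorder_minor_bytes */
    static int burst_bytes = 3000000;   /* vfs.bioq_reorder_burst_bytes */

    static int reorder;                 /* reads since last burst reset */
    static int running_severe = 1;      /* assume heavy running I/O */

    /*
     * Called once per queued read; returns the write byte budget
     * released at this point (0 = keep prioritizing reads).
     */
    static int
    write_budget(void)
    {
            int budget = 0;

            if (++reorder % minor_interval == 0) {
                    if (reorder >= burst_interval && running_severe)
                            budget = burst_bytes;  /* big non-linear burst */
                    else
                            budget = minor_bytes;  /* small linear trickle */
                    if (reorder >= burst_interval)
                            reorder = 0;
            }
            return (budget);
    }

    int
    main(void)
    {
            int i;

            for (i = 1; i <= 120; ++i) {
                    int b = write_budget();
                    if (b)
                            printf("read %3d: release %7d bytes\n", i, b);
            }
            return (0);
    }

  With the default values this trickles 256KB of linear writes every
  5 reads and, while running I/O is severe, bursts ~3MB (without the
  contiguous-offset restriction) every 60 reads.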

* Try to keep vfs.hirunningspace reasonably bounded.  We do not want
  too large a value because of the buffer locking issue above.  On the
  flip side, we can afford to have large numbers of delayed-write
  dirty buffers sitting around, because those are not locked; see the
  worked example after the next bullet.

* When the buffer cache is hitting hidirtybufspace, the buffer
  daemons now try to keep vfs.runningbufspace at hirunningspace
  levels instead of lorunningspace levels, in order to trigger the
  bioq's bursting mode.
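
  A worked example of the new sizing and the resulting bursting band
  (the hibufspace figure is an assumption for illustration; bufinit()
  derives the real value from available memory):

    #include <stdio.h>

    #define MB (1024 * 1024)

    int
    main(void)
    {
            int hibufspace = 200 * MB;            /* assumed */
            int hidirtybufspace = hibufspace / 2; /* dirty + running */
            int hirunningspace = hibufspace / 16; /* locked & queued */

            if (hirunningspace < 1 * MB)          /* floor from bufinit() */
                    hirunningspace = 1 * MB;

            /*
             * waitrunningbufspace() only drains down to this point,
             * and buf_runningbufspace_severe() reports true at or
             * above it, so the daemons hold running I/O inside the
             * band where bioq bursts.
             */
            int burst_floor = hirunningspace * 2 / 3;

            printf("hidirtybufspace %d MB\n", hidirtybufspace / MB);
            printf("burst band      %d MB .. %d MB\n",
                   burst_floor / MB, hirunningspace / MB);
            return (0);
    }

  Draining all the way down to lorunningspace (512KB), as the old
  waitrunningbufspace() did, would immediately drop the queue out of
  that band and kill the burst.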

sys/dev/disk/nata/ata-queue.c
sys/kern/subr_disk.c
sys/kern/vfs_bio.c
sys/sys/buf.h

sys/dev/disk/nata/ata-queue.c
index 0aa98d0..2c17478 100644
@@ -614,7 +614,7 @@ ata_sort_queue(struct ata_channel *ch, struct ata_request *request)
             * Insert before the first write
             */
            TAILQ_INSERT_BEFORE(ch->transition, request, chain);
-           if (++ch->reorder >= bioq_reorder_interval) {
+           if (++ch->reorder >= bioq_reorder_minor_interval) {
                ch->reorder = 0;
                atawritereorder(ch);
            }
@@ -651,7 +651,7 @@ atawritereorder(struct ata_channel *ch)
 {
     struct ata_request *req;
     u_int64_t next_offset;
-    size_t left = (size_t)bioq_reorder_bytes;
+    size_t left = (size_t)bioq_reorder_minor_bytes;
     size_t n;
 
     next_offset = ata_get_lba(ch->transition);

sys/kern/subr_disk.c
index e95823c..716d39c 100644
@@ -874,16 +874,26 @@ SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
     0, sizeof(struct disk), "sizeof(struct disk)");
 
 /*
- * How sorted do we want to be?  The higher the number the harder we try
- * to sort, but also the higher the risk of bio's getting starved do
- * to insertions in front of them.
+ * Reorder interval for burst write allowance and minor write
+ * allowance.
+ *
+ * We always want to trickle some writes in to make use of the
+ * disk's zone cache.  Bursting occurs on a longer interval and only
+ * when runningbufspace is well over the hirunningspace limit.
  */
-int bioq_reorder_interval = 8;
-SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_interval,
-          CTLFLAG_RW, &bioq_reorder_interval, 0, "");
-int bioq_reorder_bytes = 262144;
-SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_bytes,
-          CTLFLAG_RW, &bioq_reorder_bytes, 0, "");
+int bioq_reorder_burst_interval = 60;  /* should be multiple of minor */
+SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_interval,
+          CTLFLAG_RW, &bioq_reorder_burst_interval, 0, "");
+int bioq_reorder_minor_interval = 5;
+SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_interval,
+          CTLFLAG_RW, &bioq_reorder_minor_interval, 0, "");
+
+int bioq_reorder_burst_bytes = 3000000;
+SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_bytes,
+          CTLFLAG_RW, &bioq_reorder_burst_bytes, 0, "");
+int bioq_reorder_minor_bytes = 262144;
+SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_bytes,
+          CTLFLAG_RW, &bioq_reorder_minor_bytes, 0, "");
 
 
 /*
@@ -895,23 +905,40 @@ SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_bytes,
  * of writes to be queued asynchronously.  This creates a huge bottleneck
  * for reads which reduce read bandwidth to a trickle.
  *
- * To solve this problem we generally reorder reads before writes.  However,
- * a large number of random reads can also starve writes and make poor use
- * of the drive zone cache so we allow writes to trickle in every N reads.
+ * To solve this problem we generally reorder reads before writes.
+ *
+ * However, a large number of random reads can also starve writes and
+ * make poor use of the drive zone cache so we allow writes to trickle
+ * in every N reads.
  */
 void
 bioqdisksort(struct bio_queue_head *bioq, struct bio *bio)
 {
+       /*
+        * The BIO wants to be ordered.  Adding to the tail also
+        * causes transition to be set to NULL, forcing the ordering
+        * of all prior I/O's.
+        */
+       if (bio->bio_buf->b_flags & B_ORDERED) {
+               bioq_insert_tail(bioq, bio);
+               return;
+       }
+
        switch(bio->bio_buf->b_cmd) {
        case BUF_CMD_READ:
                if (bioq->transition) {
                        /*
-                        * Insert before the first write.
+                        * Insert before the first write.  Bleedover writes
+                        * based on reorder intervals to prevent starvation.
                         */
                        TAILQ_INSERT_BEFORE(bioq->transition, bio, bio_act);
-                       if (++bioq->reorder >= bioq_reorder_interval) {
-                               bioq->reorder = 0;
+                       ++bioq->reorder;
+                       if (bioq->reorder % bioq_reorder_minor_interval == 0) {
                                bioqwritereorder(bioq);
+                               if (bioq->reorder >=
+                                   bioq_reorder_burst_interval) {
+                                       bioq->reorder = 0;
+                               }
                        }
                } else {
                        /*
@@ -936,14 +963,19 @@ bioqdisksort(struct bio_queue_head *bioq, struct bio *bio)
                 * All other request types are forced to be ordered.
                 */
                bioq_insert_tail(bioq, bio);
-               return;
+               break;
        }
 }
 
 /*
- * Move the transition point to prevent reads from completely
- * starving our writes.  This brings a number of writes into
+ * Move the read-write transition point to prevent reads from
+ * completely starving our writes.  This brings a number of writes into
  * the fold every N reads.
+ *
+ * We bring a few linear writes into the fold on a minor interval
+ * and we bring a non-linear burst of writes into the fold on a major
+ * interval.  Bursting only occurs if runningbufspace is really high
+ * (typically from syncs, fsyncs, or HAMMER flushes).
  */
 static
 void
@@ -951,12 +983,23 @@ bioqwritereorder(struct bio_queue_head *bioq)
 {
        struct bio *bio;
        off_t next_offset;
-       size_t left = (size_t)bioq_reorder_bytes;
+       size_t left;
        size_t n;
+       int check_off;
+
+       if (bioq->reorder < bioq_reorder_burst_interval ||
+           !buf_runningbufspace_severe()) {
+               left = (size_t)bioq_reorder_minor_bytes;
+               check_off = 1;
+       } else {
+               left = (size_t)bioq_reorder_burst_bytes;
+               check_off = 0;
+       }
 
        next_offset = bioq->transition->bio_offset;
        while ((bio = bioq->transition) != NULL &&
-              next_offset == bio->bio_offset) {
+              (check_off == 0 || next_offset == bio->bio_offset)
+       ) {
                n = bio->bio_buf->b_bcount;
                next_offset = bio->bio_offset + n;
                bioq->transition = TAILQ_NEXT(bio, bio_act);

sys/kern/vfs_bio.c
index 3240f58..650ad7f 100644
@@ -236,12 +236,18 @@ static __inline void
 runningbufwakeup(struct buf *bp)
 {
        int totalspace;
+       int limit;
 
        if ((totalspace = bp->b_runningbufspace) != 0) {
                atomic_subtract_int(&runningbufspace, totalspace);
                atomic_subtract_int(&runningbufcount, 1);
                bp->b_runningbufspace = 0;
-               if (runningbufreq && runningbufspace <= lorunningspace) {
+
+               /*
+                * See waitrunningbufspace() for the limit test.
+                */
+               limit = hirunningspace * 2 / 3;
+               if (runningbufreq && runningbufspace <= limit) {
                        runningbufreq = 0;
                        wakeup(&runningbufreq);
                }
@@ -273,33 +279,33 @@ bufcountwakeup(void)
 /*
  * waitrunningbufspace()
  *
- * Wait for the amount of running I/O to drop to a reasonable level.
+ * Wait for the amount of running I/O to drop to hirunningspace * 2 / 3.
+ * This is the point where write bursting stops, so we don't want to wait
+ * for the running amount to drop below it (at least if we still want bioq
+ * to burst writes).
  *
  * The caller may be using this function to block in a tight loop, we
- * must block of runningbufspace is greater then the passed limit.
+ * must block while runningbufspace is greater than or equal to
+ * hirunningspace * 2 / 3.
+ *
  * And even with that it may not be enough, due to the presence of
  * B_LOCKED dirty buffers, so also wait for at least one running buffer
  * to complete.
  */
 static __inline void
-waitrunningbufspace(int limit)
+waitrunningbufspace(void)
 {
-       int lorun;
-
-       if (lorunningspace < limit)
-               lorun = lorunningspace;
-       else
-               lorun = limit;
+       int limit = hirunningspace * 2 / 3;
 
        crit_enter();
-       if (runningbufspace > lorun) {
-               while (runningbufspace > lorun) {
+       if (runningbufspace > limit) {
+               while (runningbufspace > limit) {
                        ++runningbufreq;
-                       tsleep(&runningbufreq, 0, "wdrain", 0);
+                       tsleep(&runningbufreq, 0, "wdrn1", 0);
                }
        } else if (runningbufspace) {
                ++runningbufreq;
-               tsleep(&runningbufreq, 0, "wdrain2", 1);
+               tsleep(&runningbufreq, 0, "wdrn2", 1);
        }
        crit_exit();
 }
@@ -316,6 +322,16 @@ buf_dirty_count_severe(void)
                dirtybufcount >= nbuf / 2);
 }
 
+/*
+ * Return true if the amount of running I/O is severe and BIOQ should
+ * start bursting.
+ */
+int
+buf_runningbufspace_severe(void)
+{
+       return (runningbufspace >= hirunningspace * 2 / 3);
+}
+
 /*
  * vfs_buf_test_cache:
  *
@@ -603,7 +619,7 @@ bufinit(void)
        lobufspace = hibufspace - MAXBSIZE;
 
        lorunningspace = 512 * 1024;
-       hirunningspace = 1024 * 1024;
+       /* hirunningspace -- see below */
 
        /*
         * Limit the amount of malloc memory since it is wired permanently
@@ -617,8 +633,17 @@ bufinit(void)
        /*
         * Reduce the chance of a deadlock occurring by limiting the number
         * of delayed-write dirty buffers we allow to stack up.
+        *
+        * We don't want too much actually queued to the device at once
+        * (XXX this needs to be per-mount!), because the buffers will
+        * wind up locked for a very long period of time while the I/O
+        * drains.
         */
-       hidirtybufspace = hibufspace / 2;
+       hidirtybufspace = hibufspace / 2;       /* dirty + running */
+       hirunningspace = hibufspace / 16;       /* locked & queued to device */
+       if (hirunningspace < 1024 * 1024)
+               hirunningspace = 1024 * 1024;
+
        dirtybufspace = 0;
        dirtybufspacehw = 0;
 
@@ -2265,6 +2290,10 @@ buf_daemon(void)
                kproc_suspend_loop();
 
                /*
+                * Do the flush as long as the dirty buffer space
+                * (including space undergoing I/O) exceeds lodirtybufspace.
+                *
+                * When flushing, limit running I/O to hirunningspace.
                 * Do the flush.  Limit the amount of in-transit I/O we
                 * allow to build up, otherwise we would completely saturate
                 * the I/O system.  Wakeup any waiting processes before we
@@ -2274,13 +2303,15 @@ buf_daemon(void)
                 * but because we split the operation into two threads we
                 * have to cut it in half for each thread.
                 */
+               waitrunningbufspace();
                limit = lodirtybufspace / 2;
-               waitrunningbufspace(limit);
                while (runningbufspace + dirtybufspace > limit ||
                       dirtybufcount - dirtybufcounthw >= nbuf / 2) {
                        if (flushbufqueues(BQUEUE_DIRTY) == 0)
                                break;
-                       waitrunningbufspace(limit);
+                       if (runningbufspace < hirunningspace)
+                               continue;
+                       waitrunningbufspace();
                }
 
                /*
@@ -2324,17 +2355,23 @@ buf_daemon_hw(void)
                 * the I/O system.  Wakeup any waiting processes before we
                 * normally would so they can run in parallel with our drain.
                 *
+                * Once we decide to flush, push the queued I/O up to
+                * hirunningspace in order to trigger bursting by the bioq
+                * subsystem.
+                *
                 * Our aggregate normal+HW lo water mark is lodirtybufspace,
                 * but because we split the operation into two threads we
                 * have to cut it in half for each thread.
                 */
+               waitrunningbufspace();
                limit = lodirtybufspace / 2;
-               waitrunningbufspace(limit);
                while (runningbufspace + dirtybufspacehw > limit ||
                       dirtybufcounthw >= nbuf / 2) {
                        if (flushbufqueues(BQUEUE_DIRTY_HW) == 0)
                                break;
-                       waitrunningbufspace(limit);
+                       if (runningbufspace < hirunningspace)
+                               continue;
+                       waitrunningbufspace();
                }
 
                /*

sys/sys/buf.h
index 948195e..3fe1417 100644
@@ -376,8 +376,10 @@ extern char        *buffers;               /* The buffer contents. */
 extern int     bufpages;               /* Number of memory pages in the buffer pool. */
 extern struct  buf *swbuf;             /* Swap I/O buffer headers. */
 extern int     nswbuf;                 /* Number of swap I/O buffer headers. */
-extern int     bioq_reorder_interval;
-extern int     bioq_reorder_bytes;
+extern int     bioq_reorder_burst_interval;
+extern int     bioq_reorder_burst_bytes;
+extern int     bioq_reorder_minor_interval;
+extern int     bioq_reorder_minor_bytes;
 
 struct uio;
 
@@ -385,6 +387,7 @@ void        bufinit (void);
 int    bd_heatup (void);
 void   bd_wait (int count);
 int    buf_dirty_count_severe (void);
+int    buf_runningbufspace_severe (void);
 void   initbufbio(struct buf *);
 void   reinitbufbio(struct buf *);
 void   clearbiocache(struct bio *);