From 4afeea0db0b1a8e6b5132e06a39111a64ad2487e Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 19 Aug 2009 23:27:58 -0700
Subject: [PATCH] BIOQ - Create a more sophisticated bursting mechanic for
 writes.

* Add sysctls:
	vfs.bioq_reorder_minor_interval
	vfs.bioq_reorder_minor_bytes
	vfs.bioq_reorder_burst_interval
	vfs.bioq_reorder_burst_bytes

* Reads are prioritized over writes.  Every N (minor_interval) reads,
  up to (minor_bytes) worth of writes are allowed in order to avoid
  complete write starvation (which is just as bad as complete read
  starvation).

* If a large amount of write data is queued to devices, the related
  buffer cache buffers will be locked.  This is not a good place to be,
  as any readers trying to access those buffers will then also block.
  When the amount of data is excessive, the (burst_interval) and
  (burst_bytes) come into play.  This primarily helps HAMMER flush
  cycles.

* Try to keep vfs.hirunningspace reasonably bounded.  We do not want
  too large a value because of the above buffer locking issue.  On the
  flip side, we can have large numbers of delayed-write dirty buffers
  sitting around because they aren't locked.

* When the buffer cache is hitting hidirtybufspace the buffer daemons
  now try to keep vfs.runningbufspace at hirunningspace levels instead
  of lorunningspace levels in order to trigger the bioq's bursting
  mode.
---
 sys/dev/disk/nata/ata-queue.c |  4 +-
 sys/kern/subr_disk.c          | 83 ++++++++++++++++++++++++++---------
 sys/kern/vfs_bio.c            | 77 +++++++++++++++++++++++---------
 sys/sys/buf.h                 |  7 ++-
 4 files changed, 127 insertions(+), 44 deletions(-)

diff --git a/sys/dev/disk/nata/ata-queue.c b/sys/dev/disk/nata/ata-queue.c
index 0aa98d08e6..2c1747828d 100644
--- a/sys/dev/disk/nata/ata-queue.c
+++ b/sys/dev/disk/nata/ata-queue.c
@@ -614,7 +614,7 @@ ata_sort_queue(struct ata_channel *ch, struct ata_request *request)
 	 * Insert before the first write
 	 */
 	TAILQ_INSERT_BEFORE(ch->transition, request, chain);
-	if (++ch->reorder >= bioq_reorder_interval) {
+	if (++ch->reorder >= bioq_reorder_minor_interval) {
 		ch->reorder = 0;
 		atawritereorder(ch);
 	}
@@ -651,7 +651,7 @@ atawritereorder(struct ata_channel *ch)
 {
 	struct ata_request *req;
 	u_int64_t next_offset;
-	size_t left = (size_t)bioq_reorder_bytes;
+	size_t left = (size_t)bioq_reorder_minor_bytes;
 	size_t n;
 
 	next_offset = ata_get_lba(ch->transition);
diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c
index e95823ca87..716d39c381 100644
--- a/sys/kern/subr_disk.c
+++ b/sys/kern/subr_disk.c
@@ -874,16 +874,26 @@ SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
     0, sizeof(struct disk), "sizeof(struct disk)");
 
 /*
- * How sorted do we want to be?  The higher the number the harder we try
- * to sort, but also the higher the risk of bio's getting starved do
- * to insertions in front of them.
+ * Reorder interval for burst write allowance and minor write
+ * allowance.
+ *
+ * We always want to trickle some writes in to make use of the
+ * disk's zone cache.  Bursting occurs on a longer interval and only
+ * when runningbufspace is well over the hirunningspace limit.
 */
-int bioq_reorder_interval = 8;
-SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_interval,
-	   CTLFLAG_RW, &bioq_reorder_interval, 0, "");
-int bioq_reorder_bytes = 262144;
-SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_bytes,
-	   CTLFLAG_RW, &bioq_reorder_bytes, 0, "");
+int bioq_reorder_burst_interval = 60;	/* should be multiple of minor */
+SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_interval,
+	   CTLFLAG_RW, &bioq_reorder_burst_interval, 0, "");
+int bioq_reorder_minor_interval = 5;
+SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_interval,
+	   CTLFLAG_RW, &bioq_reorder_minor_interval, 0, "");
+
+int bioq_reorder_burst_bytes = 3000000;
+SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_bytes,
+	   CTLFLAG_RW, &bioq_reorder_burst_bytes, 0, "");
+int bioq_reorder_minor_bytes = 262144;
+SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_bytes,
+	   CTLFLAG_RW, &bioq_reorder_minor_bytes, 0, "");
 
 /*
@@ -895,23 +905,40 @@ SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_bytes,
  * of writes to be queued asynchronously.  This creates a huge bottleneck
  * for reads which reduce read bandwidth to a trickle.
  *
- * To solve this problem we generally reorder reads before writes.  However,
- * a large number of random reads can also starve writes and make poor use
- * of the drive zone cache so we allow writes to trickle in every N reads.
+ * To solve this problem we generally reorder reads before writes.
+ *
+ * However, a large number of random reads can also starve writes and
+ * make poor use of the drive zone cache so we allow writes to trickle
+ * in every N reads.
  */
 void
 bioqdisksort(struct bio_queue_head *bioq, struct bio *bio)
 {
+	/*
+	 * The BIO wants to be ordered.  Adding to the tail also
+	 * causes transition to be set to NULL, forcing the ordering
+	 * of all prior I/O's.
+	 */
+	if (bio->bio_buf->b_flags & B_ORDERED) {
+		bioq_insert_tail(bioq, bio);
+		return;
+	}
+
 	switch(bio->bio_buf->b_cmd) {
 	case BUF_CMD_READ:
 		if (bioq->transition) {
 			/*
-			 * Insert before the first write.
+			 * Insert before the first write.  Bleedover writes
+			 * based on reorder intervals to prevent starvation.
 			 */
 			TAILQ_INSERT_BEFORE(bioq->transition, bio, bio_act);
-			if (++bioq->reorder >= bioq_reorder_interval) {
-				bioq->reorder = 0;
+			++bioq->reorder;
+			if (bioq->reorder % bioq_reorder_minor_interval == 0) {
 				bioqwritereorder(bioq);
+				if (bioq->reorder >=
+				    bioq_reorder_burst_interval) {
+					bioq->reorder = 0;
+				}
 			}
 		} else {
 			/*
@@ -936,14 +963,19 @@ bioqdisksort(struct bio_queue_head *bioq, struct bio *bio)
 	 * All other request types are forced to be ordered.
 	 */
 	bioq_insert_tail(bioq, bio);
-		return;
+		break;
 	}
 }
 
 /*
- * Move the transition point to prevent reads from completely
- * starving our writes.  This brings a number of writes into
+ * Move the read-write transition point to prevent reads from
+ * completely starving our writes.  This brings a number of writes into
  * the fold every N reads.
+ *
+ * We bring a few linear writes into the fold on a minor interval
+ * and we bring a non-linear burst of writes into the fold on a major
+ * interval.  Bursting only occurs if runningbufspace is really high
+ * (typically from syncs, fsyncs, or HAMMER flushes).
 */
static void
@@ -951,12 +983,23 @@ bioqwritereorder(struct bio_queue_head *bioq)
 {
 	struct bio *bio;
 	off_t next_offset;
-	size_t left = (size_t)bioq_reorder_bytes;
+	size_t left;
 	size_t n;
+	int check_off;
+
+	if (bioq->reorder < bioq_reorder_burst_interval ||
+	    !buf_runningbufspace_severe()) {
+		left = (size_t)bioq_reorder_minor_bytes;
+		check_off = 1;
+	} else {
+		left = (size_t)bioq_reorder_burst_bytes;
+		check_off = 0;
+	}
 
 	next_offset = bioq->transition->bio_offset;
 	while ((bio = bioq->transition) != NULL &&
-	       next_offset == bio->bio_offset) {
+	       (check_off == 0 || next_offset == bio->bio_offset)
+	) {
 		n = bio->bio_buf->b_bcount;
 		next_offset = bio->bio_offset + n;
 		bioq->transition = TAILQ_NEXT(bio, bio_act);
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 3240f58475..650ad7f4cc 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -236,12 +236,18 @@ static __inline void
 runningbufwakeup(struct buf *bp)
 {
 	int totalspace;
+	int limit;
 
 	if ((totalspace = bp->b_runningbufspace) != 0) {
 		atomic_subtract_int(&runningbufspace, totalspace);
 		atomic_subtract_int(&runningbufcount, 1);
 		bp->b_runningbufspace = 0;
-		if (runningbufreq && runningbufspace <= lorunningspace) {
+
+		/*
+		 * see waitrunningbufspace() for limit test.
+		 */
+		limit = hirunningspace * 2 / 3;
+		if (runningbufreq && runningbufspace <= limit) {
 			runningbufreq = 0;
 			wakeup(&runningbufreq);
 		}
@@ -273,33 +279,33 @@ bufcountwakeup(void)
 /*
  * waitrunningbufspace()
  *
- * Wait for the amount of running I/O to drop to a reasonable level.
+ * Wait for the amount of running I/O to drop to hirunningspace * 2 / 3.
+ * This is the point where write bursting stops so we don't want to wait
+ * for the running amount to drop below it (at least if we still want bioq
+ * to burst writes).
  *
  * The caller may be using this function to block in a tight loop, we
- * must block of runningbufspace is greater then the passed limit.
+ * must block while runningbufspace is greater than or equal to
+ * hirunningspace * 2 / 3.
+ *
  * And even with that it may not be enough, due to the presence of
 * B_LOCKED dirty buffers, so also wait for at least one running buffer
 * to complete.
  */
 static __inline void
-waitrunningbufspace(int limit)
+waitrunningbufspace(void)
 {
-	int lorun;
-
-	if (lorunningspace < limit)
-		lorun = lorunningspace;
-	else
-		lorun = limit;
+	int limit = hirunningspace * 2 / 3;
 
 	crit_enter();
-	if (runningbufspace > lorun) {
-		while (runningbufspace > lorun) {
+	if (runningbufspace > limit) {
+		while (runningbufspace > limit) {
 			++runningbufreq;
-			tsleep(&runningbufreq, 0, "wdrain", 0);
+			tsleep(&runningbufreq, 0, "wdrn1", 0);
 		}
 	} else if (runningbufspace) {
 		++runningbufreq;
-		tsleep(&runningbufreq, 0, "wdrain2", 1);
+		tsleep(&runningbufreq, 0, "wdrn2", 1);
 	}
 	crit_exit();
 }
@@ -316,6 +322,16 @@ buf_dirty_count_severe(void)
 		dirtybufcount >= nbuf / 2);
 }
 
+/*
+ * Return true if the amount of running I/O is severe and BIOQ should
+ * start bursting.
+ */
+int
+buf_runningbufspace_severe(void)
+{
+	return (runningbufspace >= hirunningspace * 2 / 3);
+}
+
 /*
  * vfs_buf_test_cache:
  *
@@ -603,7 +619,7 @@ bufinit(void)
 	lobufspace = hibufspace - MAXBSIZE;
 
 	lorunningspace = 512 * 1024;
-	hirunningspace = 1024 * 1024;
+	/* hirunningspace -- see below */
 
 	/*
 	 * Limit the amount of malloc memory since it is wired permanently
@@ -617,8 +633,17 @@
 	/*
 	 * Reduce the chance of a deadlock occuring by limiting the number
 	 * of delayed-write dirty buffers we allow to stack up.
+	 *
+	 * We don't want too much actually queued to the device at once
+	 * (XXX this needs to be per-mount!), because the buffers will
+	 * wind up locked for a very long period of time while the I/O
+	 * drains.
 	 */
-	hidirtybufspace = hibufspace / 2;
+	hidirtybufspace = hibufspace / 2;	/* dirty + running */
+	hirunningspace = hibufspace / 16;	/* locked & queued to device */
+	if (hirunningspace < 1024 * 1024)
+		hirunningspace = 1024 * 1024;
+
 	dirtybufspace = 0;
 	dirtybufspacehw = 0;
 
@@ -2265,6 +2290,10 @@ buf_daemon(void)
 		kproc_suspend_loop();
 
 		/*
+		 * Do the flush as long as the number of dirty buffers
+		 * (including those running) exceeds lodirtybufspace.
+		 *
+		 * When flushing, limit running I/O to hirunningspace.
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
 		 * the I/O system.  Wakeup any waiting processes before we
@@ -2274,13 +2303,15 @@ buf_daemon(void)
 		 * but because we split the operation into two threads we
 		 * have to cut it in half for each thread.
 		 */
+		waitrunningbufspace();
 		limit = lodirtybufspace / 2;
-		waitrunningbufspace(limit);
 		while (runningbufspace + dirtybufspace > limit ||
 		       dirtybufcount - dirtybufcounthw >= nbuf / 2) {
 			if (flushbufqueues(BQUEUE_DIRTY) == 0)
 				break;
-			waitrunningbufspace(limit);
+			if (runningbufspace < hirunningspace)
+				continue;
+			waitrunningbufspace();
 		}
 
 		/*
@@ -2324,17 +2355,23 @@ buf_daemon_hw(void)
 		 * the I/O system.  Wakeup any waiting processes before we
 		 * normally would so they can run in parallel with our drain.
 		 *
+		 * Once we decide to flush, push the queued I/O up to
+		 * hirunningspace in order to trigger bursting by the bioq
+		 * subsystem.
+		 *
 		 * Our aggregate normal+HW lo water mark is lodirtybufspace,
 		 * but because we split the operation into two threads we
 		 * have to cut it in half for each thread.
 		 */
+		waitrunningbufspace();
 		limit = lodirtybufspace / 2;
-		waitrunningbufspace(limit);
 		while (runningbufspace + dirtybufspacehw > limit ||
 		       dirtybufcounthw >= nbuf / 2) {
 			if (flushbufqueues(BQUEUE_DIRTY_HW) == 0)
 				break;
-			waitrunningbufspace(limit);
+			if (runningbufspace < hirunningspace)
+				continue;
+			waitrunningbufspace();
 		}
 
 		/*
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 948195e33d..3fe1417d3b 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -376,8 +376,10 @@ extern char *buffers;		/* The buffer contents. */
 extern int bufpages;		/* Number of memory pages in the buffer pool. */
 extern struct buf *swbuf;	/* Swap I/O buffer headers. */
 extern int nswbuf;		/* Number of swap I/O buffer headers. */
-extern int bioq_reorder_interval;
-extern int bioq_reorder_bytes;
+extern int bioq_reorder_burst_interval;
+extern int bioq_reorder_burst_bytes;
+extern int bioq_reorder_minor_interval;
+extern int bioq_reorder_minor_bytes;
 
 struct uio;
 
@@ -385,6 +387,7 @@ void	bufinit (void);
 int	bd_heatup (void);
 void	bd_wait (int count);
 int	buf_dirty_count_severe (void);
+int	buf_runningbufspace_severe (void);
 void	initbufbio(struct buf *);
 void	reinitbufbio(struct buf *);
 void	clearbiocache(struct bio *);
-- 
2.41.0
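
As a footnote for readers who want to experiment with the policy outside
the kernel, below is a minimal user-space sketch of the two-tier reorder
decision described above.  It is illustrative only: write_allowance(),
runningbufspace_severe() and the simulated counters are local to this
example and are not kernel interfaces.  Only the default tunables (a
trickle every 5th read, a possible burst every 60th; 262144 vs 3000000
bytes) and the hirunningspace * 2 / 3 severity test are taken from the
patch itself.

/*
 * Standalone sketch (not DragonFly kernel code) of the two-tier
 * write-reorder policy.  Build with: cc -o bioq_sketch bioq_sketch.c
 */
#include <stdio.h>
#include <stddef.h>

static int reorder_minor_interval = 5;	     /* trickle writes every 5 reads */
static int reorder_burst_interval = 60;	     /* burst at most every 60 reads */
static size_t reorder_minor_bytes = 262144;  /* linear trickle budget */
static size_t reorder_burst_bytes = 3000000; /* non-linear burst budget */

static size_t runningbufspace;		     /* bytes queued to the device */
static size_t hirunningspace = 1024 * 1024;  /* high water for queued writes */

/*
 * Mirrors buf_runningbufspace_severe(): running I/O is "severe" at 2/3
 * of the high water mark, the same point waitrunningbufspace() blocks at.
 */
static int
runningbufspace_severe(void)
{
	return (runningbufspace >= hirunningspace * 2 / 3);
}

/*
 * Decide how many write bytes may be pulled ahead of queued reads and
 * whether their offsets must stay contiguous.  Mirrors the head of
 * bioqwritereorder(): a small contiguous trickle on the minor interval,
 * a large unconstrained burst on the major interval once the device is
 * already saturated with writes.
 */
static size_t
write_allowance(int reorder, int *check_off)
{
	if (reorder < reorder_burst_interval || !runningbufspace_severe()) {
		*check_off = 1;		/* contiguous (linear) writes only */
		return (reorder_minor_bytes);
	}
	*check_off = 0;			/* any writes, much larger budget */
	return (reorder_burst_bytes);
}

int
main(void)
{
	int reorder, check_off;
	size_t budget;

	/* Idle device: every 5th read releases only a small linear trickle. */
	runningbufspace = 0;
	for (reorder = reorder_minor_interval;
	     reorder <= reorder_burst_interval;
	     reorder += reorder_minor_interval) {
		budget = write_allowance(reorder, &check_off);
		printf("reorder=%2d budget=%7zu contiguous-only=%d\n",
		    reorder, budget, check_off);
	}

	/* Saturated device: the 60th read triggers a full burst instead. */
	runningbufspace = hirunningspace;
	budget = write_allowance(reorder_burst_interval, &check_off);
	printf("severe: reorder=%2d budget=%7zu contiguous-only=%d\n",
	    reorder_burst_interval, budget, check_off);
	return (0);
}

The sketch also shows why the same 2/3 cutoff appears in both
waitrunningbufspace() and buf_runningbufspace_severe(): the buffer
daemons deliberately let runningbufspace ride at that threshold, which
is exactly where the bioq switches from small contiguous trickles to
large unconstrained bursts.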