From 364c022c7c967c27285b70d069fc14df03ffe267 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 15 Aug 2010 17:20:48 -0700 Subject: [PATCH] kernel - revamp cluster_read API and improve performance * Revamp the API to be much less confusing. Pass a minimum read ahead based on the higher level uio length, and a maximum read ahead based on the sequential heuristic. These combine together to determine how much read-ahead to do. For example if a program is doing random-reads with 1MB read() requests the minreq will take precedence, whereas if the program is doing sequential-reads of 8K the maxreq will take precedence. The sequential heuristic currently maxes out at 128 * 16384 = 2MB. * Introduce sysctl vfs.max_readahead instead of hardwiring the maximum read-ahead. This defaults to 2MB which is big enough for just about anything. Generally speaking this value should be larger than your stripe width. Note that currently the sequential heuristic also maxes out at 2MB so you cannot go larger than 2MB. * Also correct bugs that existed in the old cluster_read(), dramatically improving performance on striped volumes with large chunk sizes (128K+) and a large number of drives (3+). * No change here but note that HAMMER will currently issue a minimum of 64K worth of read-ahead when accessing meta-data. 
--- sys/kern/vfs_cluster.c | 157 +++++++++++++++++----- sys/sys/buf.h | 2 +- sys/vfs/gnu/ext2fs/ext2_readwrite.c | 3 +- sys/vfs/hammer/hammer_io.c | 9 +- sys/vfs/hammer/hammer_vnops.c | 4 +- sys/vfs/isofs/cd9660/cd9660_vnops.c | 3 +- sys/vfs/ufs/ffs_balloc.c | 7 +- sys/vfs/ufs/ffs_subr.c | 2 +- 8 files changed, 113 insertions(+), 74 deletions(-) diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index a3787e30ce..8eed32e347 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -58,6 +58,7 @@ #include +#define CLUSTERDEBUG #if defined(CLUSTERDEBUG) #include static int rcluster= 0; @@ -78,51 +79,72 @@ static void cluster_setram (struct buf *); static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, ""); +static int max_readahead = 2 * 1024 * 1024; +SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0, ""); extern vm_page_t bogus_page; extern int cluster_pbuf_freecnt; -/* - * Maximum number of blocks for read-ahead. - */ -#define MAXRA 32 - /* * This replaces bread. + * + * filesize - read-ahead @ blksize will not cross this boundary + * loffset - loffset for returned *bpp + * blksize - blocksize for returned *bpp and read-ahead bps + * minreq - minimum (not a hard minimum) in bytes, typically reflects + * a higher level uio resid. + * maxreq - maximum (sequential heuristic) in bytes (highest typ ~2MB) + * bpp - return buffer (*bpp) for (loffset,blksize) */ int cluster_read(struct vnode *vp, off_t filesize, off_t loffset, - int blksize, size_t resid, int seqcount, struct buf **bpp) + int blksize, size_t minreq, size_t maxreq, struct buf **bpp) { struct buf *bp, *rbp, *reqbp; off_t origoffset; off_t doffset; int error; int i; - int maxra, racluster; - int totread; + int maxra; + int maxrbuild; error = 0; - totread = (resid > INT_MAX) ? INT_MAX : (int)resid; /* - * racluster - calculate maximum cluster IO size (limited by - * backing block device). 
+ * Calculate the desired read-ahead in blksize'd blocks (maxra). + * To do this we calculate maxreq. * - * Try to limit the amount of read-ahead by a few ad-hoc parameters. - * This needs work!!! + * maxreq typically starts out as a sequential heuristic. If the + * high level uio/resid is bigger (minreq), we pop maxreq up to + * minreq. This represents the case where random I/O is being + * performed by userland issuing big read()'s. * - * NOTE! The BMAP operations may involve synchronous I/O so we - * really want several cluster IOs in progress to absorb - * the time lag. + * Then we limit maxreq to max_readahead to ensure it is a reasonable + * value. + * + * Finally we must ensure that loffset + maxreq does not cross the + * boundary (filesize) for the current blocksize. If we allowed it + * to cross we could end up with buffers past the boundary with the + * wrong block size (HAMMER large-data areas use mixed block sizes). */ - racluster = vmaxiosize(vp) / blksize; - maxra = 2 * racluster + (totread / blksize); - if (maxra > MAXRA) - maxra = MAXRA; - if (maxra > nbuf / 8) - maxra = nbuf / 8; + if (maxreq < minreq) + maxreq = minreq; + if (maxreq > max_readahead) { + maxreq = max_readahead; + if (maxreq > 16 * 1024 * 1024) + maxreq = 16 * 1024 * 1024; + } + if (maxreq < blksize) + maxreq = blksize; + if (loffset + maxreq > filesize) { + if (loffset > filesize) + maxreq = 0; + else + maxreq = filesize - loffset; + } + + maxra = (int)(maxreq / blksize); /* * Get the requested block. @@ -130,6 +152,12 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset, *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0); origoffset = loffset; + /* + * Calculate the maximum cluster size for a single I/O, used + * by cluster_rbuild(). + */ + maxrbuild = vmaxiosize(vp) / blksize; + /* * if it is in the cache, then check to see if the reads have been * sequential. 
If they have, then try some read-ahead, otherwise @@ -139,8 +167,7 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset, /* * Not sequential, do not do any read-ahead */ - seqcount -= (bp->b_bufsize + BKVASIZE - 1) / BKVASIZE; - if (seqcount <= 0 || maxra == 0) + if (maxra <= 1) return 0; /* @@ -175,6 +202,11 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset, } ++i; } + + /* + * We got everything or everything is in the cache, no + * point continuing. + */ if (i >= maxra) return 0; maxra -= i; @@ -193,38 +225,41 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset, KASSERT(firstread != NOOFFSET, ("cluster_read: no buffer offset")); - if (firstread + totread > filesize) - totread = (int)(filesize - firstread); - nblks = totread / blksize; - if (nblks) { - int burstbytes; - if (nblks > racluster) - nblks = racluster; + /* + * nblks is our cluster_rbuild request size, limited + * primarily by the device. + */ + if ((nblks = maxra) > maxrbuild) + nblks = maxrbuild; + + if (nblks > 1) { + int burstbytes; error = VOP_BMAP(vp, loffset, &doffset, &burstbytes, NULL, BUF_CMD_READ); if (error) goto single_block_read; + if (nblks > burstbytes / blksize) + nblks = burstbytes / blksize; if (doffset == NOOFFSET) goto single_block_read; - if (burstbytes < blksize * 2) + if (nblks <= 1) goto single_block_read; - if (nblks > burstbytes / blksize) - nblks = burstbytes / blksize; bp = cluster_rbuild(vp, filesize, loffset, doffset, blksize, nblks, bp); loffset += bp->b_bufsize; - maxra -= (bp->b_bufsize - blksize) / blksize; + maxra -= bp->b_bufsize / blksize; } else { single_block_read: /* - * if it isn't in the cache, then get a chunk from + * If it isn't in the cache, then get a chunk from * disk if sequential, otherwise just get the block. 
*/ cluster_setram(bp); loffset += blksize; + --maxra; } } @@ -238,13 +273,12 @@ single_block_read: if (bp) { #if defined(CLUSTERDEBUG) if (rcluster) - kprintf("S(%lld,%d,%d)\n", - bp->b_loffset, bp->b_bcount, seqcount); + kprintf("S(%012jx,%d,%d)\n", + (intmax_t)bp->b_loffset, bp->b_bcount, maxra); #endif if ((bp->b_flags & B_CLUSTER) == 0) vfs_busy_pages(vp, bp); bp->b_flags &= ~(B_ERROR|B_INVAL); - seqcount -= (bp->b_bufsize + BKVASIZE - 1) / BKVASIZE; vn_strategy(vp, &bp->b_bio1); error = 0; /* bp invalid now */ @@ -259,12 +293,10 @@ single_block_read: * will do device-readahead irrespective of what the blocks * represent. */ - while (!error && seqcount > 0 && maxra > 0 && - loffset + blksize <= filesize) { - int nblksread; - int ntoread; + while (error == 0 && maxra > 0) { int burstbytes; int tmp_error; + int nblks; rbp = getblk(vp, loffset, blksize, GETBLK_SZMATCH|GETBLK_NOWAIT, 0); @@ -287,12 +319,10 @@ single_block_read: rbp = NULL; goto no_read_ahead; } - ntoread = burstbytes / blksize; - nblksread = (totread + blksize - 1) / blksize; - if (seqcount < nblksread) - seqcount = nblksread; - if (ntoread > seqcount) - ntoread = seqcount; + if ((nblks = maxra) > maxrbuild) + nblks = maxrbuild; + if (nblks > burstbytes / blksize) + nblks = burstbytes / blksize; /* * rbp: async read @@ -301,26 +331,29 @@ single_block_read: /*rbp->b_flags |= B_AGE*/; cluster_setram(rbp); - if (burstbytes) { + if (nblks > 1) { rbp = cluster_rbuild(vp, filesize, loffset, doffset, blksize, - ntoread, rbp); + nblks, rbp); } else { rbp->b_bio2.bio_offset = doffset; } - seqcount -= (rbp->b_bufsize + BKVASIZE - 1) / BKVASIZE; + #if defined(CLUSTERDEBUG) if (rcluster) { - if (bp) - kprintf("A+(%lld,%d,%lld,%d) ra=%d\n", - rbp->b_loffset, rbp->b_bcount, - rbp->b_loffset - origoffset, - seqcount, maxra); - else - kprintf("A-(%lld,%d,%lld,%d) ra=%d\n", - rbp->b_loffset, rbp->b_bcount, - rbp->b_loffset - origoffset, - seqcount, maxra); + if (bp) { + kprintf("A+(%012jx,%d,%jd) " + "doff=%012jx 
minr=%zd ra=%d\n", + (intmax_t)loffset, rbp->b_bcount, + (intmax_t)(loffset - origoffset), + (intmax_t)doffset, minreq, maxra); + } else { + kprintf("A-(%012jx,%d,%jd) " + "doff=%012jx minr=%zd ra=%d\n", + (intmax_t)rbp->b_loffset, rbp->b_bcount, + (intmax_t)(loffset - origoffset), + (intmax_t)doffset, minreq, maxra); + } } #endif rbp->b_flags &= ~(B_ERROR|B_INVAL); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 2978c62686..159aaca24d 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -432,7 +432,7 @@ void biodone_sync (struct bio *); void cluster_append(struct bio *, struct buf *); int cluster_read (struct vnode *, off_t, off_t, int, - size_t, int, struct buf **); + size_t, size_t, struct buf **); int cluster_wbuild (struct vnode *, int, off_t, int); void cluster_write (struct buf *, off_t, int, int); int physread (struct dev_read_args *); diff --git a/sys/vfs/gnu/ext2fs/ext2_readwrite.c b/sys/vfs/gnu/ext2fs/ext2_readwrite.c index a09592ed03..d31754d80d 100644 --- a/sys/vfs/gnu/ext2fs/ext2_readwrite.c +++ b/sys/vfs/gnu/ext2fs/ext2_readwrite.c @@ -111,7 +111,8 @@ ext2_read(struct vop_read_args *ap) error = cluster_read(vp, (off_t)ip->i_size, lblktodoff(fs, lbn), size, uio->uio_resid, - (ap->a_ioflag >> 16), &bp); + (ap->a_ioflag >> 16) * BKVASIZE, + &bp); } else if (seqcount > 1) { int nextsize = BLKSIZE(fs, ip, nextlbn); error = breadn(vp, lblktodoff(fs, lbn), diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 3f9f8ba9e5..69a496dc03 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -281,9 +281,9 @@ hammer_io_notmeta(hammer_buffer_t buffer) * speaking HAMMER assumes some locality of reference and will cluster * a 64K read. * - * Note that clustering occurs at the device layer, not the logical layer. - * If the buffers do not apply to the current operation they may apply to - * some other. + * Note that the clustering which occurs here is clustering within the + * block device... 
typically meta-data and small-file data. Regular + * file clustering is different and handled in hammer_vnops.c */ int hammer_io_read(struct vnode *devvp, struct hammer_io *io, hammer_off_t limit) @@ -298,7 +298,8 @@ hammer_io_read(struct vnode *devvp, struct hammer_io *io, hammer_off_t limit) error = cluster_read(devvp, limit, io->offset, io->bytes, HAMMER_CLUSTER_SIZE, - HAMMER_CLUSTER_BUFS, &io->bp); + HAMMER_CLUSTER_SIZE, + &io->bp); } else { error = bread(devvp, io->offset, io->bytes, &io->bp); } diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 681e5cef82..7e2b805794 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -407,8 +407,8 @@ hammer_vop_read(struct vop_read_args *ap) } error = cluster_read(ap->a_vp, file_limit, base_offset, - blksize, MAXPHYS, - seqcount, &bp); + blksize, uio->uio_resid, + seqcount * BKVASIZE, &bp); } else { error = bread(ap->a_vp, base_offset, blksize, &bp); } diff --git a/sys/vfs/isofs/cd9660/cd9660_vnops.c b/sys/vfs/isofs/cd9660/cd9660_vnops.c index a1bfb5a032..38aabf0c07 100644 --- a/sys/vfs/isofs/cd9660/cd9660_vnops.c +++ b/sys/vfs/isofs/cd9660/cd9660_vnops.c @@ -272,7 +272,8 @@ cd9660_read(struct vop_read_args *ap) error = cluster_read(vp, (off_t)ip->i_size, loffset, size, uio->uio_resid, - (ap->a_ioflag >> 16), + (ap->a_ioflag >> 16) * + BKVASIZE, &bp); } else { error = bread(vp, loffset, size, &bp); diff --git a/sys/vfs/ufs/ffs_balloc.c b/sys/vfs/ufs/ffs_balloc.c index 9d73a54622..85abfbc6e0 100644 --- a/sys/vfs/ufs/ffs_balloc.c +++ b/sys/vfs/ufs/ffs_balloc.c @@ -418,9 +418,12 @@ ffs_balloc(struct vop_balloc_args *ap) error = cluster_read(vp, (off_t)ip->i_size, lblktodoff(fs, lbn), (int)fs->fs_bsize, - MAXBSIZE, seqcount, &dbp); + fs->fs_bsize, + seqcount * BKVASIZE, + &dbp); } else { - error = bread(vp, lblktodoff(fs, lbn), (int)fs->fs_bsize, &dbp); + error = bread(vp, lblktodoff(fs, lbn), + (int)fs->fs_bsize, &dbp); } if (error) goto fail; diff --git 
a/sys/vfs/ufs/ffs_subr.c b/sys/vfs/ufs/ffs_subr.c index aac23e7734..ab231ecaee 100644 --- a/sys/vfs/ufs/ffs_subr.c +++ b/sys/vfs/ufs/ffs_subr.c @@ -131,7 +131,7 @@ ffs_blkatoff_ra(struct vnode *vp, off_t uoffset, char **res, struct buf **bpp, */ error = cluster_read(vp, (off_t)ip->i_size, base_loffset, bsize, - MAXBSIZE, seqcount, &bp); + bsize, seqcount * BKVASIZE, &bp); } else if (seqcount > 1) { /* * Faked read ahead -- 2.41.0