From cf1bb2a83970df7e08f3ca2ed871657ca2185944 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 14 Jan 2010 19:40:14 -0800
Subject: [PATCH] kernel - Improve VM fault performance for sequential access

* VM fault I/O pipelining was not working properly.

* Temporarily fix pipelining by introducing PG_RAM, a read-ahead mark
  for vm_page_t, and adjust vm_fault to pass VM pages through to
  getpages calls if PG_RAM is set, even if they are fully valid.
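
* Illustration (not part of the patch itself): the sketch below is a
  self-contained userspace toy showing the read-ahead-mark technique.
  A synthetic getpages validates a cluster of pages and marks the last
  one; when the sequential scan later touches the marked page -- which
  is already fully valid -- the fault path still drops into the pager,
  so the next cluster is requested before the scan runs off the end of
  the valid data.  All names here (toy_fault, toy_getpages, CLUSTER)
  are invented for the example; only the control flow mirrors the
  kernel change.

    #include <stdio.h>

    #define NPAGES   32
    #define CLUSTER  4              /* pages filled per synthetic I/O */
    #define PG_VALID 0x01
    #define PG_RAM   0x02           /* read-ahead mark */

    static int page_flags[NPAGES];
    static int io_count;

    /* Synthetic getpages: validate a cluster, mark its last page. */
    static void
    toy_getpages(int pindex)
    {
            int i;

            ++io_count;
            for (i = pindex; i < pindex + CLUSTER && i < NPAGES; ++i)
                    page_flags[i] |= PG_VALID;
            if (pindex + CLUSTER - 1 < NPAGES)
                    page_flags[pindex + CLUSTER - 1] |= PG_RAM;
    }

    /*
     * Synthetic fault: a fully valid page short-circuits the pager
     * UNLESS the read-ahead mark is set, in which case it is passed
     * through anyway so the pipeline keeps moving.
     */
    static void
    toy_fault(int pindex)
    {
            if ((page_flags[pindex] & PG_VALID) &&
                (page_flags[pindex] & PG_RAM) == 0)
                    return;         /* page found, no I/O needed */
            page_flags[pindex] &= ~PG_RAM;
            toy_getpages(pindex);
    }

    int
    main(void)
    {
            int i;

            for (i = 0; i < NPAGES; ++i)        /* sequential scan */
                    toy_fault(i);
            printf("%d pages scanned, %d synthetic I/Os\n",
                NPAGES, io_count);
            return 0;
    }

  The toy is synchronous, so it only demonstrates the control-flow
  half: a fully valid page still enters the pager when marked.  In the
  kernel the getpages path then issues asynchronous read-ahead, which
  is what lets the disk I/O overlap the scan.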

* Remove code in vnode_pager_generic_getpages() which shortcutted the
  operation when the requested page was fully valid.  This prevented
  read-aheads from being issued.

* A more permanent solution is in the works (basically getting rid of
  the whole VM read-ahead/read-behind array entirely, just passing a
  single page through to vnode_pager_generic_getpages(), and letting
  the filesystem handle the read-ahead in a more efficient fashion).
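
* Measurement note (hypothetical, not part of the patch): the workload
  that benefits is an I/O bound sequential scan of an mmap()ed file.
  The throwaway POSIX program below times such a scan; the 4096-byte
  page size is an assumption.  Run it on a file considerably larger
  than RAM to stay I/O bound.

    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    int
    main(int argc, char **argv)
    {
            struct stat st;
            struct timespec t0, t1;
            volatile unsigned char sink = 0;
            unsigned char *base;
            off_t off;
            double secs;
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s file\n", argv[0]);
                    return 1;
            }
            if ((fd = open(argv[1], O_RDONLY)) < 0 || fstat(fd, &st) < 0) {
                    perror(argv[1]);
                    return 1;
            }
            base = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
            if (base == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            clock_gettime(CLOCK_MONOTONIC, &t0);
            /* touch one byte per page, in order */
            for (off = 0; off < st.st_size; off += 4096)
                    sink += base[off];
            clock_gettime(CLOCK_MONOTONIC, &t1);
            secs = (t1.tv_sec - t0.tv_sec) +
                   (t1.tv_nsec - t0.tv_nsec) / 1e9;
            printf("%lld bytes in %.3f sec (%.1f MB/s)\n",
                (long long)st.st_size, secs,
                st.st_size / secs / 1048576);
            munmap(base, st.st_size);
            close(fd);
            return 0;
    }

  Comparing the MB/s figure before and after the change (and watching
  the "R" markers printed when the vm.debug_cluster sysctl added below
  is enabled) shows whether the read-ahead pipeline is staying full.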

Reported-by: "Mikhail T."
---
 sys/kern/vfs_cluster.c | 19 ++++++++++++++-----
 sys/sys/vnode.h        |  1 +
 sys/vm/vm_fault.c      | 22 +++++++++++++++++-----
 sys/vm/vm_page.h       |  1 +
 sys/vm/vnode_pager.c   | 27 ++++++++++-----------------
 5 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 8923bb1d8a..6e26d8fc4d 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -74,7 +74,7 @@ static struct buf *
 		    off_t doffset, int blksize, int run,
 		    struct buf *fbp);
 static void cluster_callback (struct bio *);
-
+static void cluster_setram (struct buf *);
 
 static int write_behind = 1;
 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");
@@ -149,7 +149,7 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
 				break;
 			if (((i % racluster) == (racluster - 1)) ||
 			    (i == (maxra - 1))) {
-				tbp->b_flags |= B_RAM;
+				cluster_setram(tbp);
 			}
 			BUF_UNLOCK(tbp);
 		}
@@ -200,7 +200,7 @@ single_block_read:
 		 * if it isn't in the cache, then get a chunk from
 		 * disk if sequential, otherwise just get the block.
 		 */
-		bp->b_flags |= B_RAM;
+		cluster_setram(bp);
 		loffset += blksize;
 	}
 }
@@ -276,7 +276,8 @@ single_block_read:
 	 * rbp: async read
 	 */
 	rbp->b_cmd = BUF_CMD_READ;
-	rbp->b_flags |= B_RAM/* | B_AGE*/;
+	/*rbp->b_flags |= B_AGE*/;
+	cluster_setram(rbp);
 
 	if (burstbytes) {
 		rbp = cluster_rbuild(vp, filesize, loffset,
@@ -440,7 +441,7 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
 		 * Set a read-ahead mark as appropriate
 		 */
 		if (i == 1 || i == (run - 1))
-			tbp->b_flags |= B_RAM;
+			cluster_setram(tbp);
 
 		/*
 		 * Depress the priority of buffers not explicitly
@@ -1014,3 +1015,11 @@ cluster_append(struct bio *bio, struct buf *tbp)
 	}
 }
 
+static
+void
+cluster_setram (struct buf *bp)
+{
+	bp->b_flags |= B_RAM;
+	if (bp->b_xio.xio_npages)
+		vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
+}
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 2a5e36410a..cdb0fa67b6 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -586,6 +586,7 @@ void	vx_put (struct vnode *vp);
 int	vget (struct vnode *vp, int lockflag);
 void	vput (struct vnode *vp);
 void	vhold (struct vnode *);
+void	vhold_interlocked (struct vnode *);
 void	vdrop (struct vnode *);
 void	vref (struct vnode *vp);
 void	vrele (struct vnode *vp);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 0fae06422f..2c9fac87be 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -126,6 +126,8 @@ struct faultstate {
 
 static int burst_fault = 1;
 SYSCTL_INT(_vm, OID_AUTO, burst_fault, CTLFLAG_RW, &burst_fault, 0, "");
+static int debug_cluster = 0;
+SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
 
 static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t);
 static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int);
@@ -971,7 +973,8 @@ vm_fault_object(struct faultstate *fs,
 			/*
 			 * Mark page busy for other processes, and the
 			 * pagedaemon.  If it still isn't completely valid
-			 * (readable), jump to readrest, else we found the
+			 * (readable), or if a read-ahead-mark is set on
+			 * the VM page, jump to readrest, else we found the
 			 * page and can return.
 			 *
 			 * We can release the spl once we have marked the
@@ -980,9 +983,17 @@
 			vm_page_busy(fs->m);
 			crit_exit();
 
-			if (((fs->m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) &&
-			    fs->m->object != &kernel_object) {
-				goto readrest;
+			if (fs->m->object != &kernel_object) {
+				if ((fs->m->valid & VM_PAGE_BITS_ALL) !=
+				    VM_PAGE_BITS_ALL) {
+					goto readrest;
+				}
+				if (fs->m->flags & PG_RAM) {
+					if (debug_cluster)
+						kprintf("R");
+					vm_page_flag_clear(fs->m, PG_RAM);
+					goto readrest;
+				}
 			}
 			break; /* break to PAGE HAS BEEN FOUND */
 		}
@@ -1040,7 +1051,8 @@ readrest:
 	/*
 	 * We have found a valid page or we have allocated a new page.
 	 * The page thus may not be valid or may not be entirely
-	 * valid.
+	 * valid.  Even if entirely valid we may have hit a read-ahead
+	 * mark and desire to keep the pipeline going.
 	 *
 	 * Attempt to fault-in the page if there is a chance that the
 	 * pager has it, and potentially fault in additional pages
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 9c312edbf0..658c9bf7a2 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -332,6 +332,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define PG_NOSYNC	0x0400	/* do not collect for syncer */
 #define PG_UNMANAGED	0x0800	/* No PV management for page */
 #define PG_MARKER	0x1000	/* special queue marker page */
+#define PG_RAM		0x2000	/* read ahead mark */
 
 /*
  * Misc constants.
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index a063460385..fdf9bf01b1 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -451,23 +451,15 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
 	count = bytecount / PAGE_SIZE;
 
 	/*
-	 * If we have a completely valid page available to us, we can
-	 * clean up and return.  Otherwise we have to re-read the
-	 * media.
+	 * We could check m[reqpage]->valid here and shortcut the operation,
+	 * but doing so breaks read-ahead.  Instead assume that the VM
+	 * system has already done at least the check, don't worry about
+	 * any races, and issue the VOP_READ to allow read-ahead to function.
 	 *
-	 * Note that this does not work with NFS, so NFS has its own
-	 * getpages routine.  The problem is that NFS can have partially
-	 * valid pages associated with the buffer cache due to the piecemeal
-	 * write support.  If we were to fall through and re-read the media
-	 * as we do here, dirty data could be lost.
+	 * This keeps the pipeline full for I/O bound sequentially scanned
+	 * mmap()'s
 	 */
-	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
-		for (i = 0; i < count; i++) {
-			if (i != reqpage)
-				vnode_pager_freepage(m[i]);
-		}
-		return VM_PAGER_OK;
-	}
+	/* don't shortcut */
 
 	/*
 	 * Discard pages past the file EOF.  If the requested page is past
@@ -520,10 +512,11 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
 	}
 
 	/*
-	 * Issue the I/O without any read-ahead
+	 * Issue the I/O with some read-ahead if bytecount > PAGE_SIZE
 	 */
 	ioflags = IO_VMIO;
-	/*ioflags |= IO_SEQMAX << IO_SEQSHIFT;*/
+/*	if (bytecount > PAGE_SIZE)*/
+		ioflags |= IO_SEQMAX << IO_SEQSHIFT;
 
 	aiov.iov_base = (caddr_t) 0;
 	aiov.iov_len = bytecount;
-- 
2.41.0