From cf1bb2a83970df7e08f3ca2ed871657ca2185944 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 14 Jan 2010 19:40:14 -0800
Subject: [PATCH] kernel - Improve VM fault performance for sequential access

* VM fault I/O pipelining was not working properly.

* Temporarily fix pipelining by introducing PG_RAM, a read-ahead mark
  for vm_page_t, and adjust vm_fault to pass VM pages through to
  getpages calls if PG_RAM is set, even if they are fully valid.
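
* Illustration (not part of the patch itself): the sketch below is a
  self-contained userspace toy showing the read-ahead-mark technique.
  A synthetic getpages validates a cluster of pages and marks the last
  one; when the sequential scan later touches the marked page -- which
  is already fully valid -- the fault path still drops into the pager,
  so the next cluster is requested before the scan runs off the end of
  the valid data.  All names here (toy_fault, toy_getpages, CLUSTER)
  are invented for the example; only the control flow mirrors the
  kernel change.

    #include <stdio.h>

    #define NPAGES   32
    #define CLUSTER  4              /* pages filled per synthetic I/O */
    #define PG_VALID 0x01
    #define PG_RAM   0x02           /* read-ahead mark */

    static int page_flags[NPAGES];
    static int io_count;

    /* Synthetic getpages: validate a cluster, mark its last page. */
    static void
    toy_getpages(int pindex)
    {
            int i;

            ++io_count;
            for (i = pindex; i < pindex + CLUSTER && i < NPAGES; ++i)
                    page_flags[i] |= PG_VALID;
            if (pindex + CLUSTER - 1 < NPAGES)
                    page_flags[pindex + CLUSTER - 1] |= PG_RAM;
    }

    /*
     * Synthetic fault: a fully valid page short-circuits the pager
     * UNLESS the read-ahead mark is set, in which case it is passed
     * through anyway so the pipeline keeps moving.
     */
    static void
    toy_fault(int pindex)
    {
            if ((page_flags[pindex] & PG_VALID) &&
                (page_flags[pindex] & PG_RAM) == 0)
                    return;         /* page found, no I/O needed */
            page_flags[pindex] &= ~PG_RAM;
            toy_getpages(pindex);
    }

    int
    main(void)
    {
            int i;

            for (i = 0; i < NPAGES; ++i)        /* sequential scan */
                    toy_fault(i);
            printf("%d pages scanned, %d synthetic I/Os\n",
                NPAGES, io_count);
            return 0;
    }

  The toy is synchronous, so it only demonstrates the control-flow
  half: a fully valid page still enters the pager when marked.  In the
  kernel the getpages path then issues asynchronous read-ahead, which
  is what lets the disk I/O overlap the scan.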

* Remove code in vnode_pager_generic_getpages() which shortcutted the
  operation when the requested page was fully valid.  This prevented
  read-aheads from being issued.

* A more permanent solution is in the works (basically getting rid of
  the whole VM read-ahead/read-behind array entirely, just passing a
  single page through to vnode_pager_generic_getpages(), and letting
  the filesystem handle the read-ahead in a more efficient fashion).
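
* Measurement note (hypothetical, not part of the patch): the workload
  that benefits is an I/O bound sequential scan of an mmap()ed file.
  The throwaway POSIX program below times such a scan; the 4096-byte
  page size is an assumption.  Run it on a file considerably larger
  than RAM to stay I/O bound.

    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    int
    main(int argc, char **argv)
    {
            struct stat st;
            struct timespec t0, t1;
            volatile unsigned char sink = 0;
            unsigned char *base;
            off_t off;
            double secs;
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s file\n", argv[0]);
                    return 1;
            }
            if ((fd = open(argv[1], O_RDONLY)) < 0 || fstat(fd, &st) < 0) {
                    perror(argv[1]);
                    return 1;
            }
            base = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
            if (base == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            clock_gettime(CLOCK_MONOTONIC, &t0);
            /* touch one byte per page, in order */
            for (off = 0; off < st.st_size; off += 4096)
                    sink += base[off];
            clock_gettime(CLOCK_MONOTONIC, &t1);
            secs = (t1.tv_sec - t0.tv_sec) +
                   (t1.tv_nsec - t0.tv_nsec) / 1e9;
            printf("%lld bytes in %.3f sec (%.1f MB/s)\n",
                (long long)st.st_size, secs,
                st.st_size / secs / 1048576);
            munmap(base, st.st_size);
            close(fd);
            return 0;
    }

  Comparing the MB/s figure before and after the change (and watching
  the "R" markers printed when the vm.debug_cluster sysctl added below
  is enabled) shows whether the read-ahead pipeline is staying full.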

Reported-by: "Mikhail T."
---
 sys/kern/vfs_cluster.c | 19 ++++++++++++++-----
 sys/sys/vnode.h        |  1 +
 sys/vm/vm_fault.c      | 22 +++++++++++++++++-----
 sys/vm/vm_page.h       |  1 +
 sys/vm/vnode_pager.c   | 27 ++++++++++-----------------
 5 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 8923bb1d8a..6e26d8fc4d 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -74,7 +74,7 @@ static struct buf *
 		    off_t doffset, int blksize, int run,
 		    struct buf *fbp);
 static void cluster_callback (struct bio *);
-
+static void cluster_setram (struct buf *);
 
 static int write_behind = 1;
 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");
@@ -149,7 +149,7 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
 				break;
 			if (((i % racluster) == (racluster - 1)) ||
 			    (i == (maxra - 1))) {
-				tbp->b_flags |= B_RAM;
+				cluster_setram(tbp);
 			}
 			BUF_UNLOCK(tbp);
 		}
@@ -200,7 +200,7 @@ single_block_read:
 		 * if it isn't in the cache, then get a chunk from
 		 * disk if sequential, otherwise just get the block.
 		 */
-		bp->b_flags |= B_RAM;
+		cluster_setram(bp);
 		loffset += blksize;
 	}
 }
@@ -276,7 +276,8 @@ single_block_read:
 	 * rbp: async read
 	 */
 	rbp->b_cmd = BUF_CMD_READ;
-	rbp->b_flags |= B_RAM/* | B_AGE*/;
+	/*rbp->b_flags |= B_AGE*/;
+	cluster_setram(rbp);
 
 	if (burstbytes) {
 		rbp = cluster_rbuild(vp, filesize, loffset,
@@ -440,7 +441,7 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
 		 * Set a read-ahead mark as appropriate
 		 */
 		if (i == 1 || i == (run - 1))
-			tbp->b_flags |= B_RAM;
+			cluster_setram(tbp);
 
 		/*
 		 * Depress the priority of buffers not explicitly
@@ -1014,3 +1015,11 @@ cluster_append(struct bio *bio, struct buf *tbp)
 	}
 }
 
+static
+void
+cluster_setram (struct buf *bp)
+{
+	bp->b_flags |= B_RAM;
+	if (bp->b_xio.xio_npages)
+		vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
+}
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 2a5e36410a..cdb0fa67b6 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -586,6 +586,7 @@ void	vx_put (struct vnode *vp);
 int	vget (struct vnode *vp, int lockflag);
 void	vput (struct vnode *vp);
 void	vhold (struct vnode *);
+void	vhold_interlocked (struct vnode *);
 void	vdrop (struct vnode *);
 void	vref (struct vnode *vp);
 void	vrele (struct vnode *vp);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 0fae06422f..2c9fac87be 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -126,6 +126,8 @@ struct faultstate {
 
 static int burst_fault = 1;
 SYSCTL_INT(_vm, OID_AUTO, burst_fault, CTLFLAG_RW, &burst_fault, 0, "");
+static int debug_cluster = 0;
+SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
 
 static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t);
 static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int);
@@ -971,7 +973,8 @@ vm_fault_object(struct faultstate *fs,
 			/*
 			 * Mark page busy for other processes, and the
 			 * pagedaemon.  If it still isn't completely valid
-			 * (readable), jump to readrest, else we found the
+			 * (readable), or if a read-ahead-mark is set on
+			 * the VM page, jump to readrest, else we found the
 			 * page and can return.
 			 *
 			 * We can release the spl once we have marked the
@@ -980,9 +983,17 @@
 			vm_page_busy(fs->m);
 			crit_exit();
 
-			if (((fs->m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) &&
-			    fs->m->object != &kernel_object) {
-				goto readrest;
+			if (fs->m->object != &kernel_object) {
+				if ((fs->m->valid & VM_PAGE_BITS_ALL) !=
+				    VM_PAGE_BITS_ALL) {
+					goto readrest;
+				}
+				if (fs->m->flags & PG_RAM) {
+					if (debug_cluster)
+						kprintf("R");
+					vm_page_flag_clear(fs->m, PG_RAM);
+					goto readrest;
+				}
 			}
 			break; /* break to PAGE HAS BEEN FOUND */
 		}
@@ -1040,7 +1051,8 @@ readrest:
 	/*
 	 * We have found a valid page or we have allocated a new page.
 	 * The page thus may not be valid or may not be entirely
-	 * valid.
+	 * valid.  Even if entirely valid we may have hit a read-ahead
+	 * mark and desire to keep the pipeline going.
 	 *
 	 * Attempt to fault-in the page if there is a chance that the
 	 * pager has it, and potentially fault in additional pages
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 9c312edbf0..658c9bf7a2 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -332,6 +332,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define PG_NOSYNC	0x0400	/* do not collect for syncer */
 #define PG_UNMANAGED	0x0800	/* No PV management for page */
 #define PG_MARKER	0x1000	/* special queue marker page */
+#define PG_RAM		0x2000	/* read ahead mark */
 
 /*
  * Misc constants.
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index a063460385..fdf9bf01b1 100644
--- a/sys/vm/vnode_pager.c
+++ b/sys/vm/vnode_pager.c
@@ -451,23 +451,15 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
 	count = bytecount / PAGE_SIZE;
 
 	/*
-	 * If we have a completely valid page available to us, we can
-	 * clean up and return.  Otherwise we have to re-read the
-	 * media.
+	 * We could check m[reqpage]->valid here and shortcut the operation,
+	 * but doing so breaks read-ahead.  Instead assume that the VM
+	 * system has already done at least the check, don't worry about
+	 * any races, and issue the VOP_READ to allow read-ahead to function.
 	 *
-	 * Note that this does not work with NFS, so NFS has its own
-	 * getpages routine.  The problem is that NFS can have partially
-	 * valid pages associated with the buffer cache due to the piecemeal
-	 * write support.  If we were to fall through and re-read the media
-	 * as we do here, dirty data could be lost.
+	 * This keeps the pipeline full for I/O bound sequentially scanned
+	 * mmap()'s
 	 */
-	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
-		for (i = 0; i < count; i++) {
-			if (i != reqpage)
-				vnode_pager_freepage(m[i]);
-		}
-		return VM_PAGER_OK;
-	}
+	/* don't shortcut */
 
 	/*
 	 * Discard pages past the file EOF.  If the requested page is past
@@ -520,10 +512,11 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
 	}
 
 	/*
-	 * Issue the I/O without any read-ahead
+	 * Issue the I/O with some read-ahead if bytecount > PAGE_SIZE
 	 */
 	ioflags = IO_VMIO;
-	/*ioflags |= IO_SEQMAX << IO_SEQSHIFT;*/
+/*	if (bytecount > PAGE_SIZE)*/
+		ioflags |= IO_SEQMAX << IO_SEQSHIFT;
 
 	aiov.iov_base = (caddr_t) 0;
 	aiov.iov_len = bytecount;
-- 
2.41.0