kernel - Improve VM fault performance for sequential access
author Matthew Dillon <dillon@apollo.backplane.com>
Fri, 15 Jan 2010 03:40:14 +0000 (19:40 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Fri, 15 Jan 2010 03:40:14 +0000 (19:40 -0800)
* VM fault I/O pipelining was not working properly.

* Temporarily fix pipelining by introducing PG_RAM, a read-ahead mark
  for vm_page_t, and adjusting vm_fault to pass VM pages through to
  getpages calls when PG_RAM is set, even if they are fully valid.
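
  A condensed sketch of the new check (taken from the sys/vm/vm_fault.c
  hunk below, with the busy/spl handling and the debug kprintf omitted):

	if (fs->m->object != &kernel_object) {
		if ((fs->m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)
			goto readrest;
		if (fs->m->flags & PG_RAM) {
			/*
			 * One-shot mark: clear it, then take one more
			 * trip through the pager so the next read-ahead
			 * can be issued.
			 */
			vm_page_flag_clear(fs->m, PG_RAM);
			goto readrest;
		}
	}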

* Remove code in vnode_pager_generic_getpages() which short-circuited
  the operation when the requested page was fully valid; the shortcut
  prevented read-aheads from being issued.
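
  For reference, the removed shortcut (condensed from the
  sys/vm/vnode_pager.c hunk below) returned before any VOP_READ could
  be issued:

	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
		for (i = 0; i < count; i++) {
			if (i != reqpage)
				vnode_pager_freepage(m[i]);
		}
		return VM_PAGER_OK;	/* no VOP_READ, so no read-ahead */
	}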

* A more permanent solution is in the works: get rid of the VM
  read-ahead/read-behind array entirely, pass just a single page
  through to vnode_pager_generic_getpages(), and let the filesystem
  handle the read-ahead in a more efficient fashion.
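
  A rough sketch of where that is headed.  This is hypothetical and not
  part of this commit; the function name and the exact uio plumbing are
  assumptions modeled on the existing vnode_pager_generic_getpages():

	/*
	 * Hypothetical single-page getpages: hand one page to the
	 * filesystem and let VOP_READ's sequential heuristic decide
	 * how much read-ahead to perform.
	 */
	static int
	vnode_pager_getpage_sketch(struct vnode *vp, vm_page_t m)
	{
		struct iovec aiov;
		struct uio auio;
		int ioflags = IO_VMIO | (IO_SEQMAX << IO_SEQSHIFT);
		int error;

		aiov.iov_base = NULL;	/* UIO_NOCOPY reads into the VM page */
		aiov.iov_len = PAGE_SIZE;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = IDX_TO_OFF(m->pindex);
		auio.uio_segflg = UIO_NOCOPY;
		auio.uio_rw = UIO_READ;
		auio.uio_resid = PAGE_SIZE;
		auio.uio_td = curthread;

		error = VOP_READ(vp, &auio, ioflags, proc0.p_ucred);
		return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
	}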

Reported-by: "Mikhail T." <mi+thun@aldan.algebra.com>
sys/kern/vfs_cluster.c
sys/sys/vnode.h
sys/vm/vm_fault.c
sys/vm/vm_page.h
sys/vm/vnode_pager.c

diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 8923bb1..6e26d8f 100644
@@ -74,7 +74,7 @@ static struct buf *
                            off_t doffset, int blksize, int run, 
                            struct buf *fbp);
 static void cluster_callback (struct bio *);
-
+static void cluster_setram (struct buf *);
 
 static int write_behind = 1;
 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");
@@ -149,7 +149,7 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
                                        break;
                                if (((i % racluster) == (racluster - 1)) ||
                                    (i == (maxra - 1))) {
-                                       tbp->b_flags |= B_RAM;
+                                       cluster_setram(tbp);
                                }
                                BUF_UNLOCK(tbp);
                        }
@@ -200,7 +200,7 @@ single_block_read:
                         * if it isn't in the cache, then get a chunk from
                         * disk if sequential, otherwise just get the block.
                         */
-                       bp->b_flags |= B_RAM;
+                       cluster_setram(bp);
                        loffset += blksize;
                }
        }
@@ -276,7 +276,8 @@ single_block_read:
                 * rbp: async read
                 */
                rbp->b_cmd = BUF_CMD_READ;
-               rbp->b_flags |= B_RAM/* | B_AGE*/;
+               /*rbp->b_flags |= B_AGE*/;
+               cluster_setram(rbp);
 
                if (burstbytes) {
                        rbp = cluster_rbuild(vp, filesize, loffset,
@@ -440,7 +441,7 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
                         * Set a read-ahead mark as appropriate
                         */
                        if (i == 1 || i == (run - 1))
-                               tbp->b_flags |= B_RAM;
+                               cluster_setram(tbp);
 
                        /*
                         * Depress the priority of buffers not explicitly
@@ -1014,3 +1015,11 @@ cluster_append(struct bio *bio, struct buf *tbp)
        }
 }
 
+static
+void
+cluster_setram (struct buf *bp)
+{
+       bp->b_flags |= B_RAM;
+       if (bp->b_xio.xio_npages)
+               vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
+}
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 2a5e364..cdb0fa6 100644
@@ -586,6 +586,7 @@ void        vx_put (struct vnode *vp);
 int    vget (struct vnode *vp, int lockflag);
 void   vput (struct vnode *vp);
 void   vhold (struct vnode *);
+void   vhold_interlocked (struct vnode *);
 void   vdrop (struct vnode *);
 void   vref (struct vnode *vp);
 void   vrele (struct vnode *vp);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 0fae064..2c9fac8 100644
@@ -126,6 +126,8 @@ struct faultstate {
 
 static int burst_fault = 1;
 SYSCTL_INT(_vm, OID_AUTO, burst_fault, CTLFLAG_RW, &burst_fault, 0, "");
+static int debug_cluster = 0;
+SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
 
 static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t);
 static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int);
@@ -971,7 +973,8 @@ vm_fault_object(struct faultstate *fs,
                        /*
                         * Mark page busy for other processes, and the 
                         * pagedaemon.  If it still isn't completely valid
-                        * (readable), jump to readrest, else we found the
+                        * (readable), or if a read-ahead-mark is set on
+                        * the VM page, jump to readrest, else we found the
                         * page and can return.
                         *
                         * We can release the spl once we have marked the
@@ -980,9 +983,17 @@ vm_fault_object(struct faultstate *fs,
                        vm_page_busy(fs->m);
                        crit_exit();
 
-                       if (((fs->m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) &&
-                           fs->m->object != &kernel_object) {
-                               goto readrest;
+                       if (fs->m->object != &kernel_object) {
+                               if ((fs->m->valid & VM_PAGE_BITS_ALL) !=
+                                   VM_PAGE_BITS_ALL) {
+                                       goto readrest;
+                               }
+                               if (fs->m->flags & PG_RAM) {
+                                       if (debug_cluster)
+                                               kprintf("R");
+                                       vm_page_flag_clear(fs->m, PG_RAM);
+                                       goto readrest;
+                               }
                        }
                        break; /* break to PAGE HAS BEEN FOUND */
                }
@@ -1040,7 +1051,8 @@ readrest:
                /*
                 * We have found a valid page or we have allocated a new page.
                 * The page thus may not be valid or may not be entirely 
-                * valid.
+                * valid.  Even if entirely valid we may have hit a read-ahead
+                * mark and desire to keep the pipeline going.
                 *
                 * Attempt to fault-in the page if there is a chance that the
                 * pager has it, and potentially fault in additional pages
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 9c312ed..658c9bf 100644
@@ -332,6 +332,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define PG_NOSYNC      0x0400          /* do not collect for syncer */
 #define PG_UNMANAGED   0x0800          /* No PV management for page */
 #define PG_MARKER      0x1000          /* special queue marker page */
+#define PG_RAM         0x2000          /* read ahead mark */
 
 /*
  * Misc constants.
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index a063460..fdf9bf0 100644
@@ -451,23 +451,15 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
        count = bytecount / PAGE_SIZE;
 
        /*
-        * If we have a completely valid page available to us, we can
-        * clean up and return.  Otherwise we have to re-read the
-        * media.
+        * We could check m[reqpage]->valid here and shortcut the operation,
+        * but doing so breaks read-ahead.  Instead assume that the VM
+        * system has already done at least the check, don't worry about
+        * any races, and issue the VOP_READ to allow read-ahead to function.
         *
-        * Note that this does not work with NFS, so NFS has its own
-        * getpages routine.  The problem is that NFS can have partially
-        * valid pages associated with the buffer cache due to the piecemeal
-        * write support.  If we were to fall through and re-read the media
-        * as we do here, dirty data could be lost.
+        * This keeps the pipeline full for I/O bound sequentially scanned
+        * mmap()'s
         */
-       if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
-               for (i = 0; i < count; i++) {
-                       if (i != reqpage)
-                               vnode_pager_freepage(m[i]);
-               }
-               return VM_PAGER_OK;
-       }
+       /* don't shortcut */
 
        /*
         * Discard pages past the file EOF.  If the requested page is past
@@ -520,10 +512,11 @@ vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
        }
 
        /*
-        * Issue the I/O without any read-ahead
+        * Issue the I/O with some read-ahead if bytecount > PAGE_SIZE
         */
        ioflags = IO_VMIO;
-       /*ioflags |= IO_SEQMAX << IO_SEQSHIFT;*/
+/*     if (bytecount > PAGE_SIZE)*/
+               ioflags |= IO_SEQMAX << IO_SEQSHIFT;
 
        aiov.iov_base = (caddr_t) 0;
        aiov.iov_len = bytecount;