kernel - SWAP CACHE part 7/many - Add vm_swapcache.c core (write side)
author     Matthew Dillon <dillon@apollo.backplane.com>
Thu, 4 Feb 2010 17:05:57 +0000 (09:05 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
Thu, 4 Feb 2010 17:05:57 +0000 (09:05 -0800)
* Add vm_swapcache.c, which will be responsible for assigning swap to
  clean vnode-backed VM pages and writing the data out.
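
  A minimal view of the write side, condensed from vm_swapcached_flush()
  in the new file below: soft-busy the page, revoke write access so any
  renewed dirtying is caught, and hand it to the swap pager:

        vm_page_io_start(m);                    /* soft-busy the page */
        vm_page_protect(m, VM_PROT_READ);       /* catch renewed dirtying */
        vm_object_pip_add(m->object, 1);
        swap_pager_putpages(m->object, &m, 1, FALSE, &rtvals);
        if (rtvals != VM_PAGER_PEND) {          /* completed synchronously */
                vm_object_pip_wakeup(m->object);
                vm_page_io_finish(m);
        }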

  Implement a very simple inactive queue scanner and swap-writer for
  testing.
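
  The scanner threads a dummy PG_MARKER page into PQ_INACTIVE so the
  scan position survives the blocking writes.  The skeleton, condensed
  from vm_swapcached() below, with the eligibility tests elided into a
  hypothetical predicate:

        m = &marker;
        while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) {
                if (!page_is_clean_idle_vnode_page(m))  /* elided tests */
                        continue;
                /* move the marker past m, then push m to swap */
                TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq);
                TAILQ_INSERT_AFTER(INACTIVE_LIST, m, &marker, pageq);
                vm_swapcached_flush(m);
                m = &marker;
        }

  The daemon idles until vm.swapcache.data_enable or
  vm.swapcache.meta_enable is set.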

* Track swap space use, split into the portion used for anonymous
  data and the portion used for clean vnode-backed data.

* Add PG_SWAPPED tracking for newly allocated VM pages via
  swap_pager_page_inserted().
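
  The hook, called from vm_page_insert(), is cheap for objects with no
  swap assigned; it only consults the swap meta-data when the object's
  swblock_count is non-zero:

        if (m->object->swblock_count) {
                crit_enter();
                if (swp_pager_meta_ctl(m->object, m->pindex, 0) !=
                    SWAPBLK_NONE)
                        vm_page_flag_set(m, PG_SWAPPED);
                crit_exit();
        }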

* Conditionalize the swap code's dirtying/undirtying of VM pages.  We
  don't want to touch the dirty state when working the swap cache,
  since the swap cache isn't the definitive backing store for the
  VM page.
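
  The resulting pattern in swap_pager_putpages() and the async I/O
  completion path is to touch the dirty bits only when swap is the
  page's definitive backing store:

        if (m->object->type == OBJT_SWAP)
                vm_page_undirty(m);     /* swap holds the valid copy */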

sys/conf/files
sys/vm/swap_pager.c
sys/vm/swap_pager.h
sys/vm/vm_page.c
sys/vm/vm_swapcache.c [new file with mode: 0644]

diff --git a/sys/conf/files b/sys/conf/files
index c2efd66..430873f 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1441,6 +1441,7 @@ vm/vm_mmap.c                      standard
 vm/vm_object.c                 standard
 vm/vm_page.c                   standard
 vm/vm_pageout.c                        standard
+vm/vm_swapcache.c              standard
 vm/vm_pager.c                  standard
 vm/vm_swap.c                   standard
 vm/vm_unix.c                   standard
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 913a93d..f79aa0f 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -155,6 +155,9 @@ struct swfreeinfo {
 extern int vm_swap_size;       /* number of free swap blocks, in pages */
 
 int swap_pager_full;           /* swap space exhaustion (task killing) */
+int vm_swap_cache_use;
+int vm_swap_anon_use;
+
 static int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/
 static int nsw_rcount;         /* free read buffers                    */
 static int nsw_wcount_sync;    /* limit write buffers / synchronous    */
@@ -173,6 +176,11 @@ SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
 SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
         CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
 
+SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
+        CTLFLAG_RD, &vm_swap_cache_use, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
+        CTLFLAG_RD, &vm_swap_anon_use, 0, "");
+
 vm_zone_t              swap_zone;
 
 /*
@@ -245,8 +253,8 @@ static void swp_pager_async_iodone (struct bio *bio);
  * Swap bitmap functions
  */
 
-static __inline void   swp_pager_freeswapspace (daddr_t blk, int npages);
-static __inline daddr_t        swp_pager_getswapspace (int npages);
+static __inline void   swp_pager_freeswapspace (vm_object_t object, daddr_t blk, int npages);
+static __inline daddr_t        swp_pager_getswapspace (vm_object_t object, int npages);
 
 /*
  * Metadata functions
@@ -490,9 +498,8 @@ swap_pager_dealloc(vm_object_t object)
  *     This routine may not block
  *     This routine must be called at splvm().
  */
-
 static __inline daddr_t
-swp_pager_getswapspace(int npages)
+swp_pager_getswapspace(vm_object_t object, int npages)
 {
        daddr_t blk;
 
@@ -504,6 +511,10 @@ swp_pager_getswapspace(int npages)
                }
        } else {
                vm_swap_size -= npages;
+               if (object->type == OBJT_SWAP)
+                       vm_swap_anon_use += npages;
+               else
+                       vm_swap_cache_use += npages;
                swp_sizecheck();
        }
        return(blk);
@@ -525,10 +536,14 @@ swp_pager_getswapspace(int npages)
  */
 
 static __inline void
-swp_pager_freeswapspace(daddr_t blk, int npages)
+swp_pager_freeswapspace(vm_object_t object, daddr_t blk, int npages)
 {
        blist_free(swapblist, blk, npages);
        vm_swap_size += npages;
+       if (object->type == OBJT_SWAP)
+               vm_swap_anon_use -= npages;
+       else
+               vm_swap_cache_use -= npages;
        swp_sizecheck();
 }
 
@@ -563,6 +578,22 @@ swap_pager_freespace_all(vm_object_t object)
        crit_exit();
 }
 
+/*
+ * Called from vm_page_insert() when a VM page is inserted into a
+ * VM object.  Checks whether swap has been assigned to
+ * the page and sets PG_SWAPPED as necessary.
+ */
+void
+swap_pager_page_inserted(vm_page_t m)
+{
+       if (m->object->swblock_count) {
+               crit_enter();
+               if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
+                       vm_page_flag_set(m, PG_SWAPPED);
+               crit_exit();
+       }
+}
+
 /*
  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  *
@@ -582,7 +613,9 @@ swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
        while (size) {
                if (n == 0) {
                        n = BLIST_MAX_ALLOC;
-                       while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
+                       while ((blk = swp_pager_getswapspace(object, n)) ==
+                              SWAPBLK_NONE)
+                       {
                                n >>= 1;
                                if (n == 0) {
                                        swp_pager_meta_free(object, beg,
@@ -894,7 +927,7 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                 */
                blk = swp_pager_meta_ctl(object, start, 0);
                if ((blk == SWAPBLK_NONE) && bp->b_cmd != BUF_CMD_READ) {
-                       blk = swp_pager_getswapspace(1);
+                       blk = swp_pager_getswapspace(object, 1);
                        if (blk == SWAPBLK_NONE) {
                                bp->b_error = ENOMEM;
                                bp->b_flags |= B_ERROR;
@@ -1418,7 +1451,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                 * fragment swap.
                 */
                while (
-                   (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
+                   (blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE &&
                    n > 4
                ) {
                        n >>= 1;
@@ -1437,7 +1470,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                 */
                if ((blk ^ (blk + n)) & dmmax_mask) {
                        j = ((blk + dmmax) & dmmax_mask) - blk;
-                       swp_pager_freeswapspace(blk + j, n - j);
+                       swp_pager_freeswapspace(object, blk + j, n - j);
                        n = j;
                }
 
@@ -1445,7 +1478,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                 * All I/O parameters have been satisfied, build the I/O
                 * request and assign the swap space.
                 */
-
                if (sync == TRUE)
                        bp = getpbuf(&nsw_wcount_sync);
                else
@@ -1460,12 +1492,10 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                for (j = 0; j < n; ++j) {
                        vm_page_t mreq = m[i+j];
 
-                       swp_pager_meta_build(
-                           mreq->object, 
-                           mreq->pindex,
-                           blk + j
-                       );
-                       vm_page_dirty(mreq);
+                       swp_pager_meta_build(mreq->object, mreq->pindex,
+                                            blk + j);
+                       if (object->type == OBJT_SWAP)
+                               vm_page_dirty(mreq);
                        rtvals[i+j] = VM_PAGER_OK;
 
                        vm_page_flag_set(mreq, PG_SWAPINPROG);
@@ -1642,10 +1672,16 @@ swp_pager_async_iodone(struct bio *bio)
                                 * If a write error occurs, reactivate page
                                 * so it doesn't clog the inactive list,
                                 * then finish the I/O.
+                                *
+                                * Only for OBJT_SWAP.  When using the swap
+                                * as a cache for clean vnode-backed pages
+                                * we don't mess with the page dirty state.
                                 */
-                               vm_page_dirty(m);
                                vm_page_flag_clear(m, PG_SWAPINPROG);
-                               vm_page_activate(m);
+                               if (m->object->type == OBJT_SWAP) {
+                                       vm_page_dirty(m);
+                                       vm_page_activate(m);
+                               }
                                vm_page_io_finish(m);
                        }
                } else if (bio->bio_caller_info1.index & SWBIO_READ) {
@@ -1709,8 +1745,12 @@ swp_pager_async_iodone(struct bio *bio)
                         * page.  Do not try to cache it (which would also
                         * involve a pmap op), because the page might still
                         * be read-heavy.
+                        *
+                        * When using the swap to cache clean vnode pages
+                        * we do not mess with the page dirty bits.
                         */
-                       vm_page_undirty(m);
+                       if (m->object->type == OBJT_SWAP)
+                               vm_page_undirty(m);
                        vm_page_flag_clear(m, PG_SWAPINPROG);
                        vm_page_flag_set(m, PG_SWAPPED);
                        vm_page_io_finish(m);
@@ -1854,7 +1894,7 @@ retry:
        index &= SWAP_META_MASK;
 
        if (swap->swb_pages[index] != SWAPBLK_NONE) {
-               swp_pager_freeswapspace(swap->swb_pages[index], 1);
+               swp_pager_freeswapspace(object, swap->swb_pages[index], 1);
                --swap->swb_count;
        }
 
@@ -1939,7 +1979,7 @@ swp_pager_meta_free_callback(struct swblock *swap, void *data)
                daddr_t v = swap->swb_pages[index];
 
                if (v != SWAPBLK_NONE) {
-                       swp_pager_freeswapspace(v, 1);
+                       swp_pager_freeswapspace(object, v, 1);
                        swap->swb_pages[index] = SWAPBLK_NONE;
                        if (--swap->swb_count == 0) {
                                swp_pager_remove(object, swap);
@@ -1974,7 +2014,7 @@ swp_pager_meta_free_all(vm_object_t object)
                        daddr_t v = swap->swb_pages[i];
                        if (v != SWAPBLK_NONE) {
                                --swap->swb_count;
-                               swp_pager_freeswapspace(v, 1);
+                               swp_pager_freeswapspace(object, v, 1);
                        }
                }
                if (swap->swb_count != 0)
@@ -2025,7 +2065,7 @@ swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
 
                if (r1 != SWAPBLK_NONE) {
                        if (flags & SWM_FREE) {
-                               swp_pager_freeswapspace(r1, 1);
+                               swp_pager_freeswapspace(object, r1, 1);
                                r1 = SWAPBLK_NONE;
                        }
                        if (flags & (SWM_FREE|SWM_POP)) {
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index c8bf2ea..49decef 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -87,6 +87,8 @@ struct swblock {
 
 #ifdef _KERNEL
 extern int swap_pager_full;
+extern int vm_swap_cache_use;
+extern int vm_swap_anon_use;
 extern struct blist *swapblist;
 
 void swap_pager_putpages (vm_object_t, struct vm_page **, int, boolean_t, int *);
@@ -96,6 +98,7 @@ int swap_pager_swp_alloc (vm_object_t, int);
 void swap_pager_copy (vm_object_t, vm_object_t, vm_pindex_t, int);
 void swap_pager_freespace (vm_object_t, vm_pindex_t, vm_pindex_t);
 void swap_pager_freespace_all (vm_object_t);
+void swap_pager_page_inserted(vm_page_t);
 void swap_pager_swap_init (void);
 void swap_pager_newswap (void);
 int swap_pager_reserve (vm_object_t, vm_pindex_t, vm_size_t);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index a5bebc6..cf7652c 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -88,6 +88,7 @@
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page2.h>
+#include <vm/swap_pager.h>
 
 #include <machine/md_var.h>
 
@@ -404,6 +405,11 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
         */
        if ((m->valid & m->dirty) || (m->flags & PG_WRITEABLE))
                vm_object_set_writeable_dirty(object);
+
+       /*
+        * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
+        */
+       swap_pager_page_inserted(m);
 }
 
 /*
diff --git a/sys/vm/vm_swapcache.c b/sys/vm/vm_swapcache.c
new file mode 100644
index 0000000..14d7b08
--- /dev/null
+++ b/sys/vm/vm_swapcache.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@backplane.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Implement the swapcache daemon.  When enabled, swap is assumed to be
+ * configured on a fast storage device such as a SSD.  Swap is assigned
+ * to clean vnode-backed pages in the inactive queue, clustered by object
+ * if possible, and written out.  The swap assignment sticks around even
+ * after the underlying pages have been recycled.
+ *
+ * The daemon manages write bandwidth based on sysctl settings to control
+ * wear on the SSD.
+ *
+ * The vnode strategy code will check for the swap assignments and divert
+ * reads to the swap device.
+ *
+ * This operates on both regular files and the block device vnodes used by
+ * filesystems to manage meta-data.
+ */
+
+#include "opt_vm.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+#include <sys/vmmeter.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <sys/lock.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
+#include <vm/vm_extern.h>
+
+#include <sys/thread2.h>
+#include <vm/vm_page2.h>
+
+#define INACTIVE_LIST  (&vm_page_queues[PQ_INACTIVE].pl)
+
+/* the kernel process "swapcached" */
+static void vm_swapcached (void);
+static void vm_swapcached_flush (vm_page_t m);
+struct thread *swapcached_thread;
+
+static struct kproc_desc swpc_kp = {
+       "swapcached",
+       vm_swapcached,
+       &swapcached_thread
+};
+SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp)
+
+SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);
+
+static int vm_swapcache_sleep;
+static int vm_swapcache_maxlaunder = 64;
+static int vm_swapcache_data_enable = 0;
+static int vm_swapcache_meta_enable = 0;
+static int64_t vm_swapcache_write_count;
+
+SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
+       CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
+SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
+       CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
+SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
+       CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
+SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
+       CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
+
+/*
+ * vm_swapcached is the high level swapcache write daemon.
+ */
+static void
+vm_swapcached(void)
+{
+       struct vm_page marker;
+       vm_object_t object;
+       vm_page_t m;
+       int count;
+
+       /*
+        * Thread setup
+        */
+       curthread->td_flags |= TDF_SYSTHREAD;
+
+       /*
+        * Initialize our marker
+        */
+       bzero(&marker, sizeof(marker));
+       marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+       marker.queue = PQ_INACTIVE;
+       marker.wire_count = 1;
+
+       crit_enter();
+       TAILQ_INSERT_HEAD(INACTIVE_LIST, &marker, pageq);
+
+       for (;;) {
+               /*
+                * Loop once a second or so looking for work when enabled.
+                */
+               if (vm_swapcache_data_enable == 0 &&
+                   vm_swapcache_meta_enable == 0) {
+                       tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
+                       continue;
+               }
+               tsleep(&vm_swapcache_sleep, 0, "csleep", hz);
+
+               /*
+                * Calculate the number of pages to test.  We don't want
+                * to get into a cpu-bound loop.
+                */
+               count = vmstats.v_inactive_count;
+               if (count > vm_swapcache_maxlaunder)
+                       count = vm_swapcache_maxlaunder;
+
+               /*
+                * Scan the inactive queue from our marker to locate
+                * suitable pages to push to the swap cache.
+                *
+                * We are looking for clean vnode-backed pages.
+                */
+               m = &marker;
+               while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) {
+                       if (m->flags & PG_MARKER) {
+                               ++count;
+                               continue;
+                       }
+                       if (m->flags & (PG_SWAPPED | PG_BUSY | PG_UNMANAGED))
+                               continue;
+                       if (m->busy || m->hold_count || m->wire_count)
+                               continue;
+                       if (m->valid != VM_PAGE_BITS_ALL)
+                               continue;
+                       if (m->dirty & m->valid)
+                               continue;
+                       if ((object = m->object) == NULL)
+                               continue;
+                       if (object->type != OBJT_VNODE)
+                               continue;
+                       vm_page_test_dirty(m);
+                       if (m->dirty & m->valid)
+                               continue;
+
+                       /*
+                        * Ok, move the marker and soft-busy the page.
+                        */
+                       TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq);
+                       TAILQ_INSERT_AFTER(INACTIVE_LIST, m, &marker, pageq);
+
+                       /*
+                        * Assign swap and initiate I/O
+                        */
+                       vm_swapcached_flush(m);
+
+                       /*
+                        * Setup for next loop using marker.
+                        */
+                       m = &marker;
+               }
+               TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq);
+               if (m)
+                       TAILQ_INSERT_BEFORE(m, &marker, pageq);
+               else
+                       TAILQ_INSERT_HEAD(INACTIVE_LIST, &marker, pageq);
+
+       }
+       TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq);
+       crit_exit();
+}
+
+/*
+ * Flush the specified page using the swap_pager.
+ */
+static
+void
+vm_swapcached_flush(vm_page_t m)
+{
+       vm_object_t object;
+       int rtvals;
+
+       vm_page_io_start(m);
+       vm_page_protect(m, VM_PROT_READ);
+
+       object = m->object;
+       vm_object_pip_add(object, 1);
+       swap_pager_putpages(object, &m, 1, FALSE, &rtvals);
+
+       if (rtvals != VM_PAGER_PEND) {
+               vm_object_pip_wakeup(object);
+               vm_page_io_finish(m);
+       }
+}