* Add vn_cache_strategy() and adjust vn_strategy() to call it. This
implements the read intercept. If vn_cache_strategy() determines that
the entire request can be handled by the swap cache it issues an
appropriate swap_pager_strategy() call and returns 1, else it returns 0
and the normal vn_strategy() function is run.
vn_cache_strategy() only intercepts READs which meet some fairly strict
requirements, including no bogus pages and page alignment (so certain
meta-data in UFS which uses a 6144 byte block size cannot be read via
the swap cache, sorry).
* Implement numerous sysctls.
vm.swapcache.accrate (default 1000000)
The average long-term write rate in bytes/second for writing
data to the swap cache. This is what ultimately controls the
wear rate of the SSD swap.
vm.swapcache.maxburst (default 1000000000)
vm.swapcache.curburst (default starts at 1000000000)
On machine boot curburst defaults to maxburst and will automatically
be trimmed to maxburst if you change maxburst. This allows a high
write-rate after boot.
During normal operation writes reduce curburst and accrate increases
curburst (up to maxburst), so periods of inactivity will allow another
burst of write activity later on.
vm.swapcache.read_enable (default 0 - disabled)
Enable the swap cache read intercept. When turned on vn_strategy()
calls will read from the swap cache if possible. When turned off
vn_strategy() calls read from the underlying vnode whether data
is available in the swap cache or not.
vm.swapcache.meta_enable (default 0 - disabled)
Enable swap caching of meta-data (The VM-backed block devices used
by filesystems). The swapcache code scans the VM page inactive
queue for suitable clean VCHR-backed VM pages and writes them to
the swap cache.
vm.swapcache.data_enable (default 0 - disabled)
Enable swap caching of data (Regular files). The swapcache code
scans the VM page inactive queue for suitable clean VREG-backed VM
pages and writes them to the swap cache.
vm.swapcache.maxlaunder (default 128 pages per 1/10 second)
Specifies the maximum number of pages in the inactive queue to
scan every 1/10 second. Set fairly low for the moment but
the default will ultimately be increased to something like 512
or 1024.
vm.swapcache.write_count
The total amount of data written by the swap cache to swap,
in bytes, since boot.
* Call swap_pager_unswapped() in a few more places that need it.
* NFS doesn't use bread/vn_strategy so it has been modified to call
vn_cache_strategy() directly for async IO. Currently we cannot
easily do it for synchronous IO, but async IO will cover most of
the cases.
* The swap cache will use up to 2/3 of available swap space to
cache clean vnode-backed data. Currently once this limit is
reached it will rely on vnode recycling to clean out space
and make room for more.
Vnode recycling is currently excessively limiting the amount of
data which can be cached, since when a vnode is recycled its
backing VM object is also recycled and the swap cache assignments
are freed. Meta-data has other problems... it can choke the
swap cache.
Dealing with these issues is on the TODO.
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
#include <sys/buf2.h>
#include <sys/thread2.h>
char *buf_wmesg = BUF_WMESG;
-extern int vm_swap_size;
-
#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
#define VFS_BIO_NEED_UNUSED02 0x02
#define VFS_BIO_NEED_UNUSED04 0x04
}
KASSERT(presid >= 0, ("brelse: extra page"));
vm_page_set_invalid(m, poffset, presid);
+
+ /*
+ * Also make sure any swap cache is removed
+ * as it is now stale (HAMMER in particular
+ * uses B_NOCACHE to deal with buffer
+ * aliasing).
+ */
+ swap_pager_unswapped(m);
}
resid -= PAGE_SIZE - (foff & PAGE_MASK);
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
/*
* Initiate I/O on a vnode.
+ *
+ * SWAPCACHE OPERATION:
+ *
+ * Real buffer cache buffers have a non-NULL bp->b_vp. Unfortunately
+ * devfs also uses b_vp for fake buffers so we also have to check
+ * that B_PAGING is 0. In this case the passed 'vp' is probably the
+ * underlying block device. The swap assignments are related to the
+ * buffer cache buffer's b_vp, not the passed vp.
+ *
+ * The passed vp == bp->b_vp only in the case where the strategy call
+ * is made on the vp itself for its own buffers (a regular file or
+ * block device vp). The filesystem usually then re-calls vn_strategy()
+ * after translating the request to an underlying device.
+ *
+ * Cluster buffers set B_CLUSTER and the passed vp is the vp of the
+ * underlying buffer cache buffers.
+ *
+ * We can only deal with page-aligned buffers at the moment, because
+ * we can't tell what the real dirty state for pages straddling a buffer
+ * is.
+ *
+ * In order to call swap_pager_strategy() we must provide the VM object
+ * and base offset for the underlying buffer cache pages so it can find
+ * the swap blocks.
*/
void
vn_strategy(struct vnode *vp, struct bio *bio)
{
struct bio_track *track;
+ struct buf *bp = bio->bio_buf;
+
+ KKASSERT(bp->b_cmd != BUF_CMD_DONE);
+
+ /*
+ * Handle the swap cache intercept.
+ */
+ if (vn_cache_strategy(vp, bio))
+ return;
- KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE);
- if (bio->bio_buf->b_cmd == BUF_CMD_READ)
+ /*
+ * Otherwise do the operation through the filesystem
+ */
+ if (bp->b_cmd == BUF_CMD_READ)
track = &vp->v_track_read;
else
track = &vp->v_track_write;
vop_strategy(*vp->v_ops, vp, bio);
}
+int
+vn_cache_strategy(struct vnode *vp, struct bio *bio)
+{
+ struct buf *bp = bio->bio_buf;
+ struct bio *nbio;
+ vm_object_t object;
+ vm_page_t m;
+ int i;
+
+ /*
+ * Swap cache read intercept: try to satisfy the read described by
+ * 'bio' from the swap cache instead of the vnode. Returns 1 when
+ * the request has been handed to swap_pager_strategy(), or 0 when
+ * the caller must perform the I/O itself.
+ *
+ * Is this buffer cache buffer suitable for reading from
+ * the swap cache?
+ *
+ * The request is rejected (return 0) when any of these hold:
+ * - the vm_swapcache_read_enable knob is 0
+ * - the command is not BUF_CMD_READ
+ * - the buffer is not a cluster buffer and either has no b_vp
+ * or has B_PAGING set
+ * - b_loffset or b_bcount is not a multiple of the page size
+ */
+ if (vm_swapcache_read_enable == 0 ||
+ bp->b_cmd != BUF_CMD_READ ||
+ ((bp->b_flags & B_CLUSTER) == 0 &&
+ (bp->b_vp == NULL || (bp->b_flags & B_PAGING))) ||
+ ((int)bp->b_loffset & PAGE_MASK) != 0 ||
+ (bp->b_bcount & PAGE_MASK) != 0) {
+ return(0);
+ }
+
+ /*
+ * Figure out the original VM object (it will match the underlying
+ * VM pages). Note that swap cached data uses page indices relative
+ * to that object, not relative to bio->bio_offset.
+ *
+ * Cluster buffers take the object from the passed vp; all other
+ * buffers take it from the buffer's own b_vp (known to be non-NULL
+ * after the checks above).
+ */
+ if (bp->b_flags & B_CLUSTER)
+ object = vp->v_object;
+ else
+ object = bp->b_vp->v_object;
+
+ /*
+ * In order to be able to use the swap cache all underlying VM
+ * pages must be marked as such, and we can't have any bogus pages.
+ *
+ * The scan stops at the first page that lacks PG_SWAPPED or is
+ * the bogus_page placeholder, leaving i short of xio_npages.
+ */
+ for (i = 0; i < bp->b_xio.xio_npages; ++i) {
+ m = bp->b_xio.xio_pages[i];
+ if ((m->flags & PG_SWAPPED) == 0)
+ break;
+ if (m == bogus_page)
+ break;
+ }
+
+ /*
+ * If we are good then issue the I/O using swap_pager_strategy()
+ *
+ * A new bio is obtained from push_bio() and its offset is set
+ * from the first page's pindex via ptoa(), i.e. relative to the
+ * VM object rather than to the original bio's offset. The
+ * KKASSERT checks that the first page belongs to the object
+ * selected above.
+ */
+ if (i == bp->b_xio.xio_npages) {
+ m = bp->b_xio.xio_pages[0];
+ nbio = push_bio(bio);
+ nbio->bio_offset = ptoa(m->pindex);
+ KKASSERT(m->object == object);
+ swap_pager_strategy(object, nbio);
+ return(1);
+ }
+ return(0);
+}
+
+
/*
* bpdone:
*
* Assume that vm_page_protect() can block (it can block
* if VM_PROT_NONE, don't take any chances regardless).
*
- * In particularly note that for writes we must incorporate
+ * In particular note that for writes we must incorporate
* page dirtyness from the VM system into the buffer's
* dirty range.
*
* vm_page_protect(). We may not be able
* to clear all dirty bits for a page if it
* was also memory mapped (NFS).
+ *
+ * Finally be sure to unassign any swap-cache
+ * backing store as it is now stale.
*/
vm_page_protect(m, VM_PROT_READ);
vfs_clean_one_page(bp, i, m);
+ swap_pager_unswapped(m);
} else if (m->valid == VM_PAGE_BITS_ALL) {
/*
* When readying a vnode-backed buffer for
int vrecycle (struct vnode *vp);
int vmaxiosize (struct vnode *vp);
void vn_strategy(struct vnode *vp, struct bio *bio);
+int vn_cache_strategy(struct vnode *vp, struct bio *bio);
int vn_close (struct vnode *vp, int flags);
int vn_isdisk (struct vnode *vp, int *errp);
int vn_lock (struct vnode *vp, int flags);
KKASSERT(vp->v_tag == VT_NFS);
BUF_KERNPROC(bp);
+
+ /*
+ * Shortcut swap cache (not done automatically because we are not
+ * using bread()).
+ */
+ if (vn_cache_strategy(vp, bio))
+ return;
+
bio->bio_driver_info = vp;
crit_enter();
TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act);
struct uio uio;
struct iovec io;
+#if 0
+ /*
+ * Shortcut swap cache (not done automatically because we are not
+ * using bread()).
+ *
+ * XXX The biowait is a hack until we can figure out how to stop a
+ * biodone chain when a middle element is BIO_SYNC. BIO_SYNC is
+ * set so the bp shouldn't get ripped out from under us. The only
+ * use-cases are fully synchronous I/O cases.
+ *
+ * XXX This is having problems, give up for now.
+ */
+ if (vn_cache_strategy(vp, bio)) {
+ kprintf("X");
+ error = biowait(&bio->bio_buf->b_bio1, "nfsrsw");
+ return (error);
+ }
+#endif
+
KKASSERT(vp->v_tag == VT_NFS);
np = VTONFS(vp);
nmp = VFSTONFS(vp->v_mount);
* in the old system.
*/
-extern int vm_swap_size; /* number of free swap blocks, in pages */
-
int swap_pager_full; /* swap space exhaustion (task killing) */
int vm_swap_cache_use;
int vm_swap_anon_use;
#ifdef _KERNEL
extern int swap_pager_full;
+extern int vm_swap_size;
extern int vm_swap_cache_use;
extern int vm_swap_anon_use;
+extern int vm_swapcache_read_enable;
extern struct blist *swapblist;
void swap_pager_putpages (vm_object_t, struct vm_page **, int, boolean_t, int *);
static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
#endif
-extern int vm_swap_size;
static int vm_max_launder = 32;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);
+int vm_swapcache_read_enable;
static int vm_swapcache_sleep;
-static int vm_swapcache_maxlaunder = 64;
+static int vm_swapcache_maxlaunder = 128;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
+static int64_t vm_swapcache_curburst = 1000000000LL;
+static int64_t vm_swapcache_maxburst = 1000000000LL;
+static int64_t vm_swapcache_accrate = 1000000LL;
static int64_t vm_swapcache_write_count;
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
+
SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
+SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
+ CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
+
+SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
+ CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
+SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
+ CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
+SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
+ CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
{
struct vm_page marker;
vm_object_t object;
+ struct vnode *vp;
vm_page_t m;
int count;
tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
continue;
}
- tsleep(&vm_swapcache_sleep, 0, "csleep", hz);
+
+ /*
+ * Polling rate when enabled is 10 hz. Deal with write
+ * bandwidth limits.
+ *
+ * We don't want to nickel-and-dime the scan as that will
+ * create unnecessary fragmentation.
+ */
+ tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);
+ vm_swapcache_curburst += vm_swapcache_accrate / 10;
+ if (vm_swapcache_curburst > vm_swapcache_maxburst)
+ vm_swapcache_curburst = vm_swapcache_maxburst;
+ if (vm_swapcache_curburst < vm_swapcache_accrate)
+ continue;
+
+ /*
+ * Don't load any more into the cache once we have exceeded
+ * 2/3 of available swap space. XXX need to start cleaning
+ * it out, though vnode recycling will accomplish that to
+ * some degree.
+ */
+ if (vm_swap_cache_use > vm_swap_size * 2 / 3)
+ continue;
/*
* Calculate the number of pages to test. We don't want
++count;
continue;
}
+ if (vm_swapcache_curburst < 0)
+ break;
if (m->flags & (PG_SWAPPED | PG_BUSY | PG_UNMANAGED))
continue;
if (m->busy || m->hold_count || m->wire_count)
continue;
if ((object = m->object) == NULL)
continue;
- if (object->type != OBJT_VNODE)
+ if (object->type != OBJT_VNODE ||
+ (object->flags & OBJ_DEAD)) {
continue;
+ }
vm_page_test_dirty(m);
if (m->dirty & m->valid)
continue;
+ vp = object->handle;
+ if (vp == NULL)
+ continue;
+ switch(vp->v_type) {
+ case VREG:
+ if (vm_swapcache_data_enable == 0)
+ continue;
+ break;
+ case VCHR:
+ if (vm_swapcache_meta_enable == 0)
+ continue;
+ break;
+ default:
+ continue;
+ }
/*
* Ok, move the marker and soft-busy the page.
object = m->object;
vm_object_pip_add(object, 1);
swap_pager_putpages(object, &m, 1, FALSE, &rtvals);
+ vm_swapcache_write_count += PAGE_SIZE;
+ vm_swapcache_curburst -= PAGE_SIZE;
if (rtvals != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);