| Commit | Line | Data |
|---|---|---|
| 096e95c0 MD |
1 | /* |
| 2 | * Copyright (c) 2010 The DragonFly Project. All rights reserved. | |
| 3 | * | |
| 4 | * This code is derived from software contributed to The DragonFly Project | |
| 5 | * by Matthew Dillon <dillon@backplane.com> | |
| 6 | * | |
| 7 | * Redistribution and use in source and binary forms, with or without | |
| 8 | * modification, are permitted provided that the following conditions | |
| 9 | * are met: | |
| 10 | * | |
| 11 | * 1. Redistributions of source code must retain the above copyright | |
| 12 | * notice, this list of conditions and the following disclaimer. | |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 14 | * notice, this list of conditions and the following disclaimer in | |
| 15 | * the documentation and/or other materials provided with the | |
| 16 | * distribution. | |
| 17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
| 18 | * contributors may be used to endorse or promote products derived | |
| 19 | * from this software without specific, prior written permission. | |
| 20 | * | |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
| 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
| 25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
| 26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
| 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
| 29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
| 31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 32 | * SUCH DAMAGE. | |
| 33 | */ | |
| 34 | ||
| 35 | /* | |
| 36 | * Implement the swapcache daemon. When enabled swap is assumed to be | |
| 37 | * configured on a fast storage device such as a SSD. Swap is assigned | |
| 38 | * to clean vnode-backed pages in the inactive queue, clustered by object | |
| 39 | * if possible, and written out. The swap assignment sticks around even | |
| 40 | * after the underlying pages have been recycled. | |
| 41 | * | |
| 42 | * The daemon manages write bandwidth based on sysctl settings to control | |
| 43 | * wear on the SSD. | |
| 44 | * | |
| 45 | * The vnode strategy code will check for the swap assignments and divert | |
| 46 | * reads to the swap device. | |
| 47 | * | |
| 48 | * This operates on both regular files and the block device vnodes used by | |
| 49 | * filesystems to manage meta-data. | |
| 50 | */ | |
| 51 | ||
| 52 | #include "opt_vm.h" | |
| 53 | #include <sys/param.h> | |
| 54 | #include <sys/systm.h> | |
| 55 | #include <sys/kernel.h> | |
| 56 | #include <sys/proc.h> | |
| 57 | #include <sys/kthread.h> | |
| 58 | #include <sys/resourcevar.h> | |
| 59 | #include <sys/signalvar.h> | |
| 60 | #include <sys/vnode.h> | |
| 61 | #include <sys/vmmeter.h> | |
| 62 | #include <sys/sysctl.h> | |
| 63 | ||
| 64 | #include <vm/vm.h> | |
| 65 | #include <vm/vm_param.h> | |
| 66 | #include <sys/lock.h> | |
| 67 | #include <vm/vm_object.h> | |
| 68 | #include <vm/vm_page.h> | |
| 69 | #include <vm/vm_map.h> | |
| 70 | #include <vm/vm_pageout.h> | |
| 71 | #include <vm/vm_pager.h> | |
| 72 | #include <vm/swap_pager.h> | |
| 73 | #include <vm/vm_extern.h> | |
| 74 | ||
| 75 | #include <sys/thread2.h> | |
| 76 | #include <vm/vm_page2.h> | |
| 77 | ||
| 78 | #define INACTIVE_LIST (&vm_page_queues[PQ_INACTIVE].pl) | |
| 79 | ||
| 80 | /* the kernel process "vm_pageout"*/ | |
| 81 | static void vm_swapcached (void); | |
| 82 | static void vm_swapcached_flush (vm_page_t m); | |
| 83 | struct thread *swapcached_thread; | |
| 84 | ||
| 85 | static struct kproc_desc swpc_kp = { | |
| 86 | "swapcached", | |
| 87 | vm_swapcached, | |
| 88 | &swapcached_thread | |
| 89 | }; | |
| 90 | SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp) | |
| 91 | ||
| 92 | SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL); | |
| 93 | ||
| c504e38e | 94 | int vm_swapcache_read_enable; |
| 096e95c0 | 95 | static int vm_swapcache_sleep; |
| 1e5196f0 | 96 | static int vm_swapcache_maxlaunder = 256; |
| 096e95c0 MD |
97 | static int vm_swapcache_data_enable = 0; |
| 98 | static int vm_swapcache_meta_enable = 0; | |
| c504e38e MD |
99 | static int64_t vm_swapcache_curburst = 1000000000LL; |
| 100 | static int64_t vm_swapcache_maxburst = 1000000000LL; | |
| 101 | static int64_t vm_swapcache_accrate = 1000000LL; | |
| 096e95c0 MD |
102 | static int64_t vm_swapcache_write_count; |
| 103 | ||
| 104 | SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder, | |
| 105 | CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, ""); | |
| c504e38e | 106 | |
| 096e95c0 MD |
107 | SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable, |
| 108 | CTLFLAG_RW, &vm_swapcache_data_enable, 0, ""); | |
| 109 | SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable, | |
| 110 | CTLFLAG_RW, &vm_swapcache_meta_enable, 0, ""); | |
| c504e38e MD |
111 | SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable, |
| 112 | CTLFLAG_RW, &vm_swapcache_read_enable, 0, ""); | |
| 113 | ||
| 114 | SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst, | |
| 115 | CTLFLAG_RW, &vm_swapcache_curburst, 0, ""); | |
| 116 | SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst, | |
| 117 | CTLFLAG_RW, &vm_swapcache_maxburst, 0, ""); | |
| 118 | SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate, | |
| 119 | CTLFLAG_RW, &vm_swapcache_accrate, 0, ""); | |
| 096e95c0 MD |
120 | SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count, |
| 121 | CTLFLAG_RW, &vm_swapcache_write_count, 0, ""); | |
| 122 | ||
| 123 | /* | |
| 124 | * vm_swapcached is the high level pageout daemon. | |
| 125 | */ | |
| 126 | static void | |
| 127 | vm_swapcached(void) | |
| 128 | { | |
| 129 | struct vm_page marker; | |
| 130 | vm_object_t object; | |
| c504e38e | 131 | struct vnode *vp; |
| 096e95c0 MD |
132 | vm_page_t m; |
| 133 | int count; | |
| 134 | ||
| 135 | /* | |
| 136 | * Thread setup | |
| 137 | */ | |
| 138 | curthread->td_flags |= TDF_SYSTHREAD; | |
| 139 | ||
| 140 | /* | |
| 141 | * Initialize our marker | |
| 142 | */ | |
| 143 | bzero(&marker, sizeof(marker)); | |
| 144 | marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; | |
| 145 | marker.queue = PQ_INACTIVE; | |
| 146 | marker.wire_count = 1; | |
| 147 | ||
| 148 | crit_enter(); | |
| 149 | TAILQ_INSERT_HEAD(INACTIVE_LIST, &marker, pageq); | |
| 150 | ||
| 151 | for (;;) { | |
| 152 | /* | |
| 153 | * Loop once a second or so looking for work when enabled. | |
| 154 | */ | |
| 155 | if (vm_swapcache_data_enable == 0 && | |
| 156 | vm_swapcache_meta_enable == 0) { | |
| 157 | tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5); | |
| 158 | continue; | |
| 159 | } | |
| c504e38e MD |
160 | |
| 161 | /* | |
| 162 | * Polling rate when enabled is 10 hz. Deal with write | |
| 163 | * bandwidth limits. | |
| 164 | * | |
| 165 | * We don't want to nickle-and-dime the scan as that will | |
| 166 | * create unnecessary fragmentation. | |
| 167 | */ | |
| 168 | tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10); | |
| 169 | vm_swapcache_curburst += vm_swapcache_accrate / 10; | |
| 170 | if (vm_swapcache_curburst > vm_swapcache_maxburst) | |
| 171 | vm_swapcache_curburst = vm_swapcache_maxburst; | |
| 172 | if (vm_swapcache_curburst < vm_swapcache_accrate) | |
| 173 | continue; | |
| 174 | ||
| 175 | /* | |
| 176 | * Don't load any more into the cache once we have exceeded | |
| 1e5196f0 | 177 | * 3/4 of available swap space. XXX need to start cleaning |
| c504e38e MD |
178 | * it out, though vnode recycling will accomplish that to |
| 179 | * some degree. | |
| 180 | */ | |
| 1e5196f0 | 181 | if (vm_swap_cache_use > vm_swap_max * 3 / 4) |
| c504e38e | 182 | continue; |
| 096e95c0 MD |
183 | |
| 184 | /* | |
| 185 | * Calculate the number of pages to test. We don't want | |
| 186 | * to get into a cpu-bound loop. | |
| 187 | */ | |
| 188 | count = vmstats.v_inactive_count; | |
| 189 | if (count > vm_swapcache_maxlaunder) | |
| 190 | count = vm_swapcache_maxlaunder; | |
| 191 | ||
| 192 | /* | |
| 193 | * Scan the inactive queue from our marker to locate | |
| 194 | * suitable pages to push to the swap cache. | |
| 195 | * | |
| 196 | * We are looking for clean vnode-backed pages. | |
| 5ac04117 MD |
197 | * |
| 198 | * NOTE: PG_SWAPPED pages in particular are not part of | |
| 199 | * our count because once the cache stabilizes we | |
| 200 | * can end up with a very high datarate of VM pages | |
| 201 | * cycling from it. | |
| 096e95c0 MD |
202 | */ |
| 203 | m = ▮ | |
| 204 | while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) { | |
| 5ac04117 | 205 | if (m->flags & (PG_MARKER | PG_SWAPPED)) { |
| 096e95c0 MD |
206 | ++count; |
| 207 | continue; | |
| 208 | } | |
| c504e38e MD |
209 | if (vm_swapcache_curburst < 0) |
| 210 | break; | |
| 5ac04117 | 211 | if (m->flags & (PG_BUSY | PG_UNMANAGED)) |
| 096e95c0 MD |
212 | continue; |
| 213 | if (m->busy || m->hold_count || m->wire_count) | |
| 214 | continue; | |
| 215 | if (m->valid != VM_PAGE_BITS_ALL) | |
| 216 | continue; | |
| 217 | if (m->dirty & m->valid) | |
| 218 | continue; | |
| 219 | if ((object = m->object) == NULL) | |
| 220 | continue; | |
| c504e38e MD |
221 | if (object->type != OBJT_VNODE || |
| 222 | (object->flags & OBJ_DEAD)) { | |
| 096e95c0 | 223 | continue; |
| c504e38e | 224 | } |
| 096e95c0 MD |
225 | vm_page_test_dirty(m); |
| 226 | if (m->dirty & m->valid) | |
| 227 | continue; | |
| c504e38e MD |
228 | vp = object->handle; |
| 229 | if (vp == NULL) | |
| 230 | continue; | |
| 231 | switch(vp->v_type) { | |
| 232 | case VREG: | |
| 233 | if (vm_swapcache_data_enable == 0) | |
| 234 | continue; | |
| 235 | break; | |
| 236 | case VCHR: | |
| 237 | if (vm_swapcache_meta_enable == 0) | |
| 238 | continue; | |
| 239 | break; | |
| 240 | default: | |
| 241 | continue; | |
| 242 | } | |
| 096e95c0 MD |
243 | |
| 244 | /* | |
| 245 | * Ok, move the marker and soft-busy the page. | |
| 246 | */ | |
| 247 | TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq); | |
| 248 | TAILQ_INSERT_AFTER(INACTIVE_LIST, m, &marker, pageq); | |
| 249 | ||
| 250 | /* | |
| 251 | * Assign swap and initiate I/O | |
| 252 | */ | |
| 253 | vm_swapcached_flush(m); | |
| 254 | ||
| 255 | /* | |
| 256 | * Setup for next loop using marker. | |
| 257 | */ | |
| 258 | m = ▮ | |
| 259 | } | |
| 1e5196f0 MD |
260 | |
| 261 | /* | |
| 262 | * Cleanup marker position. If we hit the end of the | |
| 263 | * list the marker is placed at the tail. Newly deactivated | |
| 264 | * pages will be placed after it. | |
| 265 | * | |
| 266 | * Earlier inactive pages that were dirty and become clean | |
| 267 | * are typically moved to the end of PQ_INACTIVE by virtue | |
| 268 | * of vfs_vmio_release() when they become unwired from the | |
| 269 | * buffer cache. | |
| 270 | */ | |
| 096e95c0 MD |
271 | TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq); |
| 272 | if (m) | |
| 273 | TAILQ_INSERT_BEFORE(m, &marker, pageq); | |
| 274 | else | |
| 1e5196f0 | 275 | TAILQ_INSERT_TAIL(INACTIVE_LIST, &marker, pageq); |
| 096e95c0 MD |
276 | |
| 277 | } | |
| 278 | TAILQ_REMOVE(INACTIVE_LIST, &marker, pageq); | |
| 279 | crit_exit(); | |
| 280 | } | |
| 281 | ||
| 282 | /* | |
| 283 | * Flush the specified page using the swap_pager. | |
| 284 | */ | |
| 285 | static | |
| 286 | void | |
| 287 | vm_swapcached_flush(vm_page_t m) | |
| 288 | { | |
| 289 | vm_object_t object; | |
| 290 | int rtvals; | |
| 291 | ||
| 292 | vm_page_io_start(m); | |
| 293 | vm_page_protect(m, VM_PROT_READ); | |
| 294 | ||
| 295 | object = m->object; | |
| 296 | vm_object_pip_add(object, 1); | |
| 297 | swap_pager_putpages(object, &m, 1, FALSE, &rtvals); | |
| c504e38e MD |
298 | vm_swapcache_write_count += PAGE_SIZE; |
| 299 | vm_swapcache_curburst -= PAGE_SIZE; | |
| 096e95c0 MD |
300 | |
| 301 | if (rtvals != VM_PAGER_PEND) { | |
| 302 | vm_object_pip_wakeup(object); | |
| 303 | vm_page_io_finish(m); | |
| 304 | } | |
| 305 | } |