sys/vm/vm_swapcache.c

   1 /*
   2  * (MPSAFE)
   3  *
   4  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to The DragonFly Project
   7  * by Matthew Dillon <dillon@backplane.com>
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  *
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in
  17  *    the documentation and/or other materials provided with the
  18  *    distribution.
  19  * 3. Neither the name of The DragonFly Project nor the names of its
  20  *    contributors may be used to endorse or promote products derived
  21  *    from this software without specific, prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36
  37 /*
  38  * Implement the swapcache daemon.  When enabled swap is assumed to be
  39  * configured on a fast storage device such as a SSD.  Swap is assigned
  40  * to clean vnode-backed pages in the inactive queue, clustered by object
  41  * if possible, and written out.  The swap assignment sticks around even
  42  * after the underlying pages have been recycled.
  43  *
  44  * The daemon manages write bandwidth based on sysctl settings to control
  45  * wear on the SSD.
  46  *
  47  * The vnode strategy code will check for the swap assignments and divert
  48  * reads to the swap device when the data is present in the swapcache.
  49  *
  50  * This operates on both regular files and the block device vnodes used by
  51  * filesystems to manage meta-data.
  52  */
  53
  54 #include "opt_vm.h"
  55 #include <sys/param.h>
  56 #include <sys/systm.h>
  57 #include <sys/kernel.h>
  58 #include <sys/proc.h>
  59 #include <sys/kthread.h>
  60 #include <sys/resourcevar.h>
  61 #include <sys/signalvar.h>
  62 #include <sys/vnode.h>
  63 #include <sys/vmmeter.h>
  64 #include <sys/sysctl.h>
  65
  66 #include <vm/vm.h>
  67 #include <vm/vm_param.h>
  68 #include <sys/lock.h>
  69 #include <vm/vm_object.h>
  70 #include <vm/vm_page.h>
  71 #include <vm/vm_map.h>
  72 #include <vm/vm_pageout.h>
  73 #include <vm/vm_pager.h>
  74 #include <vm/swap_pager.h>
  75 #include <vm/vm_extern.h>
  76
  77 #include <sys/thread2.h>
  78 #include <vm/vm_page2.h>
  79
  80 #define INACTIVE_LIST   (&vm_page_queues[PQ_INACTIVE].pl)
  81
  82 /* the kernel process "vm_pageout"*/
  83 static void vm_swapcached (void);
  84 static int vm_swapcached_flush (vm_page_t m, int isblkdev);
  85 static int vm_swapcache_test(vm_page_t m);
  86 static void vm_swapcache_writing(vm_page_t marker);
  87 static void vm_swapcache_cleaning(vm_object_t marker);
  88 struct thread *swapcached_thread;
  89
  90 static struct kproc_desc swpc_kp = {
  91         "swapcached",
  92         vm_swapcached,
  93         &swapcached_thread
  94 };
  95 SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp)
  96
  97 SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);
  98
  99 int vm_swapcache_read_enable;
 100 int vm_swapcache_inactive_heuristic;
 101 static int vm_swapcache_sleep;
 102 static int vm_swapcache_maxlaunder = 256;
 103 static int vm_swapcache_data_enable = 0;
 104 static int vm_swapcache_meta_enable = 0;
 105 static int vm_swapcache_maxswappct = 75;
 106 static int vm_swapcache_hysteresis;
 107 static int vm_swapcache_use_chflags = 1;        /* require chflags cache */
 108 static int64_t vm_swapcache_minburst = 10000000LL;      /* 10MB */
 109 static int64_t vm_swapcache_curburst = 4000000000LL;    /* 4G after boot */
 110 static int64_t vm_swapcache_maxburst = 2000000000LL;    /* 2G nominal max */
 111 static int64_t vm_swapcache_accrate = 100000LL;         /* 100K/s */
 112 static int64_t vm_swapcache_write_count;
 113 static int64_t vm_swapcache_maxfilesize;
 114
 115 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
 116         CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
 117
 118 SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
 119         CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
 120 SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
 121         CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
 122 SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
 123         CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
 124 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
 125         CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
 126 SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
 127         CTLFLAG_RW, &vm_swapcache_hysteresis, 0, "");
 128 SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
 129         CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");
 130
 131 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
 132         CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
 133 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
 134         CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
 135 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
 136         CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
 137 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
 138         CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
 139 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
 140         CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
 141 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
 142         CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
 143
 144 #define SWAPMAX(adj)    \
 145         ((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
 146
 147 /*
 148  * vm_swapcached is the high level pageout daemon.
 149  *
 150  * No requirements.
 151  */
 152 static void
 153 vm_swapcached(void)
 154 {
 155         enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
 156         enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
 157         struct vm_page page_marker;
 158         struct vm_object object_marker;
 159
 160         /*
 161          * Thread setup
 162          */
 163         curthread->td_flags |= TDF_SYSTHREAD;
 164         crit_enter();
 165         lwkt_gettoken(&vm_token);
 166
 167         /*
 168          * Initialize our marker for the inactive scan (SWAPC_WRITING)
 169          */
 170         bzero(&page_marker, sizeof(page_marker));
 171         page_marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
 172         page_marker.queue = PQ_INACTIVE;
 173         page_marker.wire_count = 1;
 174         TAILQ_INSERT_HEAD(INACTIVE_LIST, &page_marker, pageq);
 175         vm_swapcache_hysteresis = vmstats.v_inactive_target / 2;
 176         vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
 177
 178         /*
 179          * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
 180          */
 181         bzero(&object_marker, sizeof(object_marker));
 182         object_marker.type = OBJT_MARKER;
 183         lwkt_gettoken(&vmobj_token);
 184         TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list);
 185         lwkt_reltoken(&vmobj_token);
 186
 187         for (;;) {
 188                 /*
 189                  * Check every 5 seconds when not enabled or if no swap
 190                  * is present.
 191                  */
 192                 if ((vm_swapcache_data_enable == 0 &&
 193                      vm_swapcache_meta_enable == 0) ||
 194                     vm_swap_max == 0) {
 195                         tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
 196                         continue;
 197                 }
 198
 199                 /*
 200                  * Polling rate when enabled is approximately 10 hz.
 201                  */
 202                 tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);
 203
 204                 /*
 205                  * State hysteresis.  Generate write activity up to 75% of
 206                  * swap, then clean out swap assignments down to 70%, then
 207                  * repeat.
 208                  */
 209                 if (state == SWAPC_WRITING) {
 210                         if (vm_swap_cache_use > SWAPMAX(0))
 211                                 state = SWAPC_CLEANING;
 212                 } else {
 213                         if (vm_swap_cache_use < SWAPMAX(-5))
 214                                 state = SWAPC_WRITING;
 215                 }
 216
 217                 /*
 218                  * We are allowed to continue accumulating burst value
 219                  * in either state.  Allow the user to set curburst > maxburst
 220                  * for the initial load-in.
 221                  */
 222                 if (vm_swapcache_curburst < vm_swapcache_maxburst) {
 223                         vm_swapcache_curburst += vm_swapcache_accrate / 10;
 224                         if (vm_swapcache_curburst > vm_swapcache_maxburst)
 225                                 vm_swapcache_curburst = vm_swapcache_maxburst;
 226                 }
 227
 228                 /*
 229                  * We don't want to nickle-and-dime the scan as that will
 230                  * create unnecessary fragmentation.  The minimum burst
 231                  * is one-seconds worth of accumulation.
 232                  */
 233                 if (state == SWAPC_WRITING) {
 234                         if (vm_swapcache_curburst >= vm_swapcache_accrate) {
 235                                 if (burst == SWAPB_BURSTING) {
 236                                         vm_swapcache_writing(&page_marker);
 237                                         if (vm_swapcache_curburst <= 0)
 238                                                 burst = SWAPB_RECOVERING;
 239                                 } else if (vm_swapcache_curburst >
 240                                            vm_swapcache_minburst) {
 241                                         vm_swapcache_writing(&page_marker);
 242                                         burst = SWAPB_BURSTING;
 243                                 }
 244                         }
 245                 } else {
 246                         vm_swapcache_cleaning(&object_marker);
 247                 }
 248         }
 249         TAILQ_REMOVE(INACTIVE_LIST, &page_marker, pageq);
 250         lwkt_gettoken(&vmobj_token);
 251         TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
 252         lwkt_reltoken(&vmobj_token);
 253         lwkt_reltoken(&vm_token);
 254         crit_exit();
 255 }
 256
 257 /*
 258  * The caller must hold vm_token.
 259  */
 260 static void
 261 vm_swapcache_writing(vm_page_t marker)
 262 {
 263         vm_object_t object;
 264         struct vnode *vp;
 265         vm_page_t m;
 266         int count;
 267         int isblkdev;
 268
 269         /*
 270          * Deal with an overflow of the heuristic counter or if the user
 271          * manually changes the hysteresis.
 272          *
 273          * Try to avoid small incremental pageouts by waiting for enough
 274          * pages to buildup in the inactive queue to hopefully get a good
 275          * burst in.  This heuristic is bumped by the VM system and reset
 276          * when our scan hits the end of the queue.
 277          */
 278         if (vm_swapcache_inactive_heuristic < -vm_swapcache_hysteresis)
 279                 vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
 280         if (vm_swapcache_inactive_heuristic < 0)
 281                 return;
 282
 283         /*
 284          * Scan the inactive queue from our marker to locate
 285          * suitable pages to push to the swap cache.
 286          *
 287          * We are looking for clean vnode-backed pages.
 288          *
 289          * NOTE: PG_SWAPPED pages in particular are not part of
 290          *       our count because once the cache stabilizes we
 291          *       can end up with a very high datarate of VM pages
 292          *       cycling from it.
 293          */
 294         m = marker;
 295         count = vm_swapcache_maxlaunder;
 296
 297         while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) {
 298                 if (m->flags & (PG_MARKER | PG_SWAPPED)) {
 299                         ++count;
 300                         continue;
 301                 }
 302                 if (vm_swapcache_curburst < 0)
 303                         break;
 304                 if (vm_swapcache_test(m))
 305                         continue;
 306                 object = m->object;
 307                 vp = object->handle;
 308                 if (vp == NULL)
 309                         continue;
 310
 311                 switch(vp->v_type) {
 312                 case VREG:
 313                         /*
 314                          * If data_enable is 0 do not try to swapcache data.
 315                          * If use_chflags is set then only swapcache data for
 316                          * VSWAPCACHE marked vnodes, otherwise any vnode.
 317                          */
 318                         if (vm_swapcache_data_enable == 0 ||
 319                             ((vp->v_flag & VSWAPCACHE) == 0 &&
 320                              vm_swapcache_use_chflags)) {
 321                                 continue;
 322                         }
 323                         if (vm_swapcache_maxfilesize &&
 324                             object->size >
 325                             (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
 326                                 continue;
 327                         }
 328                         isblkdev = 0;
 329                         break;
 330                 case VCHR:
 331                         /*
 332                          * The PG_NOTMETA flag only applies to pages
 333                          * associated with block devices.
 334                          */
 335                         if (m->flags & PG_NOTMETA)
 336                                 continue;
 337                         if (vm_swapcache_meta_enable == 0)
 338                                 continue;
 339                         isblkdev = 1;
 340                         break;
 341                 default:
 342                         continue;
 343                 }
 344
 345                 /*
 346                  * Ok, move the marker and soft-busy the page.
 347                  */
 348                 TAILQ_REMOVE(INACTIVE_LIST, marker, pageq);
 349                 TAILQ_INSERT_AFTER(INACTIVE_LIST, m, marker, pageq);
 350
 351                 /*
 352                  * Assign swap and initiate I/O.
 353                  *
 354                  * (adjust for the --count which also occurs in the loop)
 355                  */
 356                 count -= vm_swapcached_flush(m, isblkdev) - 1;
 357
 358                 /*
 359                  * Setup for next loop using marker.
 360                  */
 361                 m = marker;
 362         }
 363
 364         /*
 365          * Cleanup marker position.  If we hit the end of the
 366          * list the marker is placed at the tail.  Newly deactivated
 367          * pages will be placed after it.
 368          *
 369          * Earlier inactive pages that were dirty and become clean
 370          * are typically moved to the end of PQ_INACTIVE by virtue
 371          * of vfs_vmio_release() when they become unwired from the
 372          * buffer cache.
 373          */
 374         TAILQ_REMOVE(INACTIVE_LIST, marker, pageq);
 375         if (m) {
 376                 TAILQ_INSERT_BEFORE(m, marker, pageq);
 377         } else {
 378                 TAILQ_INSERT_TAIL(INACTIVE_LIST, marker, pageq);
 379                 vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
 380         }
 381 }
 382
 383 /*
 384  * Flush the specified page using the swap_pager.
 385  *
 386  * Try to collect surrounding pages, including pages which may
 387  * have already been assigned swap.  Try to cluster within a
 388  * contiguous aligned SMAP_META_PAGES (typ 16 x PAGE_SIZE) block
 389  * to match what swap_pager_putpages() can do.
 390  *
 391  * We also want to try to match against the buffer cache blocksize
 392  * but we don't really know what it is here.  Since the buffer cache
 393  * wires and unwires pages in groups the fact that we skip wired pages
 394  * should be sufficient.
 395  *
 396  * Returns a count of pages we might have flushed (minimum 1)
 397  *
 398  * The caller must hold vm_token.
 399  */
 400 static
 401 int
 402 vm_swapcached_flush(vm_page_t m, int isblkdev)
 403 {
 404         vm_object_t object;
 405         vm_page_t marray[SWAP_META_PAGES];
 406         vm_pindex_t basei;
 407         int rtvals[SWAP_META_PAGES];
 408         int x;
 409         int i;
 410         int j;
 411         int count;
 412
 413         vm_page_io_start(m);
 414         vm_page_protect(m, VM_PROT_READ);
 415         object = m->object;
 416
 417         /*
 418          * Try to cluster around (m), keeping in mind that the swap pager
 419          * can only do SMAP_META_PAGES worth of continguous write.
 420          */
 421         x = (int)m->pindex & SWAP_META_MASK;
 422         marray[x] = m;
 423         basei = m->pindex;
 424
 425         for (i = x - 1; i >= 0; --i) {
 426                 m = vm_page_lookup(object, basei - x + i);
 427                 if (m == NULL)
 428                         break;
 429                 if (vm_swapcache_test(m))
 430                         break;
 431                 if (isblkdev && (m->flags & PG_NOTMETA))
 432                         break;
 433                 vm_page_io_start(m);
 434                 vm_page_protect(m, VM_PROT_READ);
 435                 if (m->queue - m->pc == PQ_CACHE) {
 436                         vm_page_unqueue_nowakeup(m);
 437                         vm_page_deactivate(m);
 438                 }
 439                 marray[i] = m;
 440         }
 441         ++i;
 442
 443         for (j = x + 1; j < SWAP_META_PAGES; ++j) {
 444                 m = vm_page_lookup(object, basei - x + j);
 445                 if (m == NULL)
 446                         break;
 447                 if (vm_swapcache_test(m))
 448                         break;
 449                 if (isblkdev && (m->flags & PG_NOTMETA))
 450                         break;
 451                 vm_page_io_start(m);
 452                 vm_page_protect(m, VM_PROT_READ);
 453                 if (m->queue - m->pc == PQ_CACHE) {
 454                         vm_page_unqueue_nowakeup(m);
 455                         vm_page_deactivate(m);
 456                 }
 457                 marray[j] = m;
 458         }
 459
 460         count = j - i;
 461         vm_object_pip_add(object, count);
 462         swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
 463         vm_swapcache_write_count += count * PAGE_SIZE;
 464         vm_swapcache_curburst -= count * PAGE_SIZE;
 465
 466         while (i < j) {
 467                 if (rtvals[i] != VM_PAGER_PEND) {
 468                         vm_page_io_finish(marray[i]);
 469                         vm_object_pip_wakeup(object);
 470                 }
 471                 ++i;
 472         }
 473         return(count);
 474 }
 475
 476 /*
 477  * Test whether a VM page is suitable for writing to the swapcache.
 478  * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 479  *
 480  * Returns 0 on success, 1 on failure
 481  *
 482  * The caller must hold vm_token.
 483  */
 484 static int
 485 vm_swapcache_test(vm_page_t m)
 486 {
 487         vm_object_t object;
 488
 489         if (m->flags & (PG_BUSY | PG_UNMANAGED))
 490                 return(1);
 491         if (m->busy || m->hold_count || m->wire_count)
 492                 return(1);
 493         if (m->valid != VM_PAGE_BITS_ALL)
 494                 return(1);
 495         if (m->dirty & m->valid)
 496                 return(1);
 497         if ((object = m->object) == NULL)
 498                 return(1);
 499         if (object->type != OBJT_VNODE ||
 500             (object->flags & OBJ_DEAD)) {
 501                 return(1);
 502         }
 503         vm_page_test_dirty(m);
 504         if (m->dirty & m->valid)
 505                 return(1);
 506         return(0);
 507 }
 508
 509 /*
 510  * Cleaning pass
 511  *
 512  * The caller must hold vm_token.
 513  */
 514 static
 515 void
 516 vm_swapcache_cleaning(vm_object_t marker)
 517 {
 518         vm_object_t object;
 519         struct vnode *vp;
 520         int count;
 521         int n;
 522
 523         object = marker;
 524         count = vm_swapcache_maxlaunder;
 525
 526         /*
 527          * Look for vnode objects
 528          */
 529         lwkt_gettoken(&vm_token);
 530         lwkt_gettoken(&vmobj_token);
 531
 532         while ((object = TAILQ_NEXT(object, object_list)) != NULL && count--) {
 533                 if (object->type != OBJT_VNODE)
 534                         continue;
 535                 if ((object->flags & OBJ_DEAD) || object->swblock_count == 0)
 536                         continue;
 537                 if ((vp = object->handle) == NULL)
 538                         continue;
 539                 if (vp->v_type != VREG && vp->v_type != VCHR)
 540                         continue;
 541
 542                 /*
 543                  * Adjust iterator.
 544                  */
 545                 if (marker->backing_object != object)
 546                         marker->size = 0;
 547
 548                 /*
 549                  * Move the marker so we can work on the VM object
 550                  */
 551                 TAILQ_REMOVE(&vm_object_list, marker, object_list);
 552                 TAILQ_INSERT_AFTER(&vm_object_list, object,
 553                                    marker, object_list);
 554
 555                 /*
 556                  * Look for swblocks starting at our iterator.
 557                  *
 558                  * The swap_pager_condfree() function attempts to free
 559                  * swap space starting at the specified index.  The index
 560                  * will be updated on return.  The function will return
 561                  * a scan factor (NOT the number of blocks freed).
 562                  *
 563                  * If it must cut its scan of the object short due to an
 564                  * excessive number of swblocks, or is able to free the
 565                  * requested number of blocks, it will return n >= count
 566                  * and we break and pick it back up on a future attempt.
 567                  */
 568                 n = swap_pager_condfree(object, &marker->size, count);
 569                 count -= n;
 570                 if (count < 0)
 571                         break;
 572
 573                 /*
 574                  * Setup for loop.
 575                  */
 576                 marker->size = 0;
 577                 object = marker;
 578         }
 579
 580         /*
 581          * Adjust marker so we continue the scan from where we left off.
 582          * When we reach the end we start back at the beginning.
 583          */
 584         TAILQ_REMOVE(&vm_object_list, marker, object_list);
 585         if (object)
 586                 TAILQ_INSERT_BEFORE(object, marker, object_list);
 587         else
 588                 TAILQ_INSERT_HEAD(&vm_object_list, marker, object_list);
 589         marker->backing_object = object;
 590
 591         lwkt_reltoken(&vmobj_token);
 592         lwkt_reltoken(&vm_token);
 593 }