sys/vm/vm_swapcache.c

   1 /*
   2  * (MPSAFE)
   3  *
   4  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to The DragonFly Project
   7  * by Matthew Dillon <dillon@backplane.com>
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  *
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in
  17  *    the documentation and/or other materials provided with the
  18  *    distribution.
  19  * 3. Neither the name of The DragonFly Project nor the names of its
  20  *    contributors may be used to endorse or promote products derived
  21  *    from this software without specific, prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36
  37 /*
  38  * Implement the swapcache daemon.  When enabled swap is assumed to be
  39  * configured on a fast storage device such as a SSD.  Swap is assigned
  40  * to clean vnode-backed pages in the inactive queue, clustered by object
  41  * if possible, and written out.  The swap assignment sticks around even
  42  * after the underlying pages have been recycled.
  43  *
  44  * The daemon manages write bandwidth based on sysctl settings to control
  45  * wear on the SSD.
  46  *
  47  * The vnode strategy code will check for the swap assignments and divert
  48  * reads to the swap device when the data is present in the swapcache.
  49  *
  50  * This operates on both regular files and the block device vnodes used by
  51  * filesystems to manage meta-data.
  52  */
  53
  54 #include "opt_vm.h"
  55 #include <sys/param.h>
  56 #include <sys/systm.h>
  57 #include <sys/kernel.h>
  58 #include <sys/proc.h>
  59 #include <sys/kthread.h>
  60 #include <sys/resourcevar.h>
  61 #include <sys/signalvar.h>
  62 #include <sys/vnode.h>
  63 #include <sys/vmmeter.h>
  64 #include <sys/sysctl.h>
  65 #include <sys/eventhandler.h>
  66
  67 #include <vm/vm.h>
  68 #include <vm/vm_param.h>
  69 #include <sys/lock.h>
  70 #include <vm/vm_object.h>
  71 #include <vm/vm_page.h>
  72 #include <vm/vm_map.h>
  73 #include <vm/vm_pageout.h>
  74 #include <vm/vm_pager.h>
  75 #include <vm/swap_pager.h>
  76 #include <vm/vm_extern.h>
  77
  78 #include <sys/thread2.h>
  79 #include <sys/spinlock2.h>
  80 #include <vm/vm_page2.h>
  81
  82 /* the kernel process "vm_pageout"*/
  83 static int vm_swapcached_flush (vm_page_t m, int isblkdev);
  84 static int vm_swapcache_test(vm_page_t m);
  85 static int vm_swapcache_writing_heuristic(void);
  86 static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
  87 static void vm_swapcache_cleaning(vm_object_t marker,
  88                         struct vm_object_hash **swindexp);
  89 static void vm_swapcache_movemarker(vm_object_t marker,
  90                         struct vm_object_hash *swindex, vm_object_t object);
  91 struct thread *swapcached_thread;
  92
  93 SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);
  94
  95 int vm_swapcache_read_enable;
  96 int vm_swapcache_inactive_heuristic;
  97 static int vm_swapcache_sleep;
  98 static int vm_swapcache_maxscan = PQ_L2_SIZE * 8;
  99 static int vm_swapcache_maxlaunder = PQ_L2_SIZE * 4;
 100 static int vm_swapcache_data_enable = 0;
 101 static int vm_swapcache_meta_enable = 0;
 102 static int vm_swapcache_maxswappct = 75;
 103 static int vm_swapcache_hysteresis;
 104 static int vm_swapcache_min_hysteresis;
 105 int vm_swapcache_use_chflags = 0;       /* require chflags cache */
 106 static int64_t vm_swapcache_minburst = 10000000LL;      /* 10MB */
 107 static int64_t vm_swapcache_curburst = 4000000000LL;    /* 4G after boot */
 108 static int64_t vm_swapcache_maxburst = 2000000000LL;    /* 2G nominal max */
 109 static int64_t vm_swapcache_accrate = 100000LL;         /* 100K/s */
 110 static int64_t vm_swapcache_write_count;
 111 static int64_t vm_swapcache_maxfilesize;
 112 static int64_t vm_swapcache_cleanperobj = 16*1024*1024;
 113
 114 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
 115         CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
 116 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
 117         CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");
 118
 119 SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
 120         CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
 121 SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
 122         CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
 123 SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
 124         CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
 125 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
 126         CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
 127 SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
 128         CTLFLAG_RD, &vm_swapcache_hysteresis, 0, "");
 129 SYSCTL_INT(_vm_swapcache, OID_AUTO, min_hysteresis,
 130         CTLFLAG_RW, &vm_swapcache_min_hysteresis, 0, "");
 131 SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
 132         CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");
 133
 134 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
 135         CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
 136 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
 137         CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
 138 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
 139         CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
 140 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
 141         CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
 142 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
 143         CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
 144 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
 145         CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
 146 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
 147         CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");
 148
 149 #define SWAPMAX(adj)    \
 150         ((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
 151
 152 /*
 153  * When shutting down the machine we want to stop swapcache operation
 154  * immediately so swap is not accessed after devices have been shuttered.
 155  */
 156 static void
 157 shutdown_swapcache(void *arg __unused)
 158 {
 159         vm_swapcache_read_enable = 0;
 160         vm_swapcache_data_enable = 0;
 161         vm_swapcache_meta_enable = 0;
 162         wakeup(&vm_swapcache_sleep);    /* shortcut 5-second wait */
 163 }
 164
 165 /*
 166  * vm_swapcached is the high level pageout daemon.
 167  *
 168  * No requirements.
 169  */
 170 static void
 171 vm_swapcached_thread(void)
 172 {
 173         enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
 174         enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
 175         static struct vm_page page_marker[PQ_L2_SIZE];
 176         static struct vm_object swmarker;
 177         static struct vm_object_hash *swindex;
 178         int q;
 179
 180         /*
 181          * Thread setup
 182          */
 183         curthread->td_flags |= TDF_SYSTHREAD;
 184         EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
 185                               swapcached_thread, SHUTDOWN_PRI_FIRST);
 186         EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
 187                               NULL, SHUTDOWN_PRI_SECOND);
 188
 189         /*
 190          * Initialize our marker for the inactive scan (SWAPC_WRITING)
 191          */
 192         bzero(&page_marker, sizeof(page_marker));
 193         for (q = 0; q < PQ_L2_SIZE; ++q) {
 194                 page_marker[q].flags = PG_FICTITIOUS | PG_MARKER;
 195                 page_marker[q].busy_count = PBUSY_LOCKED;
 196                 page_marker[q].queue = PQ_INACTIVE + q;
 197                 page_marker[q].pc = q;
 198                 page_marker[q].wire_count = 1;
 199                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
 200                 TAILQ_INSERT_HEAD(
 201                         &vm_page_queues[PQ_INACTIVE + q].pl,
 202                         &page_marker[q], pageq);
 203                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 204         }
 205
 206         vm_swapcache_min_hysteresis = 1024;
 207         vm_swapcache_hysteresis = vm_swapcache_min_hysteresis;
 208         vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
 209
 210         /*
 211          * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
 212          */
 213         bzero(&swmarker, sizeof(swmarker));
 214         swmarker.type = OBJT_MARKER;
 215         swindex = &vm_object_hash[0];
 216         lwkt_gettoken(&swindex->token);
 217         TAILQ_INSERT_HEAD(&swindex->list, &swmarker, object_list);
 218         lwkt_reltoken(&swindex->token);
 219
 220         for (;;) {
 221                 int reached_end;
 222                 int scount;
 223                 int count;
 224
 225                 /*
 226                  * Handle shutdown
 227                  */
 228                 kproc_suspend_loop();
 229
 230                 /*
 231                  * Check every 5 seconds when not enabled or if no swap
 232                  * is present.
 233                  */
 234                 if ((vm_swapcache_data_enable == 0 &&
 235                      vm_swapcache_meta_enable == 0 &&
 236                      vm_swap_cache_use <= SWAPMAX(0)) ||
 237                     vm_swap_max == 0) {
 238                         tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
 239                         continue;
 240                 }
 241
 242                 /*
 243                  * Polling rate when enabled is approximately 10 hz.
 244                  */
 245                 tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);
 246
 247                 /*
 248                  * State hysteresis.  Generate write activity up to 75% of
 249                  * swap, then clean out swap assignments down to 70%, then
 250                  * repeat.
 251                  */
 252                 if (state == SWAPC_WRITING) {
 253                         if (vm_swap_cache_use > SWAPMAX(0))
 254                                 state = SWAPC_CLEANING;
 255                 } else {
 256                         if (vm_swap_cache_use < SWAPMAX(-10))
 257                                 state = SWAPC_WRITING;
 258                 }
 259
 260                 /*
 261                  * We are allowed to continue accumulating burst value
 262                  * in either state.  Allow the user to set curburst > maxburst
 263                  * for the initial load-in.
 264                  */
 265                 if (vm_swapcache_curburst < vm_swapcache_maxburst) {
 266                         vm_swapcache_curburst += vm_swapcache_accrate / 10;
 267                         if (vm_swapcache_curburst > vm_swapcache_maxburst)
 268                                 vm_swapcache_curburst = vm_swapcache_maxburst;
 269                 }
 270
 271                 /*
 272                  * We don't want to nickle-and-dime the scan as that will
 273                  * create unnecessary fragmentation.  The minimum burst
 274                  * is one-seconds worth of accumulation.
 275                  */
 276                 if (state != SWAPC_WRITING) {
 277                         vm_swapcache_cleaning(&swmarker, &swindex);
 278                         continue;
 279                 }
 280                 if (vm_swapcache_curburst < vm_swapcache_accrate)
 281                         continue;
 282
 283                 reached_end = 0;
 284                 count = vm_swapcache_maxlaunder / PQ_L2_SIZE + 2;
 285                 scount = vm_swapcache_maxscan / PQ_L2_SIZE + 2;
 286
 287                 if (burst == SWAPB_BURSTING) {
 288                         if (vm_swapcache_writing_heuristic()) {
 289                                 for (q = 0; q < PQ_L2_SIZE; ++q) {
 290                                         reached_end +=
 291                                                 vm_swapcache_writing(
 292                                                         &page_marker[q],
 293                                                         count,
 294                                                         scount);
 295                                 }
 296                         }
 297                         if (vm_swapcache_curburst <= 0)
 298                                 burst = SWAPB_RECOVERING;
 299                 } else if (vm_swapcache_curburst > vm_swapcache_minburst) {
 300                         if (vm_swapcache_writing_heuristic()) {
 301                                 for (q = 0; q < PQ_L2_SIZE; ++q) {
 302                                         reached_end +=
 303                                                 vm_swapcache_writing(
 304                                                         &page_marker[q],
 305                                                         count,
 306                                                         scount);
 307                                 }
 308                         }
 309                         burst = SWAPB_BURSTING;
 310                 }
 311                 if (reached_end == PQ_L2_SIZE) {
 312                         vm_swapcache_inactive_heuristic =
 313                                 -vm_swapcache_hysteresis;
 314                 }
 315         }
 316
 317         /*
 318          * Cleanup (NOT REACHED)
 319          */
 320         for (q = 0; q < PQ_L2_SIZE; ++q) {
 321                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
 322                 TAILQ_REMOVE(
 323                         &vm_page_queues[PQ_INACTIVE + q].pl,
 324                         &page_marker[q], pageq);
 325                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 326         }
 327
 328         lwkt_gettoken(&swindex->token);
 329         TAILQ_REMOVE(&swindex->list, &swmarker, object_list);
 330         lwkt_reltoken(&swindex->token);
 331 }
 332
 333 static struct kproc_desc swpc_kp = {
 334         "swapcached",
 335         vm_swapcached_thread,
 336         &swapcached_thread
 337 };
 338 SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp);
 339
 340 /*
 341  * Deal with an overflow of the heuristic counter or if the user
 342  * manually changes the hysteresis.
 343  *
 344  * Try to avoid small incremental pageouts by waiting for enough
 345  * pages to buildup in the inactive queue to hopefully get a good
 346  * burst in.  This heuristic is bumped by the VM system and reset
 347  * when our scan hits the end of the queue.
 348  *
 349  * Return TRUE if we need to take a writing pass.
 350  */
 351 static int
 352 vm_swapcache_writing_heuristic(void)
 353 {
 354         int hyst;
 355
 356         hyst = vmstats.v_inactive_count / 4;
 357         if (hyst < vm_swapcache_min_hysteresis)
 358                 hyst = vm_swapcache_min_hysteresis;
 359         cpu_ccfence();
 360         vm_swapcache_hysteresis = hyst;
 361
 362         if (vm_swapcache_inactive_heuristic < -hyst)
 363                 vm_swapcache_inactive_heuristic = -hyst;
 364
 365         return (vm_swapcache_inactive_heuristic >= 0);
 366 }
 367
 368 /*
 369  * Take a writing pass on one of the inactive queues, return non-zero if
 370  * we hit the end of the queue.
 371  */
 372 static int
 373 vm_swapcache_writing(vm_page_t marker, int count, int scount)
 374 {
 375         vm_object_t object;
 376         struct vnode *vp;
 377         vm_page_t m;
 378         int isblkdev;
 379
 380         /*
 381          * Scan the inactive queue from our marker to locate
 382          * suitable pages to push to the swap cache.
 383          *
 384          * We are looking for clean vnode-backed pages.
 385          */
 386         vm_page_queues_spin_lock(marker->queue);
 387         while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
 388                count > 0 && scount-- > 0) {
 389                 KKASSERT(m->queue == marker->queue);
 390
 391                 /*
 392                  * Stop using swap if paniced, dumping, or dumped.
 393                  * Don't try to write if our curburst has been exhausted.
 394                  */
 395                 if (panicstr || dumping)
 396                         break;
 397                 if (vm_swapcache_curburst < 0)
 398                         break;
 399
 400                 /*
 401                  * Move marker
 402                  */
 403                 TAILQ_REMOVE(
 404                         &vm_page_queues[marker->queue].pl, marker, pageq);
 405                 TAILQ_INSERT_AFTER(
 406                         &vm_page_queues[marker->queue].pl, m, marker, pageq);
 407
 408                 /*
 409                  * Ignore markers and ignore pages that already have a swap
 410                  * assignment.
 411                  */
 412                 if (m->flags & (PG_MARKER | PG_SWAPPED))
 413                         continue;
 414                 if (vm_page_busy_try(m, TRUE))
 415                         continue;
 416                 vm_page_queues_spin_unlock(marker->queue);
 417
 418                 if ((object = m->object) == NULL) {
 419                         vm_page_wakeup(m);
 420                         vm_page_queues_spin_lock(marker->queue);
 421                         continue;
 422                 }
 423                 vm_object_hold(object);
 424                 if (m->object != object) {
 425                         vm_object_drop(object);
 426                         vm_page_wakeup(m);
 427                         vm_page_queues_spin_lock(marker->queue);
 428                         continue;
 429                 }
 430                 if (vm_swapcache_test(m)) {
 431                         vm_object_drop(object);
 432                         vm_page_wakeup(m);
 433                         vm_page_queues_spin_lock(marker->queue);
 434                         continue;
 435                 }
 436
 437                 vp = object->handle;
 438                 if (vp == NULL) {
 439                         vm_object_drop(object);
 440                         vm_page_wakeup(m);
 441                         vm_page_queues_spin_lock(marker->queue);
 442                         continue;
 443                 }
 444
 445                 switch(vp->v_type) {
 446                 case VREG:
 447                         /*
 448                          * PG_NOTMETA generically means 'don't swapcache this',
 449                          * and HAMMER will set this for regular data buffers
 450                          * (and leave it unset for meta-data buffers) as
 451                          * appropriate when double buffering is enabled.
 452                          */
 453                         if (m->flags & PG_NOTMETA) {
 454                                 vm_object_drop(object);
 455                                 vm_page_wakeup(m);
 456                                 vm_page_queues_spin_lock(marker->queue);
 457                                 continue;
 458                         }
 459
 460                         /*
 461                          * If data_enable is 0 do not try to swapcache data.
 462                          * If use_chflags is set then only swapcache data for
 463                          * VSWAPCACHE marked vnodes, otherwise any vnode.
 464                          */
 465                         if (vm_swapcache_data_enable == 0 ||
 466                             ((vp->v_flag & VSWAPCACHE) == 0 &&
 467                              vm_swapcache_use_chflags)) {
 468                                 vm_object_drop(object);
 469                                 vm_page_wakeup(m);
 470                                 vm_page_queues_spin_lock(marker->queue);
 471                                 continue;
 472                         }
 473                         if (vm_swapcache_maxfilesize &&
 474                             object->size >
 475                             (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
 476                                 vm_object_drop(object);
 477                                 vm_page_wakeup(m);
 478                                 vm_page_queues_spin_lock(marker->queue);
 479                                 continue;
 480                         }
 481                         isblkdev = 0;
 482                         break;
 483                 case VCHR:
 484                         /*
 485                          * PG_NOTMETA generically means 'don't swapcache this',
 486                          * and HAMMER will set this for regular data buffers
 487                          * (and leave it unset for meta-data buffers) as
 488                          * appropriate when double buffering is enabled.
 489                          */
 490                         if (m->flags & PG_NOTMETA) {
 491                                 vm_object_drop(object);
 492                                 vm_page_wakeup(m);
 493                                 vm_page_queues_spin_lock(marker->queue);
 494                                 continue;
 495                         }
 496                         if (vm_swapcache_meta_enable == 0) {
 497                                 vm_object_drop(object);
 498                                 vm_page_wakeup(m);
 499                                 vm_page_queues_spin_lock(marker->queue);
 500                                 continue;
 501                         }
 502                         isblkdev = 1;
 503                         break;
 504                 default:
 505                         vm_object_drop(object);
 506                         vm_page_wakeup(m);
 507                         vm_page_queues_spin_lock(marker->queue);
 508                         continue;
 509                 }
 510
 511
 512                 /*
 513                  * Assign swap and initiate I/O.
 514                  *
 515                  * (adjust for the --count which also occurs in the loop)
 516                  */
 517                 count -= vm_swapcached_flush(m, isblkdev);
 518
 519                 /*
 520                  * Setup for next loop using marker.
 521                  */
 522                 vm_object_drop(object);
 523                 vm_page_queues_spin_lock(marker->queue);
 524         }
 525
 526         /*
 527          * The marker could wind up at the end, which is ok.  If we hit the
 528          * end of the list adjust the heuristic.
 529          *
 530          * Earlier inactive pages that were dirty and become clean
 531          * are typically moved to the end of PQ_INACTIVE by virtue
 532          * of vfs_vmio_release() when they become unwired from the
 533          * buffer cache.
 534          */
 535         vm_page_queues_spin_unlock(marker->queue);
 536
 537         /*
 538          * m invalid but can be used to test for NULL
 539          */
 540         return (m == NULL);
 541 }
 542
 543 /*
 544  * Flush the specified page using the swap_pager.  The page
 545  * must be busied by the caller and its disposition will become
 546  * the responsibility of this function.
 547  *
 548  * Try to collect surrounding pages, including pages which may
 549  * have already been assigned swap.  Try to cluster within a
 550  * contiguous aligned SMAP_META_PAGES (typ 16 x PAGE_SIZE) block
 551  * to match what swap_pager_putpages() can do.
 552  *
 553  * We also want to try to match against the buffer cache blocksize
 554  * but we don't really know what it is here.  Since the buffer cache
 555  * wires and unwires pages in groups the fact that we skip wired pages
 556  * should be sufficient.
 557  *
 558  * Returns a count of pages we might have flushed (minimum 1)
 559  */
 560 static
 561 int
 562 vm_swapcached_flush(vm_page_t m, int isblkdev)
 563 {
 564         vm_object_t object;
 565         vm_page_t marray[SWAP_META_PAGES];
 566         vm_pindex_t basei;
 567         int rtvals[SWAP_META_PAGES];
 568         int x;
 569         int i;
 570         int j;
 571         int count;
 572         int error;
 573
 574         vm_page_io_start(m);
 575         vm_page_protect(m, VM_PROT_READ);
 576         object = m->object;
 577         vm_object_hold(object);
 578
 579         /*
 580          * Try to cluster around (m), keeping in mind that the swap pager
 581          * can only do SMAP_META_PAGES worth of continguous write.
 582          */
 583         x = (int)m->pindex & SWAP_META_MASK;
 584         marray[x] = m;
 585         basei = m->pindex;
 586         vm_page_wakeup(m);
 587
 588         for (i = x - 1; i >= 0; --i) {
 589                 m = vm_page_lookup_busy_try(object, basei - x + i,
 590                                             TRUE, &error);
 591                 if (error || m == NULL)
 592                         break;
 593                 if (vm_swapcache_test(m)) {
 594                         vm_page_wakeup(m);
 595                         break;
 596                 }
 597                 if (isblkdev && (m->flags & PG_NOTMETA)) {
 598                         vm_page_wakeup(m);
 599                         break;
 600                 }
 601                 vm_page_io_start(m);
 602                 vm_page_protect(m, VM_PROT_READ);
 603                 if (m->queue - m->pc == PQ_CACHE) {
 604                         vm_page_unqueue_nowakeup(m);
 605                         vm_page_deactivate(m);
 606                 }
 607                 marray[i] = m;
 608                 vm_page_wakeup(m);
 609         }
 610         ++i;
 611
 612         for (j = x + 1; j < SWAP_META_PAGES; ++j) {
 613                 m = vm_page_lookup_busy_try(object, basei - x + j,
 614                                             TRUE, &error);
 615                 if (error || m == NULL)
 616                         break;
 617                 if (vm_swapcache_test(m)) {
 618                         vm_page_wakeup(m);
 619                         break;
 620                 }
 621                 if (isblkdev && (m->flags & PG_NOTMETA)) {
 622                         vm_page_wakeup(m);
 623                         break;
 624                 }
 625                 vm_page_io_start(m);
 626                 vm_page_protect(m, VM_PROT_READ);
 627                 if (m->queue - m->pc == PQ_CACHE) {
 628                         vm_page_unqueue_nowakeup(m);
 629                         vm_page_deactivate(m);
 630                 }
 631                 marray[j] = m;
 632                 vm_page_wakeup(m);
 633         }
 634
 635         count = j - i;
 636         vm_object_pip_add(object, count);
 637         swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
 638         vm_swapcache_write_count += count * PAGE_SIZE;
 639         vm_swapcache_curburst -= count * PAGE_SIZE;
 640
 641         while (i < j) {
 642                 if (rtvals[i] != VM_PAGER_PEND) {
 643                         vm_page_busy_wait(marray[i], FALSE, "swppgfd");
 644                         vm_page_io_finish(marray[i]);
 645                         vm_page_wakeup(marray[i]);
 646                         vm_object_pip_wakeup(object);
 647                 }
 648                 ++i;
 649         }
 650         vm_object_drop(object);
 651         return(count);
 652 }
 653
 654 /*
 655  * Test whether a VM page is suitable for writing to the swapcache.
 656  * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 657  *
 658  * Returns 0 on success, 1 on failure
 659  */
 660 static int
 661 vm_swapcache_test(vm_page_t m)
 662 {
 663         vm_object_t object;
 664
 665         if (m->flags & PG_UNMANAGED)
 666                 return(1);
 667         if (m->hold_count || m->wire_count)
 668                 return(1);
 669         if (m->valid != VM_PAGE_BITS_ALL)
 670                 return(1);
 671         if (m->dirty & m->valid)
 672                 return(1);
 673         if ((object = m->object) == NULL)
 674                 return(1);
 675         if (object->type != OBJT_VNODE ||
 676             (object->flags & OBJ_DEAD)) {
 677                 return(1);
 678         }
 679         vm_page_test_dirty(m);
 680         if (m->dirty & m->valid)
 681                 return(1);
 682         return(0);
 683 }
 684
 685 /*
 686  * Cleaning pass.
 687  *
 688  * We clean whole objects up to 16MB
 689  */
 690 static
 691 void
 692 vm_swapcache_cleaning(vm_object_t marker, struct vm_object_hash **swindexp)
 693 {
 694         vm_object_t object;
 695         struct vnode *vp;
 696         int count;
 697         int scount;
 698         int n;
 699         int didmove;
 700
 701         count = vm_swapcache_maxlaunder;
 702         scount = vm_swapcache_maxscan;
 703
 704         /*
 705          * Look for vnode objects
 706          */
 707         lwkt_gettoken(&(*swindexp)->token);
 708
 709         didmove = 0;
 710 outerloop:
 711         while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
 712                 /*
 713                  * We have to skip markers.  We cannot hold/drop marker
 714                  * objects!
 715                  */
 716                 if (object->type == OBJT_MARKER) {
 717                         vm_swapcache_movemarker(marker, *swindexp, object);
 718                         didmove = 1;
 719                         continue;
 720                 }
 721
 722                 /*
 723                  * Safety, or in case there are millions of VM objects
 724                  * without swapcache backing.
 725                  */
 726                 if (--scount <= 0)
 727                         goto breakout;
 728
 729                 /*
 730                  * We must hold the object before potentially yielding.
 731                  */
 732                 vm_object_hold(object);
 733                 lwkt_yield();
 734
 735                 /*
 736                  * Only operate on live VNODE objects that are either
 737                  * VREG or VCHR (VCHR for meta-data).
 738                  */
 739                 if ((object->type != OBJT_VNODE) ||
 740                     ((object->flags & OBJ_DEAD) ||
 741                      object->swblock_count == 0) ||
 742                     ((vp = object->handle) == NULL) ||
 743                     (vp->v_type != VREG && vp->v_type != VCHR)) {
 744                         vm_object_drop(object);
 745                         /* object may be invalid now */
 746                         vm_swapcache_movemarker(marker, *swindexp, object);
 747                         didmove = 1;
 748                         continue;
 749                 }
 750
 751                 /*
 752                  * Reset the object pindex stored in the marker if the
 753                  * working object has changed.
 754                  */
 755                 if (marker->backing_object != object || didmove) {
 756                         marker->size = 0;
 757                         marker->backing_object_offset = 0;
 758                         marker->backing_object = object;
 759                         didmove = 0;
 760                 }
 761
 762                 /*
 763                  * Look for swblocks starting at our iterator.
 764                  *
 765                  * The swap_pager_condfree() function attempts to free
 766                  * swap space starting at the specified index.  The index
 767                  * will be updated on return.  The function will return
 768                  * a scan factor (NOT the number of blocks freed).
 769                  *
 770                  * If it must cut its scan of the object short due to an
 771                  * excessive number of swblocks, or is able to free the
 772                  * requested number of blocks, it will return n >= count
 773                  * and we break and pick it back up on a future attempt.
 774                  *
 775                  * Scan the object linearly and try to batch large sets of
 776                  * blocks that are likely to clean out entire swap radix
 777                  * tree leafs.
 778                  */
 779                 lwkt_token_swap();
 780                 lwkt_reltoken(&(*swindexp)->token);
 781
 782                 n = swap_pager_condfree(object, &marker->size,
 783                                     (count + SWAP_META_MASK) & ~SWAP_META_MASK);
 784
 785                 vm_object_drop(object);         /* object may be invalid now */
 786                 lwkt_gettoken(&(*swindexp)->token);
 787
 788                 /*
 789                  * If we have exhausted the object or deleted our per-pass
 790                  * page limit then move us to the next object.  Note that
 791                  * the current object may no longer be on the vm_object_list.
 792                  */
 793                 if (n <= 0 ||
 794                     marker->backing_object_offset > vm_swapcache_cleanperobj) {
 795                         vm_swapcache_movemarker(marker, *swindexp, object);
 796                         didmove = 1;
 797                 }
 798
 799                 /*
 800                  * If we have exhausted our max-launder stop for now.
 801                  */
 802                 count -= n;
 803                 marker->backing_object_offset += n * PAGE_SIZE;
 804                 if (count < 0)
 805                         goto breakout;
 806         }
 807
 808         /*
 809          * Iterate vm_object_lists[] hash table
 810          */
 811         TAILQ_REMOVE(&(*swindexp)->list, marker, object_list);
 812         lwkt_reltoken(&(*swindexp)->token);
 813         if (++*swindexp >= &vm_object_hash[VMOBJ_HSIZE])
 814                 *swindexp = &vm_object_hash[0];
 815         lwkt_gettoken(&(*swindexp)->token);
 816         TAILQ_INSERT_HEAD(&(*swindexp)->list, marker, object_list);
 817
 818         if (*swindexp != &vm_object_hash[0])
 819                 goto outerloop;
 820
 821 breakout:
 822         lwkt_reltoken(&(*swindexp)->token);
 823 }
 824
 825 /*
 826  * Move the marker past the current object.  Object can be stale, but we
 827  * still need it to determine if the marker has to be moved.  If the object
 828  * is still the 'current object' (object after the marker), we hop-scotch
 829  * the marker past it.
 830  */
 831 static void
 832 vm_swapcache_movemarker(vm_object_t marker, struct vm_object_hash *swindex,
 833                         vm_object_t object)
 834 {
 835         if (TAILQ_NEXT(marker, object_list) == object) {
 836                 TAILQ_REMOVE(&swindex->list, marker, object_list);
 837                 TAILQ_INSERT_AFTER(&swindex->list, object, marker, object_list);
 838         }
 839 }