sys/vm/vm_pageout.c
1 /*
2  * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Copyright (c) 1991 Regents of the University of California.
35  * All rights reserved.
36  * Copyright (c) 1994 John S. Dyson
37  * All rights reserved.
38  * Copyright (c) 1994 David Greenman
39  * All rights reserved.
40  *
41  * This code is derived from software contributed to Berkeley by
42  * The Mach Operating System project at Carnegie-Mellon University.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  * 1. Redistributions of source code must retain the above copyright
48  *    notice, this list of conditions and the following disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  * 3. Neither the name of the University nor the names of its contributors
53  *    may be used to endorse or promote products derived from this software
54  *    without specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66  * SUCH DAMAGE.
67  *
68  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
69  *
70  *
71  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96
97 /*
98  * The proverbial page-out daemon, rewritten many times over the decades.
99  */
100
101 #include "opt_vm.h"
102 #include <sys/param.h>
103 #include <sys/systm.h>
104 #include <sys/kernel.h>
105 #include <sys/proc.h>
106 #include <sys/kthread.h>
107 #include <sys/resourcevar.h>
108 #include <sys/signalvar.h>
109 #include <sys/vnode.h>
110 #include <sys/malloc.h>
111 #include <sys/vmmeter.h>
112 #include <sys/conf.h>
113 #include <sys/sysctl.h>
114
115 #include <vm/vm.h>
116 #include <vm/vm_param.h>
117 #include <sys/lock.h>
118 #include <vm/vm_object.h>
119 #include <vm/vm_page.h>
120 #include <vm/vm_map.h>
121 #include <vm/vm_pageout.h>
122 #include <vm/vm_pager.h>
123 #include <vm/swap_pager.h>
124 #include <vm/vm_extern.h>
125
126 #include <sys/spinlock2.h>
127 #include <vm/vm_page2.h>
128
129 /*
130  * Persistent markers held by pageout daemon (array)
131  */
132 struct markers {
133         struct vm_page  hold;
134         struct vm_page  stat;
135         struct vm_page  pact;
136 };
137
138 /*
139  * System initialization
140  */
141
142 /* the kernel process "vm_pageout"*/
143 static int vm_pageout_page(vm_page_t m, long *max_launderp,
144                            long *vnodes_skippedp, struct vnode **vpfailedp,
145                            int pass, int vmflush_flags, long *counts);
146 static int vm_pageout_clean_helper (vm_page_t, int);
147 static void vm_pageout_free_page_calc (vm_size_t count);
148 static void vm_pageout_page_free(vm_page_t m);
149 __read_frequently struct thread *emergpager;
150 __read_frequently struct thread *pagethread;
151 static int sequence_emerg_pager;
152
153 #if !defined(NO_SWAPPING)
154 /* the kernel process "vm_daemon"*/
155 static void vm_daemon (void);
156 static struct   thread *vmthread;
157
158 static struct kproc_desc vm_kp = {
159         "vmdaemon",
160         vm_daemon,
161         &vmthread
162 };
163 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
164 #endif
165
166 __read_mostly int vm_pages_needed = 0;  /* pageout daemon tsleep event */
167 __read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */
168 __read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
169 __read_mostly int vm_page_free_hysteresis = 16;
170 __read_mostly static time_t vm_pagedaemon_uptime;
171
172 #if !defined(NO_SWAPPING)
173 static int vm_daemon_needed;
174 #endif
175 __read_mostly static int vm_queue_idle_perc = 20;
176 __read_mostly static int vm_max_launder = 0;
177 __read_mostly static int vm_emerg_launder = 100;
178 __read_mostly static int vm_pageout_stats_actcmp = 0;
179 __read_mostly static int vm_pageout_stats_inamin = 16;
180 __read_mostly static int vm_pageout_stats_inalim = 4096;
181 __read_mostly static int vm_pageout_stats_scan = 0;
182 __read_mostly static int vm_pageout_stats_ticks = 0;
183 __read_mostly static int vm_pageout_algorithm = 0;
184 __read_mostly static int defer_swap_pageouts = 0;
185 __read_mostly static int disable_swap_pageouts = 0;
186 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
187 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
188 __read_mostly static int vm_pageout_debug;
189 __read_mostly static long vm_pageout_stats_rsecs = 300;
190
191 #if defined(NO_SWAPPING)
192 __read_mostly static int vm_swap_enabled=0;
193 #else
194 __read_mostly static int vm_swap_enabled=1;
195 #endif
196
197 /* 0-disable, 1-passive, 2-active swp, 3-active swp + single-queue dirty pages */
198 __read_mostly int vm_pageout_memuse_mode=2;
199 __read_mostly int vm_pageout_allow_active=1;
200
201 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
202         CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
203
204 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
205         CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
206
207 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
208         CTLFLAG_RW, &vm_page_free_hysteresis, 0,
209         "Free more pages than the minimum required");
210
211 SYSCTL_INT(_vm, OID_AUTO, queue_idle_perc,
212         CTLFLAG_RW, &vm_queue_idle_perc, 0, "page stats stop point, percent");
213
214 SYSCTL_INT(_vm, OID_AUTO, max_launder,
215         CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
216 SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
217         CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");
218
219 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_actcmp,
220         CTLFLAG_RW, &vm_pageout_stats_actcmp, 0,
221         "Current dynamic act_count comparator");
222 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inamin,
223         CTLFLAG_RW, &vm_pageout_stats_inamin, 0,
224         "min out of lim tests must match");
225 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inalim,
226         CTLFLAG_RW, &vm_pageout_stats_inalim, 0,
227         "lim for the min-out-of-lim tests");
228 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_ticks,
229         CTLFLAG_RW, &vm_pageout_stats_ticks, 0,
230         "Interval for partial stats scan");
231 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_scan,
232         CTLFLAG_RW, &vm_pageout_stats_scan, 0,
233         "hold/ACT scan count per interval");
234 SYSCTL_LONG(_vm, OID_AUTO, pageout_stats_rsecs,
235         CTLFLAG_RW, &vm_pageout_stats_rsecs, 0,
236         "Stats scan reset interval (seconds)");
237
238 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
239         CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
240 SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
241         CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
242 SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
243         CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");
244
245
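/*
 * Example (illustrative only): the knobs above are normal sysctl(8)
 * variables and can be tuned at runtime, e.g.
 *
 *	sysctl vm.max_launder=8192
 *	sysctl vm.pageout_memuse_mode=2
 *	sysctl vm.pageout_allow_active=1
 */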
246 #if defined(NO_SWAPPING)
247 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
248         CTLFLAG_RD, &vm_swap_enabled, 0, "");
249 #else
250 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
251         CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
252 #endif
253
254 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
255         CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
256
257 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
258         CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
259
260 static int pageout_lock_miss;
261 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
262         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
263
264 int vm_page_max_wired;          /* XXX max # of wired pages system-wide */
265
266 static MALLOC_DEFINE(M_PAGEOUT, "pageout", "Pageout structures");
267
268 #if !defined(NO_SWAPPING)
269 static void vm_req_vmdaemon (void);
270 #endif
271
272 #define MAXSCAN_DIVIDER         10
273
274 #define VM_CACHE_SCAN_MIN       16
275 #define VM_CACHE_SCAN_NOM       (VM_CACHE_SCAN_MIN * 4)
276
277 /*
278  * Calculate approximately how many pages on each queue to try to
279  * clean.  An exact calculation creates an edge condition when the
280  * queues are unbalanced so add significant slop.  The queue scans
281  * will stop early when targets are reached and will start where they
282  * left off on the next pass.
283  *
284  * We need to be generous here because there are all sorts of loading
285  * conditions that can cause edge cases if we try to average over all queues.
286  * In particular, storage subsystems have become so fast that paging
287  * activity can become quite frantic.  Eventually we will probably need
288  * two paging threads, one for dirty pages and one for clean, to deal
289  * with the bandwidth requirements.
290  *
291  * So what we do is calculate a value that can be satisfied nominally by
292  * only having to scan half the queues.
293  */
294 static __inline long
295 PQAVERAGE(long n)
296 {
297         long avg;
298
299         if (n >= 0) {
300                 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
301         } else {
302                 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
303         }
304         return avg;
305 }
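/*
 * Worked example (PQ_L2_SIZE value assumed for illustration): with
 * PQ_L2_SIZE = 64 and a shortage of n = 1000 pages,
 *
 *	PQAVERAGE(1000) = (1000 + 63) / 32 + 1 = 34
 *
 * so each of the 64 sub-queues is asked for ~34 pages, about 2176 in
 * total, i.e. roughly twice the shortage.  Scanning about half the
 * sub-queues is then nominally enough to meet the target.
 */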
306
307 /*
308  * vm_pageout_clean_helper:
309  *
310  * Clean the page and remove it from the laundry.  The page must be busied
311  * by the caller and will be disposed of (put away, flushed) by this routine.
312  */
313 static int
314 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
315 {
316         vm_object_t object;
317         vm_page_t mc[BLIST_MAX_ALLOC];
318         int error;
319         int ib, is, page_base;
320         vm_pindex_t pindex = m->pindex;
321
322         object = m->object;
323
324         /*
325          * Don't mess with the page if it's held or special.  Theoretically
326          * we can pageout held pages but there is no real need to press our
327          * luck, so don't.
328          */
329         if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
330                 vm_page_wakeup(m);
331                 return 0;
332         }
333
334         /*
335          * Place page in cluster.  Align cluster for optimal swap space
336          * allocation (whether it is swap or not).  This is typically ~16-32
337          * pages, which also tends to align the cluster to multiples of the
338          * filesystem block size if backed by a filesystem.
339          */
340         page_base = pindex % BLIST_MAX_ALLOC;
341         mc[page_base] = m;
342         ib = page_base - 1;
343         is = page_base + 1;
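	/*
	 * Alignment example (numbers assumed for illustration): if
	 * BLIST_MAX_ALLOC were 32 and pindex were 70, page_base would be
	 * 70 % 32 = 6, so the cluster slots span pindex 64..95 and any
	 * flush stays aligned to a 32-page swap allocation block.
	 */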
344
345         /*
346          * Scan object for clusterable pages.
347          *
348          * We can cluster ONLY if: ->> the page is NOT
349          * clean, wired, busy, held, or mapped into a
350          * buffer, and one of the following:
351          * 1) The page is inactive, or a seldom used
352          *    active page.
353          * -or-
354          * 2) we force the issue.
355          *
356          * During heavy mmap/modification loads the pageout
357          * daemon can really fragment the underlying file
358          * due to flushing pages out of order and not trying to
359          * align the clusters (which leaves sporadic out-of-order
360          * holes).  To solve this problem we do the reverse scan
361          * first and attempt to align our cluster, then do a
362          * forward scan if room remains.
363          */
364         vm_object_hold(object);
365
366         while (ib >= 0) {
367                 vm_page_t p;
368
369                 p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
370                                             TRUE, &error);
371                 if (error || p == NULL)
372                         break;
373                 if ((p->queue - p->pc) == PQ_CACHE ||
374                     (p->flags & PG_UNQUEUED)) {
375                         vm_page_wakeup(p);
376                         break;
377                 }
378                 vm_page_test_dirty(p);
379                 if (((p->dirty & p->valid) == 0 &&
380                      (p->flags & PG_NEED_COMMIT) == 0) ||
381                     p->wire_count != 0 ||       /* may be held by buf cache */
382                     p->hold_count != 0) {       /* may be undergoing I/O */
383                         vm_page_wakeup(p);
384                         break;
385                 }
386                 if (p->queue - p->pc != PQ_INACTIVE) {
387                         if (p->queue - p->pc != PQ_ACTIVE ||
388                             (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
389                                 vm_page_wakeup(p);
390                                 break;
391                         }
392                 }
393
394                 /*
395                  * Try to maintain page groupings in the cluster.
396                  */
397                 if (m->flags & PG_WINATCFLS)
398                         vm_page_flag_set(p, PG_WINATCFLS);
399                 else
400                         vm_page_flag_clear(p, PG_WINATCFLS);
401                 p->act_count = m->act_count;
402
403                 mc[ib] = p;
404                 --ib;
405         }
406         ++ib;   /* fixup */
407
408         while (is < BLIST_MAX_ALLOC &&
409                pindex - page_base + is < object->size) {
410                 vm_page_t p;
411
412                 p = vm_page_lookup_busy_try(object, pindex - page_base + is,
413                                             TRUE, &error);
414                 if (error || p == NULL)
415                         break;
416                 if (((p->queue - p->pc) == PQ_CACHE) ||
417                     (p->flags & PG_UNQUEUED)) {
418                         vm_page_wakeup(p);
419                         break;
420                 }
421                 vm_page_test_dirty(p);
422                 if (((p->dirty & p->valid) == 0 &&
423                      (p->flags & PG_NEED_COMMIT) == 0) ||
424                     p->wire_count != 0 ||       /* may be held by buf cache */
425                     p->hold_count != 0) {       /* may be undergoing I/O */
426                         vm_page_wakeup(p);
427                         break;
428                 }
429                 if (p->queue - p->pc != PQ_INACTIVE) {
430                         if (p->queue - p->pc != PQ_ACTIVE ||
431                             (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
432                                 vm_page_wakeup(p);
433                                 break;
434                         }
435                 }
436
437                 /*
438                  * Try to maintain page groupings in the cluster.
439                  */
440                 if (m->flags & PG_WINATCFLS)
441                         vm_page_flag_set(p, PG_WINATCFLS);
442                 else
443                         vm_page_flag_clear(p, PG_WINATCFLS);
444                 p->act_count = m->act_count;
445
446                 mc[is] = p;
447                 ++is;
448         }
449
450         vm_object_drop(object);
451
452         /*
453          * we allow reads during pageouts...
454          */
455         return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
456 }
457
458 /*
459  * vm_pageout_flush() - launder the given pages
460  *
461  *      The given pages are laundered.  Note that we set up for the start of
462  *      I/O (i.e. busy the page), mark it read-only, and bump the object
463  *      reference count all in here rather than in the parent.  If we want
464  *      the parent to do more sophisticated things we may have to change
465  *      the ordering.
466  *
467  *      The pages in the array must be busied by the caller and will be
468  *      unbusied by this function.
469  */
470 int
471 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
472 {
473         vm_object_t object;
474         int pageout_status[count];
475         int numpagedout = 0;
476         int i;
477
478         /*
479          * Initiate I/O.  Bump the vm_page_t->busy counter.
480          */
481         for (i = 0; i < count; i++) {
482                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
483                         ("vm_pageout_flush page %p index %d/%d: partially "
484                          "invalid page", mc[i], i, count));
485                 vm_page_io_start(mc[i]);
486         }
487
488         /*
489          * We must make the pages read-only.  This will also force the
490          * modified bit in the related pmaps to be cleared.  The pager
491          * cannot clear the bit for us since the I/O completion code
492          * typically runs from an interrupt.  The act of making the page
493          * read-only handles the case for us.
494          *
495          * Then we can unbusy the pages; we still hold a reference by virtue
496          * of our soft-busy.
497          */
498         for (i = 0; i < count; i++) {
499                 if (vmflush_flags & OBJPC_TRY_TO_CACHE)
500                         vm_page_protect(mc[i], VM_PROT_NONE);
501                 else
502                         vm_page_protect(mc[i], VM_PROT_READ);
503                 vm_page_wakeup(mc[i]);
504         }
505
506         object = mc[0]->object;
507         vm_object_pip_add(object, count);
508
509         vm_pager_put_pages(object, mc, count,
510                            (vmflush_flags |
511                             ((object == kernel_object) ?  OBJPC_SYNC : 0)),
512                            pageout_status);
513
514         for (i = 0; i < count; i++) {
515                 vm_page_t mt = mc[i];
516
517                 switch (pageout_status[i]) {
518                 case VM_PAGER_OK:
519                         numpagedout++;
520                         break;
521                 case VM_PAGER_PEND:
522                         numpagedout++;
523                         break;
524                 case VM_PAGER_BAD:
525                         /*
526                          * Page outside of range of object. Right now we
527                          * essentially lose the changes by pretending it
528                          * worked.
529                          */
530                         vm_page_busy_wait(mt, FALSE, "pgbad");
531                         pmap_clear_modify(mt);
532                         vm_page_undirty(mt);
533                         vm_page_wakeup(mt);
534                         break;
535                 case VM_PAGER_ERROR:
536                 case VM_PAGER_FAIL:
537                         /*
538                          * A page typically cannot be paged out when we
539                          * have run out of swap.  We leave the page
540                          * marked inactive and will try to page it out
541                          * again later.
542                          *
543                          * Starvation of the active page list is used to
544                          * determine when the system is massively memory
545                          * starved.
546                          */
547                         break;
548                 case VM_PAGER_AGAIN:
549                         break;
550                 }
551
552                 /*
553                  * If not PENDing this was a synchronous operation and we
554                  * clean up after the I/O.  If it is PENDing the mess is
555                  * cleaned up asynchronously.
556                  *
557                  * Also nominally act on the caller's wishes if the caller
558                  * wants to try to really clean (cache or free) the page.
559                  *
560                  * Also nominally deactivate the page if the system is
561                  * memory-stressed.
562                  */
563                 if (pageout_status[i] != VM_PAGER_PEND) {
564                         vm_page_busy_wait(mt, FALSE, "pgouw");
565                         vm_page_io_finish(mt);
566                         if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
567                                 vm_page_try_to_cache(mt);
568                         } else if (vm_paging_severe()) {
569                                 vm_page_deactivate(mt);
570                                 vm_page_wakeup(mt);
571                         } else {
572                                 vm_page_wakeup(mt);
573                         }
574                         vm_object_pip_wakeup(object);
575                 }
576         }
577         return numpagedout;
578 }
579
580 #if !defined(NO_SWAPPING)
581
582 /*
583  * Callback function, page busied for us.  We must dispose of the busy
584  * condition.  Any related pmap pages may be held but will not be locked.
585  */
586 static
587 int
588 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
589                         vm_page_t p)
590 {
591         int actcount;
592         int cleanit = 0;
593
594         /*
595          * Basic tests - There should never be a marker, and we can stop
596          *               once the RSS is below the required level.
597          */
598         KKASSERT((p->flags & PG_MARKER) == 0);
599         if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
600                 vm_page_wakeup(p);
601                 return(-1);
602         }
603
604         mycpu->gd_cnt.v_pdpages++;
605
606         if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
607                 vm_page_wakeup(p);
608                 goto done;
609         }
610
611         ++info->actioncount;
612
613         /*
614          * Check if the page has been referenced recently.  If it has,
615          * activate it and skip.
616          */
617         actcount = pmap_ts_referenced(p);
618         if (actcount) {
619                 vm_page_flag_set(p, PG_REFERENCED);
620         } else if (p->flags & PG_REFERENCED) {
621                 actcount = 1;
622         }
623
624         if (actcount) {
625                 if (p->queue - p->pc != PQ_ACTIVE) {
626                         vm_page_and_queue_spin_lock(p);
627                         if (p->queue - p->pc != PQ_ACTIVE) {
628                                 vm_page_and_queue_spin_unlock(p);
629                                 vm_page_activate(p);
630                         } else {
631                                 vm_page_and_queue_spin_unlock(p);
632                         }
633                 } else {
634                         p->act_count += actcount;
635                         if (p->act_count > ACT_MAX)
636                                 p->act_count = ACT_MAX;
637                 }
638                 vm_page_flag_clear(p, PG_REFERENCED);
639                 vm_page_wakeup(p);
640                 goto done;
641         }
642
643         /*
644          * Remove the page from this particular pmap.  Once we do this, our
645          * pmap scans will not see it again (unless it gets faulted in), so
646          * we must actively dispose of or deal with the page.
647          */
648         pmap_remove_specific(info->pmap, p);
649
650         /*
651          * If the page is not mapped to another process (i.e. as would be
652          * typical if this were a shared page from a library) then deactivate
653          * the page and clean it in two passes only.
654          *
655          * If the page hasn't been referenced since the last check, remove it
656          * from the pmap.  If it is no longer mapped, deactivate it
657          * immediately, accelerating the normal decline.
658          *
659          * Once the page has been removed from the pmap the RSS code no
660          * longer tracks it so we have to make sure that it is staged for
661          * potential flush action.
662          *
663          * XXX
664          */
665         if ((p->flags & PG_MAPPED) == 0 ||
666             (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
667                 if (p->queue - p->pc == PQ_ACTIVE) {
668                         vm_page_deactivate(p);
669                 }
670                 if (p->queue - p->pc == PQ_INACTIVE) {
671                         cleanit = 1;
672                 }
673         }
674
675         /*
676          * Ok, try to fully clean the page and any nearby pages such that at
677          * least the requested page is freed or moved to the cache queue.
678          *
679          * We usually do this synchronously to allow us to get the page into
680          * the CACHE queue quickly, which will prevent memory exhaustion if
681          * a process with a memoryuse limit is running away.  However, the
682          * sysadmin may desire to set vm.swap_user_async which relaxes this
683          * and improves write performance.
684          */
685         if (cleanit) {
686                 long max_launder = 0x7FFF;
687                 long vnodes_skipped = 0;
688                 long counts[4] = { 0, 0, 0, 0 };
689                 int vmflush_flags;
690                 struct vnode *vpfailed = NULL;
691
692                 info->offset = va;
693
694                 if (vm_pageout_memuse_mode >= 2) {
695                         vmflush_flags = OBJPC_TRY_TO_CACHE |
696                                         OBJPC_ALLOW_ACTIVE;
697                         if (swap_user_async == 0)
698                                 vmflush_flags |= OBJPC_SYNC;
699                         vm_page_flag_set(p, PG_WINATCFLS);
700                         info->cleancount +=
701                                 vm_pageout_page(p, &max_launder,
702                                                 &vnodes_skipped,
703                                                 &vpfailed, 1, vmflush_flags,
704                                                 counts);
705                 } else {
706                         vm_page_wakeup(p);
707                         ++info->cleancount;
708                 }
709         } else {
710                 vm_page_wakeup(p);
711         }
712
713         /*
714          * Must be at end to avoid SMP races.
715          */
716 done:
717         lwkt_user_yield();
718         return 0;
719 }
720
721 /*
722  * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
723  * which is relatively difficult to do.  We try to keep track of where we
724  * left off last time to reduce scan overhead.
725  *
726  * Called when vm_pageout_memuse_mode is >= 1.
727  */
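/*
 * Illustrative use (details assumed): with vm.pageout_memuse_mode >= 1,
 * an RSS cap set from userland, e.g.
 *
 *	ulimit -m 262144	(RLIMIT_RSS of 256MB, in KB)
 *
 * lets the VM daemon detect a process exceeding its limit and call this
 * function to deactivate pages until the resident set fits again.
 */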
728 void
729 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
730 {
731         vm_offset_t pgout_offset;
732         struct pmap_pgscan_info info;
733         int retries = 3;
734
735         pgout_offset = map->pgout_offset;
736 again:
737 #if 0
738         kprintf("%016jx ", pgout_offset);
739 #endif
740         if (pgout_offset < VM_MIN_USER_ADDRESS)
741                 pgout_offset = VM_MIN_USER_ADDRESS;
742         if (pgout_offset >= VM_MAX_USER_ADDRESS)
743                 pgout_offset = 0;
744         info.pmap = vm_map_pmap(map);
745         info.limit = limit;
746         info.beg_addr = pgout_offset;
747         info.end_addr = VM_MAX_USER_ADDRESS;
748         info.callback = vm_pageout_mdp_callback;
749         info.cleancount = 0;
750         info.actioncount = 0;
751         info.busycount = 0;
752
753         pmap_pgscan(&info);
754         pgout_offset = info.offset;
755 #if 0
756         kprintf("%016jx %08lx %08lx\n", pgout_offset,
757                 info.cleancount, info.actioncount);
758 #endif
759
760         if (pgout_offset != VM_MAX_USER_ADDRESS &&
761             pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
762                 goto again;
763         } else if (retries &&
764                    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
765                 --retries;
766                 goto again;
767         }
768         map->pgout_offset = pgout_offset;
769 }
770 #endif
771
772 /*
773  * Called when the pageout scan wants to free a page.  We no longer
774  * try to cycle the vm_object here with a reference & dealloc, which can
775  * cause a non-trivial object collapse in a critical path.
776  *
777  * It is unclear why we cycled the ref_count in the past, perhaps to try
778  * to optimize shadow chain collapses but I don't quite see why it would
779  * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
780  * synchronously and not have to be kick-started.
781  */
782 static void
783 vm_pageout_page_free(vm_page_t m) 
784 {
785         vm_page_protect(m, VM_PROT_NONE);
786         vm_page_free(m);
787 }
788
789 /*
790  * vm_pageout_scan does the dirty work for the pageout daemon.
791  */
792 struct vm_pageout_scan_info {
793         struct proc *bigproc;
794         vm_offset_t bigsize;
795 };
796
797 static int vm_pageout_scan_callback(struct proc *p, void *data);
798
799 /*
800  * Scan inactive queue for pages we can cache or free.
801  *
802  * WARNING! Can be called from two pagedaemon threads simultaneously.
803  */
804 static int
805 vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
806                          long *vnodes_skipped, long *counts)
807 {
808         vm_page_t m;
809         struct vm_page marker;
810         struct vnode *vpfailed;         /* warning, allowed to be stale */
811         long maxscan;
812         long delta = 0;
813         long max_launder;
814         int isep;
815         int vmflush_flags;
816
817         isep = (curthread == emergpager);
818
819         /*
820          * This routine is called for each of PQ_L2_SIZE inactive queues.
821          * We want the vm_max_launder parameter to apply to the whole
822          * queue (i.e. per-whole-queue pass, not per-sub-queue).
823          *
824          * In each successive full-pass when the page target is not met we
825          * allow the per-queue max_launder to increase up to a maximum of
826          * vm_max_launder / 16.
827          */
828         max_launder = (long)vm_max_launder / PQ_L2_SIZE;
829         if (pass)
830                 max_launder *= 2;
831         max_launder = (max_launder + MAXSCAN_DIVIDER - 1) / MAXSCAN_DIVIDER;
832
833         if (max_launder <= 1)
834                 max_launder = 1;
835         if (max_launder >= vm_max_launder / 16)
836                 max_launder = vm_max_launder / 16 + 1;
837
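	/*
	 * Example (values assumed for illustration): with vm_max_launder
	 * tuned to 4096 and PQ_L2_SIZE = 64, the base per-queue budget is
	 * 4096/64 = 64, doubled to 128 on a later pass, then divided by
	 * MAXSCAN_DIVIDER (10) to give 13, well under the 4096/16+1 = 257
	 * cap.
	 */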
838         /*
839          * Start scanning the inactive queue for pages we can move to the
840          * cache or free.  The scan will stop when the target is reached or
841          * we have scanned the entire inactive queue.  Note that m->act_count
842          * is not used to form decisions for the inactive queue, only for the
843          * active queue.
844          *
845          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
846          *        PAGES.
847          */
848
849         /*
850          * Initialize our marker
851          */
852         bzero(&marker, sizeof(marker));
853         marker.flags = PG_FICTITIOUS | PG_MARKER;
854         marker.busy_count = PBUSY_LOCKED;
855         marker.queue = PQ_INACTIVE + q;
856         marker.pc = q;
857         marker.wire_count = 1;
858
859         /*
860          * Inactive queue scan.
861          *
862          * We pick off approximately 1/10 of each queue.  Each queue is
863          * effectively organized LRU so scanning the entire queue would
864          * improperly pick up pages that might still be in regular use.
865          *
866          * NOTE: The vm_page must be spinlocked before the queue to avoid
867          *       deadlocks, so it is easiest to simply iterate the loop
868          *       with the queue unlocked at the top.
869          */
870         vpfailed = NULL;
871
872         vm_page_queues_spin_lock(PQ_INACTIVE + q);
873         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
874         maxscan = (vm_page_queues[PQ_INACTIVE + q].lcnt + MAXSCAN_DIVIDER - 1) /
875                   MAXSCAN_DIVIDER + 1;
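	/*
	 * Example: if this sub-queue currently holds 50000 pages, maxscan
	 * works out to (50000 + 9) / 10 + 1 = 5001, so roughly a tenth of
	 * the queue is examined per call.
	 */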
876
877         /*
878          * Queue locked at top of loop to avoid stack marker issues.
879          */
880         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
881                maxscan-- > 0 && avail_shortage - delta > 0)
882         {
883                 int count;
884
885                 KKASSERT(m->queue == PQ_INACTIVE + q);
886                 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
887                              &marker, pageq);
888                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
889                                    &marker, pageq);
890                 mycpu->gd_cnt.v_pdpages++;
891
892                 /*
893                  * Skip marker pages (atomic against other markers to avoid
894                  * infinite hop-over scans).
895                  */
896                 if (m->flags & PG_MARKER)
897                         continue;
898
899                 /*
900                  * Try to busy the page.  Don't mess with pages which are
901                  * already busy or reorder them in the queue.
902                  */
903                 if (vm_page_busy_try(m, TRUE))
904                         continue;
905
906                 /*
907                  * Remaining operations run with the page busy and neither
908                  * the page or the queue will be spin-locked.
909                  */
910                 KKASSERT(m->queue == PQ_INACTIVE + q);
911                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
912
913                 /*
914                  * The emergency pager runs when the primary pager gets
915                  * stuck, which typically means the primary pager deadlocked
916                  * on a vnode-backed page.  Therefore, the emergency pager
917                  * must skip any complex objects.
918                  *
919                  * We disallow VNODEs unless they are VCHR whose device ops
920                  * do not flag D_NOEMERGPGR.
921                  */
922                 if (isep && m->object) {
923                         struct vnode *vp;
924
925                         switch(m->object->type) {
926                         case OBJT_DEFAULT:
927                         case OBJT_SWAP:
928                                 /*
929                                  * Allow anonymous memory and assume that
930                                  * swap devices are not complex, since it's
931                                  * kinda worthless if we can't swap out dirty
932                                  * anonymous pages.
933                                  */
934                                 break;
935                         case OBJT_VNODE:
936                                 /*
937                                  * Allow VCHR device if the D_NOEMERGPGR
938                                  * flag is not set, deny other vnode types
939                                  * as being too complex.
940                                  */
941                                 vp = m->object->handle;
942                                 if (vp && vp->v_type == VCHR &&
943                                     vp->v_rdev && vp->v_rdev->si_ops &&
944                                     (vp->v_rdev->si_ops->head.flags &
945                                      D_NOEMERGPGR) == 0) {
946                                         break;
947                                 }
948                                 /* Deny - fall through */
949                         default:
950                                 /*
951                                  * Deny
952                                  */
953                                 vm_page_wakeup(m);
954                                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
955                                 lwkt_yield();
956                                 continue;
957                         }
958                 }
959
960                 /*
961                  * Try to pageout the page and perhaps other nearby pages.
962                  * We want to get the pages into the cache eventually
963                  * (first or second pass).  Otherwise the pages can wind up
964                  * just cycling in the inactive queue, getting flushed over
965                  * and over again.
966                  *
967                  * Generally speaking we recycle dirty pages within PQ_INACTIVE
968                  * twice (double LRU) before paging them out.  If the
969                  * memuse_mode is >= 3 we run them single-LRU like we do clean
970                  * pages.
971                  */
972                 if (vm_pageout_memuse_mode >= 3)
973                         vm_page_flag_set(m, PG_WINATCFLS);
974
975                 vmflush_flags = 0;
976                 if (vm_pageout_allow_active)
977                         vmflush_flags |= OBJPC_ALLOW_ACTIVE;
978                 if (m->flags & PG_WINATCFLS)
979                         vmflush_flags |= OBJPC_TRY_TO_CACHE;
980                 count = vm_pageout_page(m, &max_launder, vnodes_skipped,
981                                         &vpfailed, pass, vmflush_flags, counts);
982                 delta += count;
983
984                 /*
985                  * Systems with a ton of memory can wind up with huge
986                  * deactivation counts.  Because the inactive scan is
987                  * doing a lot of flushing, the combination can result
988                  * in excessive paging even in situations where other
989                  * unrelated threads free up sufficient VM.
990                  *
991                  * To deal with this we abort the nominal active->inactive
992                  * scan before we hit the inactive target when free+cache
993                  * levels have reached a reasonable target.
994                  *
995                  * When deciding to stop early we need to add some slop to
996                  * the test and we need to return full completion to the caller
997                  * to prevent the caller from thinking there is something
998                  * wrong and issuing a low-memory+swap warning or pkill.
999                  *
1000                  * A deficit forces paging regardless of the state of the
1001                  * VM page queues (used for RSS enforcement).
1002                  */
1003                 lwkt_yield();
1004                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
1005
1006                 /* if (vm_paging_target() < -vm_max_launder) */
1007                 if (!vm_paging_target2()) {
1008                         /*
1009                          * Stopping early, return full completion to caller.
1010                          */
1011                         if (delta < avail_shortage)
1012                                 delta = avail_shortage;
1013                         break;
1014                 }
1015         }
1016
1017         /* page queue still spin-locked */
1018         TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
1019         vm_page_queues_spin_unlock(PQ_INACTIVE + q);
1020
1021         return (delta);
1022 }
1023
1024 /*
1025  * Pageout the specified page, return the total number of pages paged out
1026  * (this routine may cluster).
1027  *
1028  * The page must be busied and soft-busied by the caller and will be disposed
1029  * of by this function.
1030  */
1031 static int
1032 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
1033                 struct vnode **vpfailedp, int pass, int vmflush_flags,
1034                 long *counts)
1035 {
1036         vm_object_t object;
1037         int actcount;
1038         int count = 0;
1039
1040         /*
1041          * Wiring no longer removes a page from its queue.  The last unwiring
1042          * will requeue the page.  Obviously wired pages cannot be paged out
1043          * so unqueue it and return.
1044          */
1045         if (m->wire_count) {
1046                 vm_page_unqueue_nowakeup(m);
1047                 vm_page_wakeup(m);
1048                 return 0;
1049         }
1050
1051         /*
1052          * A held page may be undergoing I/O, so skip it.
1053          */
1054         if (m->hold_count) {
1055                 vm_page_and_queue_spin_lock(m);
1056                 if (m->queue - m->pc == PQ_INACTIVE) {
1057                         TAILQ_REMOVE(
1058                                 &vm_page_queues[m->queue].pl, m, pageq);
1059                         TAILQ_INSERT_TAIL(
1060                                 &vm_page_queues[m->queue].pl, m, pageq);
1061                 }
1062                 vm_page_and_queue_spin_unlock(m);
1063                 vm_page_wakeup(m);
1064                 return 0;
1065         }
1066
1067         if (m->object == NULL || m->object->ref_count == 0) {
1068                 /*
1069                  * If the object is not being used, we ignore previous
1070                  * references.
1071                  */
1072                 vm_page_flag_clear(m, PG_REFERENCED);
1073                 pmap_clear_reference(m);
1074                 /* fall through to end */
1075         } else if (((m->flags & PG_REFERENCED) == 0) &&
1076                     (actcount = pmap_ts_referenced(m))) {
1077                 /*
1078                  * Otherwise, if the page has been referenced while
1079                  * in the inactive queue, we bump the "activation
1080                  * count" upwards, making it less likely that the
1081                  * page will be added back to the inactive queue
1082                  * prematurely again.  Here we check the page tables
1083          * (or emulated bits, if any), since the upper level
1084          * VM system knows nothing about existing
1085          * references.
1086                  */
1087                 ++counts[3];
1088                 vm_page_activate(m);
1089                 m->act_count += (actcount + ACT_ADVANCE);
1090                 vm_page_wakeup(m);
1091                 return 0;
1092         }
1093
1094         /*
1095          * (m) is still busied.
1096          *
1097          * If the upper level VM system knows about any page
1098          * references, we activate the page.  We also set the
1099          * "activation count" higher than normal so that we will less
1100          * likely place pages back onto the inactive queue again.
1101          */
1102         if ((m->flags & PG_REFERENCED) != 0) {
1103                 vm_page_flag_clear(m, PG_REFERENCED);
1104                 actcount = pmap_ts_referenced(m);
1105                 vm_page_activate(m);
1106                 m->act_count += (actcount + ACT_ADVANCE + 1);
1107                 vm_page_wakeup(m);
1108                 ++counts[3];
1109                 return 0;
1110         }
1111
1112         /*
1113          * If the upper level VM system doesn't know anything about
1114          * the page being dirty, we have to check for it again.  As
1115          * far as the VM code knows, any partially dirty pages are
1116          * fully dirty.
1117          *
1118          * Pages marked PG_WRITEABLE may be mapped into the user
1119          * address space of a process running on another cpu.  A
1120          * user process (without holding the MP lock) running on
1121          * another cpu may be able to touch the page while we are
1122          * trying to remove it.  vm_page_cache() will handle this
1123          * case for us.
1124          */
1125         if (m->dirty == 0) {
1126                 vm_page_test_dirty(m);
1127         } else {
1128                 vm_page_dirty(m);
1129         }
1130
1131         if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1132                 /*
1133                  * Invalid pages can be easily freed
1134                  */
1135                 vm_pageout_page_free(m);
1136                 mycpu->gd_cnt.v_dfree++;
1137                 ++count;
1138                 ++counts[1];
1139         } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1140                 /*
1141                  * Clean pages can be placed onto the cache queue.
1142                  * This effectively frees them.
1143                  */
1144                 vm_page_cache(m);
1145                 ++count;
1146                 ++counts[1];
1147         } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1148                 /*
1149                  * Dirty pages need to be paged out, but flushing
1150                  * a page is extremely expensive versus freeing
1151                  * a clean page.  Rather than artificially limiting
1152                  * the number of pages we can flush, we instead give
1153                  * dirty pages extra priority on the inactive queue
1154                  * by forcing them to be cycled through the queue
1155                  * twice before being flushed, after which the
1156                  * (now clean) page will cycle through once more
1157                  * before being freed.  This significantly extends
1158                  * the thrash point for a heavily loaded machine.
1159                  */
1160                 ++counts[2];
1161                 vm_page_flag_set(m, PG_WINATCFLS);
1162                 vm_page_and_queue_spin_lock(m);
1163                 if (m->queue - m->pc == PQ_INACTIVE) {
1164                         TAILQ_REMOVE(
1165                                 &vm_page_queues[m->queue].pl, m, pageq);
1166                         TAILQ_INSERT_TAIL(
1167                                 &vm_page_queues[m->queue].pl, m, pageq);
1168                 }
1169                 vm_page_and_queue_spin_unlock(m);
1170                 vm_page_wakeup(m);
1171         } else if (*max_launderp > 0) {
1172                 /*
1173                  * We always want to try to flush some dirty pages if
1174                  * we encounter them, to keep the system stable.
1175                  * Normally this number is small, but under extreme
1176                  * pressure where there are insufficient clean pages
1177                  * on the inactive queue, we may have to go all out.
1178                  */
1179                 int swap_pageouts_ok;
1180                 struct vnode *vp = NULL;
1181
1182                 if ((m->flags & PG_WINATCFLS) == 0)
1183                         vm_page_flag_set(m, PG_WINATCFLS);
1184                 swap_pageouts_ok = 0;
1185                 object = m->object;
1186                 if (object &&
1187                     (object->type != OBJT_SWAP) &&
1188                     (object->type != OBJT_DEFAULT)) {
1189                         swap_pageouts_ok = 1;
1190                 } else {
1191                         swap_pageouts_ok = !(defer_swap_pageouts ||
1192                                              disable_swap_pageouts);
1193                         swap_pageouts_ok |= (!disable_swap_pageouts &&
1194                                              defer_swap_pageouts &&
1195                                              vm_paging_min());
1196                 }
1197
1198                 /*
1199                  * We don't bother paging objects that are "dead".
1200                  * Those objects are in a "rundown" state.
1201                  */
1202                 if (!swap_pageouts_ok ||
1203                     (object == NULL) ||
1204                     (object->flags & OBJ_DEAD)) {
1205                         vm_page_and_queue_spin_lock(m);
1206                         if (m->queue - m->pc == PQ_INACTIVE) {
1207                                 TAILQ_REMOVE(
1208                                     &vm_page_queues[m->queue].pl,
1209                                     m, pageq);
1210                                 TAILQ_INSERT_TAIL(
1211                                     &vm_page_queues[m->queue].pl,
1212                                     m, pageq);
1213                         }
1214                         vm_page_and_queue_spin_unlock(m);
1215                         vm_page_wakeup(m);
1216                         return 0;
1217                 }
1218
1219                 /*
1220                  * (m) is still busied.
1221                  *
1222                  * The object is already known NOT to be dead.   It
1223                  * is possible for the vget() to block the whole
1224                  * pageout daemon, but the new low-memory handling
1225                  * code should prevent it.
1226                  *
1227                  * The previous code skipped locked vnodes and, worse,
1228                  * reordered pages in the queue.  This results in
1229                  * completely non-deterministic operation because,
1230                  * quite often, a vm_fault has initiated an I/O and
1231                  * is holding a locked vnode at just the point where
1232                  * the pageout daemon is woken up.
1233                  *
1234                  * We can't wait forever for the vnode lock, we might
1235                  * deadlock due to a vn_read() getting stuck in
1236                  * vm_wait while holding this vnode.  We skip the
1237                  * vnode if we can't get it in a reasonable amount
1238                  * of time.
1239                  *
1240                  * vpfailed is used to (try to) avoid the case where
1241                  * a large number of pages are associated with a
1242                  * locked vnode, which could cause the pageout daemon
1243                  * to stall for an excessive amount of time.
1244                  */
1245                 if (object->type == OBJT_VNODE) {
1246                         int flags;
1247
1248                         vp = object->handle;
1249                         flags = LK_EXCLUSIVE;
1250                         if (vp == *vpfailedp)
1251                                 flags |= LK_NOWAIT;
1252                         else
1253                                 flags |= LK_TIMELOCK;
1254                         vm_page_hold(m);
1255                         vm_page_wakeup(m);
1256
1257                         /*
1258                          * We have unbusied (m) temporarily so we can
1259                          * acquire the vp lock without deadlocking.
1260                          * (m) is held to prevent destruction.
1261                          */
1262                         if (vget(vp, flags) != 0) {
1263                                 *vpfailedp = vp;
1264                                 ++pageout_lock_miss;
1265                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1266                                             ++*vnodes_skippedp;
1267                                 vm_page_unhold(m);
1268                                 return 0;
1269                         }
1270
1271                         /*
1272                          * The page might have been moved to another
1273                          * queue during potential blocking in vget()
1274                          * above.  The page might have been freed and
1275                          * reused for another vnode.  The object might
1276                          * have been reused for another vnode.
1277                          */
1278                         if (m->queue - m->pc != PQ_INACTIVE ||
1279                             m->object != object ||
1280                             object->handle != vp) {
1281                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1282                                         ++*vnodes_skippedp;
1283                                 vput(vp);
1284                                 vm_page_unhold(m);
1285                                 return 0;
1286                         }
1287
1288                         /*
1289                          * The page may have been busied during the
1290                          * blocking in vget();  We don't move the
1291                          * page back onto the end of the queue so that
1292                          * statistics are more correct if we don't.
1293                          */
1294                         if (vm_page_busy_try(m, TRUE)) {
1295                                 vput(vp);
1296                                 vm_page_unhold(m);
1297                                 return 0;
1298                         }
1299                         vm_page_unhold(m);
1300
1301                         /*
1302                          * If it was wired while we didn't own it.
1303                          */
1304                         if (m->wire_count) {
1305                                 vm_page_unqueue_nowakeup(m);
1306                                 vput(vp);
1307                                 vm_page_wakeup(m);
1308                                 return 0;
1309                         }
1310
1311                         /*
1312                          * (m) is busied again
1313                          *
1314                          * We own the busy bit and have already dropped
1315                          * our hold.  If someone else still holds the
1316                          * page it might be undergoing I/O, so skip it.
1317                          */
1318                         if (m->hold_count) {
1319 rebusy_failed:
1320                                 vm_page_and_queue_spin_lock(m);
1321                                 if (m->queue - m->pc == PQ_INACTIVE) {
1322                                         TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1323                                         TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1324                                 }
1325                                 vm_page_and_queue_spin_unlock(m);
1326                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1327                                         ++*vnodes_skippedp;
1328                                 vm_page_wakeup(m);
1329                                 vput(vp);
1330                                 return 0;
1331                         }
1332
1333                         /*
1334                          * Recheck queue, object, and vp now that we have
1335                          * rebusied the page.
1336                          */
1337                         if (m->queue - m->pc != PQ_INACTIVE ||
1338                             m->object != object ||
1339                             object->handle != vp) {
1340                                 kprintf("vm_pageout_page: "
1341                                         "rebusy %p failed(A)\n",
1342                                         m);
1343                                 goto rebusy_failed;
1344                         }
1345
1346                         /*
1347                          * Check page validity
1348                          */
1349                         if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1350                                 kprintf("vm_pageout_page: "
1351                                         "rebusy %p failed(B)\n",
1352                                         m);
1353                                 goto rebusy_failed;
1354                         }
1355                         if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1356                                 kprintf("vm_pageout_page: "
1357                                         "rebusy %p failed(C)\n",
1358                                         m);
1359                                 goto rebusy_failed;
1360                         }
1361
1362                         /* (m) is left busied as we fall through */
1363                 }
1364
1365                 /*
1366                  * page is busy and not held here.
1367                  *
1368                  * If a page is dirty, then it is either being washed
1369                  * (but not yet cleaned) or it is still in the
1370                  * laundry.  If it is still in the laundry, then we
1371                  * start the cleaning operation.
1372                  *
1373                  * decrement inactive_shortage on success to account
1374                  * for the (future) cleaned page.  Otherwise we
1375                  * could wind up laundering or cleaning too many
1376                  * pages.
1377                  *
1378                  * NOTE: Cleaning the page here does not cause
1379                  *       force_deficit to be adjusted, because the
1380                  *       page is not being freed or moved to the
1381                  *       cache.
1382                  */
1383                 count = vm_pageout_clean_helper(m, vmflush_flags);
1384                 counts[0] += count;
1385                 *max_launderp -= count;
1386
1387                 /*
1388                  * Clean ate busy, page no longer accessible
1389                  */
1390                 if (vp != NULL)
1391                         vput(vp);
1392         } else {
1393                 vm_page_wakeup(m);
1394         }
1395         return count;
1396 }
1397
1398 /*
1399  * Scan active queue
1400  *
1401  * WARNING! Can be called from two pagedaemon threads simultaneously.
1402  */
1403 static int
1404 vm_pageout_scan_active(int pass, int q,
1405                        long avail_shortage, long inactive_shortage,
1406                        struct vm_page *marker,
1407                        long *recycle_countp)
1408 {
1409         vm_page_t m;
1410         int actcount;
1411         long delta = 0;
1412         long maxscan;
1413         int isep;
1414
1415         isep = (curthread == emergpager);
1416
1417         /*
1418          * We want to move pages from the active queue to the inactive
1419          * queue to get the inactive queue to the inactive target.  If
1420          * we still have a page shortage from above we try to directly free
1421          * clean pages instead of moving them.
1422          *
1423          * If we do still have a shortage we keep track of the number of
1424          * pages we free or cache (recycle_count) as a measure of thrashing
1425          * between the active and inactive queues.
1426          *
1427          * If we were able to completely satisfy the free+cache targets
1428          * from the inactive pool we limit the number of pages we move
1429          * from the active pool to the inactive pool to 2x the pages we
1430          * had removed from the inactive pool (with a minimum of 1/5 the
1431          * inactive target).  If we were not able to completely satisfy
1432          * the free+cache targets we go for the whole target aggressively.
1433          *
1434          * NOTE: Both variables can end up negative.
1435          * NOTE: We are still in a critical section.
1436          *
1437          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
1438          *        PAGES.
1439          */
1440
1441         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1442         maxscan = (vm_page_queues[PQ_ACTIVE + q].lcnt + MAXSCAN_DIVIDER - 1) /
1443                   MAXSCAN_DIVIDER + 1;
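             /*
              * Illustrative note: maxscan limits this call to roughly
              * 1/MAXSCAN_DIVIDER of the queue (rounded up, plus one), so a
              * single pass never walks the entire active queue.  For
              * example, if the queue held 1000 pages and MAXSCAN_DIVIDER
              * were 10 (an assumed value for illustration only; the real
              * divider is defined earlier in this file), this call would
              * examine at most (1000 + 9) / 10 + 1 = 101 pages.
              */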
1444
1445         /*
1446          * Queue locked at top of loop to avoid stack marker issues.
1447          */
1448         while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
1449                maxscan-- > 0 && (avail_shortage - delta > 0 ||
1450                                 inactive_shortage > 0))
1451         {
1452                 KKASSERT(m->queue == PQ_ACTIVE + q);
1453                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1454                              marker, pageq);
1455                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1456                                    marker, pageq);
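                     /*
                      * The marker now sits immediately after (m).  Even
                      * though the queue spinlock is dropped below and other
                      * threads may touch the list, TAILQ_NEXT(marker, pageq)
                      * at the top of the next iteration resumes the scan
                      * from this position.
                      */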
1457
1458                 /*
1459                  * Skip marker pages (atomic against other markers to avoid
1460                  * infinite hop-over scans).
1461                  */
1462                 if (m->flags & PG_MARKER)
1463                         continue;
1464
1465                 /*
1466                  * Try to busy the page.  Don't mess with pages which are
1467                  * already busy or reorder them in the queue.
1468                  */
1469                 if (vm_page_busy_try(m, TRUE))
1470                         continue;
1471
1472                 /*
1473                  * Remaining operations run with the page busy and neither
1474                  * the page nor the queue will be spin-locked.
1475                  */
1476                 KKASSERT(m->queue == PQ_ACTIVE + q);
1477                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1478
1479 #if 0
1480                 /*
1481                  * Don't deactivate pages that are held, even if we can
1482                  * busy them.  (XXX why not?)
1483                  */
1484                 if (m->hold_count) {
1485                         vm_page_and_queue_spin_lock(m);
1486                         if (m->queue - m->pc == PQ_ACTIVE) {
1487                                 TAILQ_REMOVE(
1488                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1489                                         m, pageq);
1490                                 TAILQ_INSERT_TAIL(
1491                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1492                                         m, pageq);
1493                         }
1494                         vm_page_and_queue_spin_unlock(m);
1495                         vm_page_wakeup(m);
1496                         goto next;
1497                 }
1498 #endif
1499                 /*
1500                  * We can just remove wired pages from the queue
1501                  */
1502                 if (m->wire_count) {
1503                         vm_page_unqueue_nowakeup(m);
1504                         vm_page_wakeup(m);
1505                         goto next;
1506                 }
1507
1508                 /*
1509                  * The emergency pager ignores vnode-backed pages as these
1510                  * are the pages that probably bricked the main pager.
1511                  */
1512                 if (isep && m->object && m->object->type == OBJT_VNODE) {
1513 #if 0
1514                         vm_page_and_queue_spin_lock(m);
1515                         if (m->queue - m->pc == PQ_ACTIVE) {
1516                                 TAILQ_REMOVE(
1517                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1518                                         m, pageq);
1519                                 TAILQ_INSERT_TAIL(
1520                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1521                                         m, pageq);
1522                         }
1523                         vm_page_and_queue_spin_unlock(m);
1524 #endif
1525                         vm_page_wakeup(m);
1526                         goto next;
1527                 }
1528
1529                 /*
1530                  * The count for pagedaemon pages is done after checking the
1531                  * page for eligibility...
1532                  */
1533                 mycpu->gd_cnt.v_pdpages++;
1534
1535                 /*
1536                  * Check to see "how much" the page has been used and clear
1537                  * the tracking access bits.  If the object has no references
1538                  * don't bother paying the expense.
1539                  */
1540                 actcount = 0;
1541                 if (m->object && m->object->ref_count != 0) {
1542                         if (m->flags & PG_REFERENCED)
1543                                 ++actcount;
1544                         actcount += pmap_ts_referenced(m);
1545                         if (actcount) {
1546                                 m->act_count += ACT_ADVANCE + actcount;
1547                                 if (m->act_count > ACT_MAX)
1548                                         m->act_count = ACT_MAX;
1549                         }
1550                 }
1551                 vm_page_flag_clear(m, PG_REFERENCED);
1552
1553                 /*
1554                  * actcount is only valid if the object ref_count is non-zero.
1555                  * If the page does not have an object, actcount will be zero.
1556                  */
1557                 if (actcount && m->object->ref_count != 0) {
1558 #if 0
1559                         vm_page_and_queue_spin_lock(m);
1560                         if (m->queue - m->pc == PQ_ACTIVE) {
1561                                 TAILQ_REMOVE(
1562                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1563                                         m, pageq);
1564                                 TAILQ_INSERT_TAIL(
1565                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1566                                         m, pageq);
1567                         }
1568                         vm_page_and_queue_spin_unlock(m);
1569 #endif
1570                         vm_page_wakeup(m);
1571                 } else {
1572                         switch(m->object->type) {
1573                         case OBJT_DEFAULT:
1574                         case OBJT_SWAP:
1575                                 m->act_count -= min(m->act_count,
1576                                                     vm_anonmem_decline);
1577                                 break;
1578                         default:
1579                                 m->act_count -= min(m->act_count,
1580                                                     vm_filemem_decline);
1581                                 break;
1582                         }
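                             /*
                              * Net effect of the switch above: anonymous and
                              * swap-backed pages have act_count reduced by up
                              * to the tunable vm_anonmem_decline, while other
                              * (typically file-backed) pages are reduced by
                              * up to vm_filemem_decline, so the two classes
                              * can be aged at different rates.
                              */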
1583                         if (vm_pageout_algorithm ||
1584                             (m->object == NULL) ||
1585                             (m->object && (m->object->ref_count == 0)) ||
1586                             m->act_count < pass + 1
1587                         ) {
1588                                 /*
1589                                  * Deactivate the page.  If we had a
1590                                  * shortage from our inactive scan try to
1591                                  * free (cache) the page instead.
1592                                  *
1593                                  * Don't just blindly cache the page if
1594                                  * we do not have a shortage from the
1595                                  * inactive scan, that could lead to
1596                                  * gigabytes being moved.
1597                                  */
1598                                 --inactive_shortage;
1599                                 if (avail_shortage - delta > 0 ||
1600                                     (m->object && (m->object->ref_count == 0)))
1601                                 {
1602                                         if (avail_shortage - delta > 0)
1603                                                 ++*recycle_countp;
1604                                         vm_page_protect(m, VM_PROT_NONE);
1605                                         if (m->dirty == 0 &&
1606                                             (m->flags & PG_NEED_COMMIT) == 0 &&
1607                                             avail_shortage - delta > 0) {
1608                                                 vm_page_cache(m);
1609                                         } else {
1610                                                 vm_page_deactivate(m);
1611                                                 vm_page_wakeup(m);
1612                                         }
1613                                 } else {
1614                                         vm_page_deactivate(m);
1615                                         vm_page_wakeup(m);
1616                                 }
1617                                 ++delta;
1618                         } else {
1619                                 /*
1620                                  * Do nothing
1621                                  */
1622 #if 0
1623                                 vm_page_and_queue_spin_lock(m);
1624                                 if (m->queue - m->pc == PQ_ACTIVE) {
1625                                         TAILQ_REMOVE(
1626                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1627                                             m, pageq);
1628                                         TAILQ_INSERT_TAIL(
1629                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1630                                             m, pageq);
1631                                 }
1632                                 vm_page_and_queue_spin_unlock(m);
1633 #endif
1634                                 vm_page_wakeup(m);
1635                         }
1636                 }
1637 next:
1638                 lwkt_yield();
1639                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1640         }
1641
1642         /*
1643          * If the queue was exhausted, move our local marker back to the head.
1644          *
1645          * Page queue still spin-locked.
1646          */
1647         if (m == NULL) {
1648                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1649                              marker, pageq);
1650                 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
1651                              marker, pageq);
1652         }
1653         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1654
1655         return (delta);
1656 }
1657
1658 /*
1659  * The number of actually free pages can drop down to v_free_reserved,
1660  * we try to build the free count back above v_free_min, to v_free_target.
1661  *
1662  * Cache pages are already counted as being free-ish.
1663  *
1664  * NOTE: we are still in a critical section.
1665  *
1666  * Pages moved from PQ_CACHE to totally free are not counted in the
1667  * pages_freed counter.
1668  *
1669  * WARNING! Can be called from two pagedaemon threads simultaneously.
1670  */
1671 static void
1672 vm_pageout_scan_cache(long avail_shortage, int pass,
1673                       long vnodes_skipped, long recycle_count)
1674 {
1675         static int lastkillticks;
1676         struct vm_pageout_scan_info info;
1677         vm_page_t m;
1678         int isep;
1679
1680         isep = (curthread == emergpager);
1681
1682         /*
1683          * Test conditions also include a safety check against v_free_min in
1684          * case the sysop messes up the sysctls.
1685          *
1686          * Also include a test to avoid degenerate scans.
1687          */
1688         while ((vmstats.v_free_count < vmstats.v_free_target ||
1689                 vmstats.v_free_count < vmstats.v_free_min) &&
1690                vmstats.v_cache_count > VM_CACHE_SCAN_MIN)
1691         {
1692                 /*
1693                  * This steals some code from vm/vm_page.c
1694                  *
1695                  * Create two rovers and adjust the code to reduce
1696                  * chances of them winding up at the same index (which
1697                  * can cause a lot of contention).
1698                  */
1699                 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };
1700
1701                 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
1702                         goto next_rover;
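                     /*
                      * If both rovers currently hash to the same PQ_CACHE
                      * sub-queue, skip the lookup this round and just advance
                      * our rover (at next_rover below) so the two daemons
                      * separate again rather than contending on the same
                      * sub-queue.
                      */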
1703
1704                 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
1705                 if (m == NULL)
1706                         break;
1707                 /*
1708                  * The page is returned already removed from its queue and spin-locked.
1709                  *
1710                  * If the busy attempt fails we can still deactivate the page.
1711                  */
1712                 if (vm_page_busy_try(m, TRUE)) {
1713                         vm_page_deactivate_locked(m);
1714                         vm_page_spin_unlock(m);
1715                         continue;
1716                 }
1717                 vm_page_spin_unlock(m);
1718                 pagedaemon_wakeup();
1719                 lwkt_yield();
1720
1721                 /*
1722                  * Report a possible edge case.  This shouldn't happen but
1723                  * actually I think it can race against e.g.
1724                  * vm_page_lookup()/busy sequences.  If the page isn't
1725                  * in a cache-like state we will deactivate and skip it.
1726                  */
1727                 if ((m->flags & PG_MAPPED) || (m->valid & m->dirty)) {
1728                         kprintf("WARNING! page race during find/busy: %p "
1729                                 "queue == %d dirty=%02x\n",
1730                                 m, m->queue - m->pc, m->dirty);
1731                 }
1732
1733                 /*
1734                  * Remaining operations run with the page busy and neither
1735                  * the page nor the queue will be spin-locked.
1736                  */
1737                 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_MAPPED)) ||
1738                     m->hold_count ||
1739                     m->wire_count ||
1740                     (m->valid & m->dirty))
1741                 {
1742                         vm_page_deactivate(m);
1743                         vm_page_wakeup(m);
1744                         continue;
1745                 }
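                     /*
                      * Only pages that passed every check above (unqueued and
                      * commit flags clear, not mapped, not held, not wired,
                      * and no valid dirty data) reach the free path below;
                      * anything else was deactivated so the normal scans can
                      * reprocess it later.
                      */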
1746
1747                 /*
1748                  * Because the page is in the cache, it shouldn't be mapped.
1749                  */
1750                 pmap_mapped_sync(m);
1751                 KKASSERT((m->flags & PG_MAPPED) == 0);
1752                 KKASSERT(m->dirty == 0);
1753                 vm_pageout_page_free(m);
1754                 mycpu->gd_cnt.v_dfree++;
1755 next_rover:
1756                 if (isep)
1757                         cache_rover[1] -= PQ_PRIME2;
1758                 else
1759                         cache_rover[0] += PQ_PRIME2;
1760         }
1761
1762         /*
1763          * If we didn't get enough free pages and we skipped a vnode in a
1764          * writeable object, wake up the sync daemon.  Also kick off swapout
1765          * if we did not get enough free pages.
1766          */
1767         if (vm_paging_target1()) {
1768                 if (vnodes_skipped && vm_paging_min())
1769                         speedup_syncer(NULL);
1770 #if !defined(NO_SWAPPING)
1771                 if (vm_swap_enabled && vm_paging_target1())
1772                         vm_req_vmdaemon();
1773 #endif
1774         }
1775
1776         /*
1777          * Handle catastrophic conditions.  Under good conditions we should
1778          * be at the target, well beyond our minimum.  If we could not even
1779          * reach our minimum the system is under heavy stress.  But just being
1780          * under heavy stress does not trigger process killing.
1781          *
1782          * We consider ourselves to have run out of memory if the swap pager
1783          * is full and avail_shortage is still positive.  The secondary check
1784          * ensures that we do not kill processes if the instantaneous
1785          * availability is good, even if the pageout daemon pass says it
1786          * couldn't get to the target.
1787          *
1788          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
1789          *        SITUATIONS.
1790          */
1791         if (swap_pager_almost_full &&
1792             pass > 0 &&
1793             isep == 0 &&
1794             (vm_paging_min_dnc(recycle_count) || avail_shortage > 0)) {
1795                 kprintf("Warning: system low on memory+swap "
1796                         "shortage %ld for %d ticks!\n",
1797                         avail_shortage, ticks - swap_fail_ticks);
1798                 if (bootverbose) {
1799                         kprintf("Metrics: spaf=%d spf=%d pass=%d "
1800                                 "availshrt=%ld tgt=%d/%d inacshrt=%ld "
1801                                 "last=%u\n",
1802                                 swap_pager_almost_full,
1803                                 swap_pager_full,
1804                                 pass,
1805                                 avail_shortage,
1806                                 vm_paging_target1(),
1807                                 vm_paging_target2(),
1808                                 vm_paging_target2_count(),
1809                                 (unsigned int)(ticks - lastkillticks));
1810                 }
1811         }
1812         if (swap_pager_full &&
1813             pass > 1 &&
1814             isep == 0 &&
1815             avail_shortage > 0 &&
1816             vm_paging_target1() &&
1817             (unsigned int)(ticks - lastkillticks) >= hz)
1818         {
1819                 /*
1820                  * Kill something, maximum rate once per second to give
1821                  * the process time to free up sufficient memory.
1822                  */
1823                 lastkillticks = ticks;
1824                 info.bigproc = NULL;
1825                 info.bigsize = 0;
1826                 allproc_scan(vm_pageout_scan_callback, &info, 0);
1827                 if (info.bigproc != NULL) {
1828                         kprintf("Try to kill process %d %s\n",
1829                                 info.bigproc->p_pid, info.bigproc->p_comm);
1830                         info.bigproc->p_nice = PRIO_MIN;
1831                         info.bigproc->p_usched->resetpriority(
1832                                 FIRST_LWP_IN_PROC(info.bigproc));
1833                         atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1834                         killproc(info.bigproc, "out of swap space");
1835                         wakeup(&vmstats.v_free_count);
1836                         PRELE(info.bigproc);
1837                 }
1838         }
1839 }
1840
1841 static int
1842 vm_pageout_scan_callback(struct proc *p, void *data)
1843 {
1844         struct vm_pageout_scan_info *info = data;
1845         vm_offset_t size;
1846
1847         /*
1848          * Never kill system processes or init.  If we have configured swap
1849          * then try to avoid killing low-numbered pids.
1850          */
1851         if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1852             ((p->p_pid < 48) && (vm_swap_size != 0))) {
1853                 return (0);
1854         }
1855
1856         lwkt_gettoken(&p->p_token);
1857
1858         /*
1859          * if the process is in a non-running type state,
1860          * don't touch it.
1861          */
1862         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1863                 lwkt_reltoken(&p->p_token);
1864                 return (0);
1865         }
1866
1867         /*
1868          * Get the approximate process size.  Note that anonymous pages
1869          * with backing swap will be counted twice, but there should not
1870          * be too many such pages due to the stress the VM system is
1871          * under at this point.
1872          */
1873         size = vmspace_anonymous_count(p->p_vmspace) +
1874                 vmspace_swap_count(p->p_vmspace);
1875
1876         /*
1877          * If this process is bigger than the biggest one seen so far,
1878          * remember it.
1879          */
1880         if (info->bigsize < size) {
1881                 if (info->bigproc)
1882                         PRELE(info->bigproc);
1883                 PHOLD(p);
1884                 info->bigproc = p;
1885                 info->bigsize = size;
1886         }
1887         lwkt_reltoken(&p->p_token);
1888         lwkt_yield();
1889
1890         return(0);
1891 }
1892
1893 /*
1894  * This old guy slowly walks PQ_HOLD looking for pages which need to be
1895  * moved back to PQ_FREE.  It is possible for pages to accumulate here
1896  * when vm_page_free() races against vm_page_unhold(), resulting in a
1897  * page being left on a PQ_HOLD queue with hold_count == 0.
1898  *
1899  * It is easier to handle this edge condition here, in non-critical code,
1900  * rather than enforce a spin-lock for every 1->0 transition in
1901  * vm_page_unhold().
1902  *
1903  * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
1904  */
1905 static void
1906 vm_pageout_scan_hold(int q, struct vm_page *marker)
1907 {
1908         vm_page_t m;
1909         long pcount;
1910
1911         pcount = vm_page_queues[PQ_HOLD + q].lcnt;
1912         if (pcount > vm_pageout_stats_scan)
1913                 pcount = vm_pageout_stats_scan;
1914
1915         vm_page_queues_spin_lock(PQ_HOLD + q);
1916         while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
1917                pcount-- > 0)
1918         {
1919                 KKASSERT(m->queue == PQ_HOLD + q);
1920                 TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl, marker, pageq);
1921                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_HOLD + q].pl, m,
1922                                    marker, pageq);
1923
1924                 if (m->flags & PG_MARKER)
1925                         continue;
1926
1927                 /*
1928                  * Process one page and return
1929                  */
1930                 if (m->hold_count)
1931                         break;
1932                 kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
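                     /*
                      * Take a transient hold, drop the queue spinlock, then
                      * release the hold.  The resulting 1->0 transition in
                      * vm_page_unhold() reprocesses the stale page, which is
                      * what eventually moves it off PQ_HOLD (see the function
                      * header above).
                      */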
1933                 vm_page_hold(m);
1934                 vm_page_queues_spin_unlock(PQ_HOLD + q);
1935                 vm_page_unhold(m);      /* reprocess */
1936                 vm_page_queues_spin_lock(PQ_HOLD + q);
1937         }
1938
1939         /*
1940          * If queue exhausted move the marker back to the head.
1941          */
1942         if (m == NULL) {
1943                 TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl,
1944                              marker, pageq);
1945                 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl,
1946                              marker, pageq);
1947         }
1948
1949         vm_page_queues_spin_unlock(PQ_HOLD + q);
1950 }
1951
1952 /*
1953  * This code maintains the m->act_count for active pages.  The scan occurs
1954  * only as long as the pageout daemon is not running or the inactive target
1955  * has not been reached.
1956  *
1957  * The restrictions prevent an idle machine from degrading all VM pages'
1958  * m->act_count to 0 or nearly 0, which would make the field useless, for
1959  * example when a workstation user goes to bed and the machine sits idle.
1960  */
1961 static void
1962 vm_pageout_page_stats(int q, struct vm_page *marker, long *counterp)
1963 {
1964         struct vpgqueues *pq = &vm_page_queues[PQ_ACTIVE + q];
1965         vm_page_t m;
1966         long pcount;                    /* Number of pages to check */
1967
1968         /*
1969          * No point scanning the active queue if it is smaller than
1970          * 1/2 usable memory.  This most typically occurs at system
1971          * startup or if a huge amount of memory has just been freed.
1972          */
1973         if (vmstats.v_active_count < vmstats.v_free_count +
1974                                      vmstats.v_cache_count +
1975                                      vmstats.v_inactive_count)
1976         {
1977                 return;
1978         }
1979
1980         /*
1981          * Generally do not scan if the pageout daemon is not running
1982          * or the inactive target has been reached.  However, we override
1983          * this and scan anyway for N seconds after the pageout daemon last
1984          * ran.
1985          *
1986          * This last bit is designed to give the system a little time to
1987          * stage more pages for potential deactivation.  In this situation,
1988          * if the inactive target has been met, we just update m->act_count
1989          * and do not otherwise mess with the page.  But we don't want it
1990          * to run forever because that would cause m->act_count to become
1991          * unusable if the machine were to become idle.
1992          */
1993         if (vm_pages_needed == 0 && !vm_paging_inactive()) {
1994                 if (time_uptime - vm_pagedaemon_uptime > vm_pageout_stats_rsecs)
1995                         return;
1996         }
1997
1998         if (vm_pageout_debug) {
1999                 static time_t save_time;
2000                 if (save_time != time_uptime) {
2001                         save_time = time_uptime;
2002                         kprintf("DEACTIVATE Q=%4d N=%ld\n",
2003                                 q, vm_paging_inactive_count());
2004                 }
2005         }
2006
2007         /*
2008          * Limited scan to reduce CPU glitches, just in case
2009          * pmap_ts_referenced() burns a lot of CPU.
2010          */
2011         pcount = pq->lcnt;
2012         if (pcount > vm_pageout_stats_scan)
2013                 pcount = vm_pageout_stats_scan;
2014
2015         vm_page_queues_spin_lock(PQ_ACTIVE + q);
2016
2017         /*
2018          * Queue locked at top of loop to avoid stack marker issues.
2019          */
2020         while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
2021                pcount-- > 0)
2022         {
2023                 int actcount;
2024
2025                 KKASSERT(m->queue == PQ_ACTIVE + q);
2026                 TAILQ_REMOVE(&pq->pl, marker, pageq);
2027                 TAILQ_INSERT_AFTER(&pq->pl, m, marker, pageq);
2028
2029                 /*
2030                  * Skip marker pages (atomic against other markers to avoid
2031                  * infinite hop-over scans).
2032                  */
2033                 if (m->flags & PG_MARKER)
2034                         continue;
2035
2036                 ++counterp[0];
2037
2038                 /*
2039                  * Ignore pages we can't busy
2040                  */
2041                 if (vm_page_busy_try(m, TRUE)) {
2042                         continue;
2043                 }
2044
2045                 /*
2046                  * Remaining operations run with the page busy and neither
2047                  * the page nor the queue will be spin-locked.
2048                  */
2049                 KKASSERT(m->queue == PQ_ACTIVE + q);
2050                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2051
2052                 /*
2053                  * We can just remove wired pages from the queue
2054                  */
2055                 if (m->wire_count) {
2056                         vm_page_unqueue_nowakeup(m);
2057                         vm_page_wakeup(m);
2058                         goto next;
2059                 }
2060
2061
2062                 /*
2063                  * We now have a safely busied page; the page and queue
2064                  * spinlocks have been released.
2065                  *
2066                  * Ignore held and wired pages
2067                  */
2068                 if (m->hold_count || m->wire_count) {
2069                         vm_page_wakeup(m);
2070                         goto next;
2071                 }
2072
2073                 /*
2074                  * Calculate activity
2075                  */
2076                 actcount = 0;
2077                 if (m->flags & PG_REFERENCED) {
2078                         vm_page_flag_clear(m, PG_REFERENCED);
2079                         actcount += 1;
2080                 }
2081                 actcount += pmap_ts_referenced(m);
2082
2083                 /*
2084                  * Update act_count and move page to end of queue.
2085                  */
2086                 if (actcount) {
2087                         m->act_count += ACT_ADVANCE + actcount;
2088                         if (m->act_count > ACT_MAX)
2089                                 m->act_count = ACT_MAX;
2090 #if 0
2091                         vm_page_and_queue_spin_lock(m);
2092                         if (m->queue - m->pc == PQ_ACTIVE) {
2093                                 TAILQ_REMOVE(&pq->pl, m, pageq);
2094                                 TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
2095                         }
2096                         vm_page_and_queue_spin_unlock(m);
2097 #endif
2098                         vm_page_wakeup(m);
2099                         goto next;
2100                 }
2101
2102                 if (m->act_count == 0) {
2103                         /*
2104                          * If the deactivation target has not been reached
2105                          * we try to deactivate the page.
2106                          *
2107                          * If the deactivation target has been reached it
2108                          * is a complete waste of time (both now and later)
2109                          * to try to deactivate more pages.
2110                          */
2111                         if (vm_paging_inactive()) {
2112                                 vm_page_protect(m, VM_PROT_NONE);
2113                                 vm_page_deactivate(m);
2114                         }
2115                         ++counterp[1];
2116                 } else {
2117                         m->act_count -= min(m->act_count, ACT_DECLINE);
2118 #if 0
2119                         vm_page_and_queue_spin_lock(m);
2120                         if (m->queue - m->pc == PQ_ACTIVE) {
2121                                 TAILQ_REMOVE(&pq->pl, m, pageq);
2122                                 TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
2123                         }
2124                         vm_page_and_queue_spin_unlock(m);
2125 #endif
2126
2127                         if (m->act_count < vm_pageout_stats_actcmp) {
2128                                 if (vm_paging_inactive()) {
2129                                         vm_page_protect(m, VM_PROT_NONE);
2130                                         vm_page_deactivate(m);
2131                                 }
2132                                 ++counterp[1];
2133                         }
2134                 }
2135                 vm_page_wakeup(m);
2136 next:
2137                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2138         }
2139
2140         /*
2141          * If the queue has been exhausted move the marker back to the head.
2142          */
2143         if (m == NULL) {
2144                 TAILQ_REMOVE(&pq->pl, marker, pageq);
2145                 TAILQ_INSERT_HEAD(&pq->pl, marker, pageq);
2146         }
2147
2148         /*
2149          * Remove our local marker
2150          *
2151          * Page queue still spin-locked.
2152          */
2153         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2154
2155         /*
2156          * After roughly every (inalim) pages, determine whether we are making
2157          * appropriate progress.  If we are, reduce the comparison point for
2158          * act_count; if we are not, increase the comparison point.
2159          *
2160          * This allows us to handle heavier loads and also balances the
2161          * code, particularly at startup.
2162          */
2163         if (counterp[0] > vm_pageout_stats_inalim) {
2164                 if (counterp[1] < vm_pageout_stats_inamin) {
2165                         if (vm_pageout_stats_actcmp < ACT_MAX * 3 / 4)
2166                                 ++vm_pageout_stats_actcmp;
2167                 } else {
2168                         if (vm_pageout_stats_actcmp > 0)
2169                                 --vm_pageout_stats_actcmp;
2170                 }
2171                 counterp[0] = 0;
2172                 counterp[1] = 0;
2173         }
2174 }
2175
2176 static void
2177 vm_pageout_free_page_calc(vm_size_t count)
2178 {
2179         /*
2180          * v_free_min           normal allocations
2181          * v_free_reserved      system allocations
2182          * v_pageout_free_min   allocations by pageout daemon
2183          * v_interrupt_free_min low level allocations (e.g. swap structures)
2184          *
2185          * v_free_min is used to generate several other baselines, and they
2186          * can get pretty silly on systems with a lot of memory.
2187          * can get pretty silly on systems with a lot of memory.
              */
2188         vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
2189         vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
2190         vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
2191         vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
2192         vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
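             /*
              * Illustrative numbers only, assuming 4KiB pages: with
              * v_page_count = 1048576 (4GiB of RAM) the formulas above yield
              * v_free_min = 64 + 5242 = 5306 pages (~20MiB),
              * v_free_reserved = 2660, v_free_severe = 2653,
              * v_pageout_free_min = 1333 and v_interrupt_free_min = 670.
              */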
2193 }
2194
2195
2196 /*
2197  * vm_pageout is the high level pageout daemon.  TWO kernel threads run
2198  * this daemon, the primary pageout daemon and the emergency pageout daemon.
2199  *
2200  * The emergency pageout daemon takes over when the primary pageout daemon
2201  * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
2202  * avoiding the many low-memory deadlocks which can occur when paging out
2203  * to VFS's.
2204  */
2205 static void
2206 vm_pageout_thread(void)
2207 {
2208         int pass;
2209         int q;
2210         int q1iterator = 0;
2211         int q2iterator = 0;
2212         int q3iterator = 0;
2213         int isep;
2214         enum { PAGING_IDLE, PAGING_TARGET1, PAGING_TARGET2 } state;
2215         struct markers *markers;
2216         long scounter[2] = { 0, 0 };
2217         time_t warn_time;
2218
2219         curthread->td_flags |= TDF_SYSTHREAD;
2220         state = PAGING_IDLE;
2221
2222         /*
2223          * Allocate continuous markers for hold, stats (active), and
2224          * paging active queue scan.  These scans occur incrementally.
2225          */
2226         markers = kmalloc(sizeof(*markers) * PQ_L2_SIZE,
2227                           M_PAGEOUT, M_WAITOK | M_ZERO);
2228
2229         for (q = 0; q < PQ_L2_SIZE; ++q) {
2230                 struct markers *mark = &markers[q];
2231
2232                 mark->hold.flags = PG_FICTITIOUS | PG_MARKER;
2233                 mark->hold.busy_count = PBUSY_LOCKED;
2234                 mark->hold.queue = PQ_HOLD + q;
2235                 mark->hold.pc = PQ_HOLD + q;
2236                 mark->hold.wire_count = 1;
2237                 vm_page_queues_spin_lock(PQ_HOLD + q);
2238                 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl,
2239                                   &mark->hold, pageq);
2240                 vm_page_queues_spin_unlock(PQ_HOLD + q);
2241
2242                 mark->stat.flags = PG_FICTITIOUS | PG_MARKER;
2243                 mark->stat.busy_count = PBUSY_LOCKED;
2244                 mark->stat.queue = PQ_ACTIVE + q;
2245                 mark->stat.pc = PQ_ACTIVE + q;
2246                 mark->stat.wire_count = 1;
2247                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2248                 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
2249                                   &mark->stat, pageq);
2250                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2251
2252                 mark->pact.flags = PG_FICTITIOUS | PG_MARKER;
2253                 mark->pact.busy_count = PBUSY_LOCKED;
2254                 mark->pact.queue = PQ_ACTIVE + q;
2255                 mark->pact.pc = PQ_ACTIVE + q;
2256                 mark->pact.wire_count = 1;
2257                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2258                 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
2259                                   &mark->pact, pageq);
2260                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2261         }
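             /*
              * Each PQ_L2 queue index now has three static markers: one on
              * PQ_HOLD for the incremental hold-queue scan, and two
              * independent markers on PQ_ACTIVE (stat, used by
              * vm_pageout_page_stats(), and pact) so each incremental
              * active-queue scan keeps its own resume position without
              * disturbing the others.
              */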
2262
2263         /*
2264          * We only need to setup once.
2265          */
2266         isep = 0;
2267         if (curthread == emergpager) {
2268                 isep = 1;
2269                 goto skip_setup;
2270         }
2271
2272         /*
2273          * Initialize vm_max_launder per pageout pass to roughly 1/256 of
2274          * total physical memory (in pages), plus a little slop.
2275          */
2276         if (vm_max_launder == 0)
2277                 vm_max_launder = physmem / 256 + 16;
2278
2279         /*
2280          * Initialize some paging parameters.
2281          */
2282         vm_pageout_free_page_calc(vmstats.v_page_count);
2283
2284         /*
2285          * Basic pageout daemon paging operation settings
2286          */
2287         vmstats.v_free_target = vmstats.v_free_min * 2;
2288
2289         vmstats.v_paging_wait = vmstats.v_free_min * 2;
2290         vmstats.v_paging_start = vmstats.v_free_min * 3;
2291         vmstats.v_paging_target1 = vmstats.v_free_min * 4;
2292         vmstats.v_paging_target2 = vmstats.v_free_min * 5;
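             /*
              * Rough illustration, continuing the 4GiB example above
              * (v_free_min of about 5306 pages, assuming 4KiB pages):
              * v_free_target and v_paging_wait come to ~10612 pages,
              * v_paging_start ~15918, v_paging_target1 ~21224 and
              * v_paging_target2 ~26530 pages, i.e. roughly 100MiB of
              * free+cache headroom at the final target.
              */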
2293
2294         /*
2295          * NOTE: With the new buffer cache b_act_count we want the default
2296          *       inactive target to be a percentage of available memory.
2297          *
2298          *       The inactive target essentially determines the minimum
2299          *       number of 'temporary' pages capable of caching one-time-use
2300          *       files when the VM system is otherwise full of pages
2301          *       belonging to multi-time-use files or active program data.
2302          *
2303          * NOTE: The inactive target is aggressively pursued only if the
2304          *       inactive queue becomes too small.  If the inactive queue
2305          *       is large enough to satisfy page movement to free+cache
2306          *       then it is repopulated more slowly from the active queue.
2307          *       This allows a general inactive_target default to be set.
2308          *
2309          *       There is an issue here for processes which sit mostly idle
2310          *       'overnight', such as sshd, tcsh, and X.  Any movement from
2311          *       the active queue will eventually cause such pages to
2312          *       recycle, causing a lot of paging in the morning.  To
2313          *       reduce the incidence of this, pages cycled out of the
2314          *       buffer cache are moved directly to the inactive queue if
2315          *       they were only used once or twice.
2316          *
2317          *       The vfs.vm_cycle_point sysctl can be used to adjust this.
2318          *       Increasing the value (up to 64) increases the number of
2319          *       buffer recyclements which go directly to the inactive queue.
2320          *
2321          * NOTE: There is no separate 'cache target'.  The combined (free +
2322          *       cache) target is handled by the v_paging_* targets above.
2323          */
2324         vmstats.v_inactive_target = vmstats.v_free_count / 16;
2325         //vmstats.v_inactive_target = vmstats.v_free_min * 4;
2326
2327         /* XXX does not really belong here */
2328         if (vm_page_max_wired == 0)
2329                 vm_page_max_wired = vmstats.v_free_count / 3;
2330
2331         /*
2332          * page stats operation.
2333          *
2334          * scan - needs to be large enough for decent turn-around but
2335          *        not so large that it eats a ton of CPU.  Pages per run.
2336          *
2337          * ticks - interval per run in ticks.
2338          *
2339          * run  - number of seconds after the pagedaemon has run that
2340          *        we continue to collect page stats, after which we stop.
2341          *
2342          *        Calculated for 50% coverage.
2343          *
2344          */
2345         if (vm_pageout_stats_scan == 0) {
2346                 vm_pageout_stats_scan = vmstats.v_free_count / PQ_L2_SIZE / 16;
2347                 if (vm_pageout_stats_scan < 16)
2348                         vm_pageout_stats_scan = 16;
2349         }
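             /*
              * With this default, each incremental call to
              * vm_pageout_page_stats() is budgeted about
              * v_free_count / PQ_L2_SIZE / 16 pages (never less than 16) of
              * a single active queue, so a full sweep is spread across many
              * small runs instead of one large scan.
              */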
2350
2351         if (vm_pageout_stats_ticks == 0)
2352                 vm_pageout_stats_ticks = hz / 10;
2353
2354         vm_pagedaemon_uptime = time_uptime;
2355
2356         swap_pager_swap_init();
2357
2358         atomic_swap_int(&sequence_emerg_pager, 1);
2359         wakeup(&sequence_emerg_pager);
2360
2361 skip_setup:
2362         /*
2363          * Sequence emergency pager startup
2364          */
2365         if (isep) {
2366                 while (sequence_emerg_pager == 0)
2367                         tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
2368         }
2369
2370         pass = 0;
2371         warn_time = time_uptime;
2372
2373         /*
2374          * The pageout daemon is never done, so loop forever.
2375          *
2376          * WARNING!  This code is being executed by two kernel threads
2377          *           potentially simultaneously.
2378          */
2379         while (TRUE) {
2380                 int error;
2381                 long avail_shortage;
2382                 long inactive_shortage;
2383                 long vnodes_skipped = 0;
2384                 long recycle_count = 0;
2385                 long tmp;
2386
2387                 /*
2388                  * Don't let pass overflow
2389                  */
2390                 if (pass > 0x7FFF0000)
2391                         pass = 0x70000000;
2392
2393                 /*
2394                  * Wait for an action request.  If we time out, check to
2395                  * see if paging is needed (in case the normal wakeup
2396                  * code raced us).
2397                  */
2398                 if (isep) {
2399                         /*
2400                          * Emergency pagedaemon monitors the primary
2401                          * pagedaemon while vm_pages_needed != 0.
2402                          *
2403                          * The emergency pagedaemon only runs if VM paging
2404                          * is needed and the primary pagedaemon has not
2405                          * updated vm_pagedaemon_uptime for more than 2
2406                          * seconds.
2407                          */
2408                         if (vm_pages_needed)
2409                                 tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz);
2410                         else
2411                                 tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz*10);
2412                         if (vm_pages_needed == 0) {
2413                                 pass = 0;
2414                                 continue;
2415                         }
2416                         if ((int)(time_uptime - vm_pagedaemon_uptime) < 2) {
2417                                 pass = 0;
2418                                 continue;
2419                         }
2420                 } else {
2421                         /*
2422                          * Primary pagedaemon
2423                          *
2424                          * Do an unconditional partial scan to deal with
2425                          * PQ_HOLD races and to maintain active stats on
2426                          * pages that are in PQ_ACTIVE.
2427                          */
2428                         vm_pageout_scan_hold(q3iterator & PQ_L2_MASK,
2429                                       &markers[q3iterator & PQ_L2_MASK].hold);
2430                         vm_pageout_page_stats(q3iterator & PQ_L2_MASK,
2431                                       &markers[q3iterator & PQ_L2_MASK].stat,
2432                                       scounter);
2433                         ++q3iterator;
2434
2435                         /*
2436                          * Primary idle sleep loop, check condition after
2437                          * sleep.
2438                          *
2439                          * NOTE: State will not be IDLE if vm_pages_needed
2440                          *       is non-zero.
2441                          */
2442                         if (vm_pages_needed == 0) {
2443                                 error = tsleep(&vm_pages_needed,
2444                                                0, "psleep",
2445                                                vm_pageout_stats_ticks);
2446                                 if (error &&
2447                                     vm_paging_start(0) == 0 &&
2448                                     vm_pages_needed == 0)
2449                                 {
2450                                         continue;
2451                                 }
2452                                 vm_pagedaemon_uptime = time_uptime;
2453                                 vm_pages_needed = 1;
2454                                 state = PAGING_TARGET1;
2455
2456                                 /*
2457                                  * Wake the emergency pagedaemon up so it
2458                                  * can monitor us.  It will automatically
2459                                  * go back into a long sleep when
2460                                  * vm_pages_needed returns to 0.
2461                                  */
2462                                 wakeup(&vm_pagedaemon_uptime);
2463                         }
2464                 }
2465
2466                 mycpu->gd_cnt.v_pdwakeups++;
2467
2468                 /*
2469                  * Scan for INACTIVE->CLEAN/PAGEOUT
2470                  *
2471                  * This routine tries to avoid thrashing the system with
2472                  * unnecessary activity.
2473                  *
2474                  * Calculate our target for the number of free+cache pages we
2475                  * want to get to.  This is higher than the number that causes
2476                  * allocations to stall (severe) in order to provide hysteresis,
2477                  * and if we don't make it all the way but do get to the minimum
2478                  * we're happy.  Goose it a bit if there are multiple requests
2479                  * for memory.
2480                  *
2481                  * Don't reduce avail_shortage inside the loop or the
2482                  * PQAVERAGE() calculation will break.
2483                  *
2484                  * NOTE! deficit is differentiated from avail_shortage as
2485                  *       REQUIRING at least (deficit) pages to be cleaned,
2486                  *       even if the page queues are in good shape.  This
2487                  *       is used primarily for handling per-process
2488                  *       RLIMIT_RSS and may also see small values when
2489                  *       processes block due to low memory.
2490                  */
2491                 vmstats_rollup();
2492                 if (isep == 0)
2493                         vm_pagedaemon_uptime = time_uptime;
2494
2495                 if (state == PAGING_TARGET1) {
2496                         avail_shortage = vm_paging_target1_count() +
2497                                          vm_pageout_deficit;
2498                 } else {
2499                         avail_shortage = vm_paging_target2_count() +
2500                                          vm_pageout_deficit;
2501                 }
2502                 vm_pageout_deficit = 0;
2503
2504                 if (avail_shortage > 0) {
2505                         long delta = 0;
2506                         long counts[4] = { 0, 0, 0, 0 };
2507                         long use = avail_shortage;
2508                         int qq;
2509
2510                         if (vm_pageout_debug) {
2511                                 static time_t save_time3;
2512                                 if (save_time3 != time_uptime) {
2513                                         save_time3 = time_uptime;
2514                                         kprintf("scan_inactive "
2515                                                 "pass %d isep=%d\n",
2516                                                 pass, isep);
2517                                 }
2518                         }
2519
2520                         /*
2521                          * Once target1 is achieved we move on to target2,
2522                          * but pageout more lazily in smaller batches.
2523                          */
2524                         if (state == PAGING_TARGET2 &&
2525                             use > vmstats.v_inactive_target / 10)
2526                         {
2527                                 use = vmstats.v_inactive_target / 10 + 1;
2528                         }
2529
2530                         qq = q1iterator;
2531                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2532                                 delta += vm_pageout_scan_inactive(
2533                                             pass / MAXSCAN_DIVIDER,
2534                                             qq & PQ_L2_MASK,
2535                                             PQAVERAGE(use),
2536                                             &vnodes_skipped, counts);
2537                                 if (isep)
2538                                         --qq;
2539                                 else
2540                                         ++qq;
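                                      /*
                                       * The primary and emergency pagers walk
                                       * the inactive sub-queues in opposite
                                       * directions (++qq vs --qq), which
                                       * reduces the chance of both daemons
                                       * hammering the same sub-queue at the
                                       * same time.
                                       */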
2541                                 if (avail_shortage - delta <= 0)
2542                                         break;
2543
2544                                 /*
2545                                  * It is possible for avail_shortage to be
2546                                  * very large.  If a large program exits or
2547                                  * frees a ton of memory all at once, we do
2548                                  * not have to continue deactivations.
2549                                  *
2550                                  * (We will still run the active->inactive
2551                                  * target, however).
2552                                  */
2553                                 if (!vm_paging_target2() &&
2554                                     !vm_paging_min_dnc(vm_page_free_hysteresis)) {
2555                                         avail_shortage = 0;
2556                                         break;
2557                                 }
2558                         }
2559                         if (vm_pageout_debug) {
2560                                 static time_t save_time2;
2561                                 if (save_time2 != time_uptime) {
2562                                         save_time2 = time_uptime;
2563                                         kprintf("flsh %ld cln %ld "
2564                                                 "lru2 %ld react %ld "
2565                                                 "delta %ld\n",
2566                                                 counts[0], counts[1],
2567                                                 counts[2], counts[3],
2568                                                 delta);
2569                                 }
2570                         }
2571                         avail_shortage -= delta;
2572                         q1iterator = qq;
2573                 }
2574
2575                 /*
2576                  * Figure out how many active pages we must deactivate.  If
2577                  * we were able to reach our target with just the inactive
2578                  * scan above we limit the number of active pages we
2579                  * deactivate to reduce unnecessary work.
2580                  *
2581                  * When calculating inactive_shortage notice that we are
2582                  * departing from what vm_paging_inactive_count() does.
2583                  * During paging, the free + cache queues are assumed to
2584                  * be under stress, so only a pure inactive target is
2585                  * calculated without taking into account v_free_min,
2586                  * v_free_count, or v_cache_count.
2587                  */
2588                 vmstats_rollup();
2589                 if (isep == 0)
2590                         vm_pagedaemon_uptime = time_uptime;
2591                 inactive_shortage = vmstats.v_inactive_target -
2592                                     vmstats.v_inactive_count;
2593
2594                 /*
2595                  * If we were unable to free sufficient inactive pages to
2596                  * satisfy the free/cache queue requirements then simply
2597                  * reaching the inactive target may not be good enough.
2598                  * Try to deactivate pages in excess of the target based
2599                  * on the shortfall.
2600                  *
2601                  * However, to prevent thrashing the VM system, do not
2602                  * deactivate more than an additional 1/10 of the inactive
2603                  * target's worth of active pages.
2604                  */
2605                 if (avail_shortage > 0) {
2606                         tmp = avail_shortage * 2;
2607                         if (tmp > vmstats.v_inactive_target / 10)
2608                                 tmp = vmstats.v_inactive_target / 10;
2609                         inactive_shortage += tmp;
2610                 }
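                /*
                 * Illustrative example (hypothetical numbers): with an
                 * avail_shortage of 3000 pages and a v_inactive_target of
                 * 50000, tmp starts at 6000 but is clamped to 5000, so at
                 * most 5000 extra active pages are added to the target.
                 */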
2611
2612                 /*
2613                  * Only trigger a pmap cleanup on inactive shortage.
2614                  */
2615                 if (isep == 0 && inactive_shortage > 0) {
2616                         pmap_collect();
2617                 }
2618
2619                 /*
2620                  * Scan for ACTIVE->INACTIVE
2621                  *
2622                  * Only trigger on inactive shortage.  Triggering on
2623                  * avail_shortage can starve the active queue with
2624                  * unnecessary active->inactive transitions and destroy
2625                  * performance.
2626                  *
2627                  * If this is the emergency pager, always try to move
2628                  * a few pages from active to inactive because the inactive
2629                  * queue might have enough pages, but not enough anonymous
2630                  * pages.
2631                  */
2632                 if (isep && inactive_shortage < vm_emerg_launder)
2633                         inactive_shortage = vm_emerg_launder;
2634
2635                 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2636                         long delta = 0;
2637                         int qq;
2638
2639                         qq = q2iterator;
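                        /*
                         * Distribute both shortages across the PQ_L2_SIZE
                         * active sub-queues via PQAVERAGE(), breaking out
                         * early once the per-queue scans have retired both
                         * the inactive and avail shortages.
                         */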
2640                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2641                                 delta += vm_pageout_scan_active(
2642                                                 pass / MAXSCAN_DIVIDER,
2643                                                 qq & PQ_L2_MASK,
2644                                                 PQAVERAGE(avail_shortage),
2645                                                 PQAVERAGE(inactive_shortage),
2646                                                 &markers[qq & PQ_L2_MASK].pact,
2647                                                 &recycle_count);
2648                                 if (isep)
2649                                         --qq;
2650                                 else
2651                                         ++qq;
2652                                 if (inactive_shortage - delta <= 0 &&
2653                                     avail_shortage - delta <= 0) {
2654                                         break;
2655                                 }
2656
2657                                 /*
2658                                  * inactive_shortage can be a very large
2659                                  * number.  This is intended to break out
2660                                  * early if our inactive_target has been
2661                                  * reached due to other system activity.
2662                                  */
2663                                 if (vmstats.v_inactive_count >
2664                                     vmstats.v_inactive_target)
2665                                 {
2666                                         inactive_shortage = 0;
2667                                         break;
2668                                 }
2669                         }
2670                         inactive_shortage -= delta;
2671                         avail_shortage -= delta;
2672                         q2iterator = qq;
2673                 }
2674
2675                 /*
2676                  * Scan for CACHE->FREE
2677                  *
2678                  * Finally free enough cache pages to meet our free page
2679                  * requirement and take more drastic measures if we are
2680                  * still in trouble.
2681                  */
2682                 vmstats_rollup();
2683                 if (isep == 0)
2684                         vm_pagedaemon_uptime = time_uptime;
2685                 vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
2686                                       vnodes_skipped, recycle_count);
2687
2688                 /*
2689                  * This is a bit sophisticated: if we were able to successfully
2690                  * retire the shortage we calculated, we do not necessarily
2691                  * want to keep forcing paging until our targets are reached.
2692                  */
2693                 if (avail_shortage > 0) {
2694                         /*
2695                          * If we did not retire enough pages continue the
2696                          * pageout operation until we are able to.  It
2697                          * takes MAXSCAN_DIVIDER passes to cover the entire
2698                          * inactive list.
2699                          *
2700                          * We used to throw delays in here if paging went on
2701                          * continuously but that really just makes things
2702                          * worse.  Just keep going.
2703                          */
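                        /*
                         * Track how long we have been paging continuously
                         * and, from the normal pagedaemon only, warn at
                         * most once every 60 seconds.
                         */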
2704                         if (pass == 0)
2705                                 warn_time = time_uptime;
2706                         ++pass;
2707                         if (isep == 0 && time_uptime - warn_time >= 60) {
2708                                 kprintf("pagedaemon: WARNING! Continuous "
2709                                         "paging for %ld minutes\n",
2710                                         (time_uptime - warn_time) / 60);
2711                                 warn_time = time_uptime;
2712                         }
2713
2714                         if (vm_pages_needed) {
2715                                 /*
2716                                  * Normal operation, additional processes
2717                                  * have already kicked us.  Retry immediately
2718                                  * unless swap space is completely full in
2719                                  * which case delay a bit.
2720                                  */
2721                                 if (swap_pager_full) {
2722                                         tsleep(&vm_pages_needed, 0, "pdelay",
2723                                                 hz / 5);
2724                                 } /* else immediate loop */
2725                         } /* else immediate loop */
2726                 } else {
2727                         /*
2728                          * Reset pass
2729                          */
2730                         pass = 0;
2731
2732                         if (vm_paging_start(0) ||
2733                             vm_paging_min_dnc(vm_page_free_hysteresis))
2734                         {
2735                                 /*
2736                                  * Pages sufficiently exhausted to start
2737                                  * page-daemon in TARGET1 mode
2738                                  */
2739                                 state = PAGING_TARGET1;
2740                                 vm_pages_needed = 2;
2741
2742                                 /*
2743                                  * We can wakeup waiters if we are above
2744                                  * the wait point.
2745                                  */
2746                                 if (!vm_paging_wait())
2747                                         wakeup(&vmstats.v_free_count);
2748                         } else if (vm_pages_needed) {
2749                                 /*
2750                                  * Continue paging until TARGET2 reached,
2751                                  * but waiters can be woken up.
2752                                  *
2753                                  * The PAGING_TARGET2 state tells the
2754                                  * pagedaemon to work a little less hard.
2755                                  */
2756                                 if (vm_paging_target1()) {
2757                                         state = PAGING_TARGET1;
2758                                         vm_pages_needed = 2;
2759                                 } else if (vm_paging_target2()) {
2760                                         state = PAGING_TARGET2;
2761                                         vm_pages_needed = 2;
2762                                 } else {
2763                                         vm_pages_needed = 0;
2764                                 }
2765                                 wakeup(&vmstats.v_free_count);
2766                         } /* else nothing to do here */
2767                 }
2768         }
2769 }
2770
2771 static struct kproc_desc pg1_kp = {
2772         "pagedaemon",
2773         vm_pageout_thread,
2774         &pagethread
2775 };
2776 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);
2777
2778 static struct kproc_desc pg2_kp = {
2779         "emergpager",
2780         vm_pageout_thread,
2781         &emergpager
2782 };
2783 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);
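/*
 * Both kernel threads enter through vm_pageout_thread(); they differ
 * only in the thread pointer they are registered with (pagethread vs
 * emergpager), which presumably is how the emergency-pager (isep)
 * cases above are distinguished.
 */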
2784
2785
2786 /*
2787  * Called after allocating a page out of the cache or free queue
2788  * to possibly wake the pagedaemon up to replenish our supply.
2789  *
2790  * We try to generate some hysteresis by waking the pagedaemon up
2791  * when our free+cache pages go below the free_min+cache_min level.
2792  * The pagedaemon tries to get the count back up to at least the
2793  * minimum, and through to the target level if possible.
2794  *
2795  * If the pagedaemon is already active, bump vm_pages_needed as a hint
2796  * that there are even more requests pending.
2797  *
2798  * SMP races ok?
2799  * No requirements.
2800  */
2801 void
2802 pagedaemon_wakeup(void)
2803 {
2804         if (vm_paging_start(0) && curthread != pagethread) {
2805                 if (vm_pages_needed <= 1) {
2806                         vm_pages_needed = 1;            /* SMP race ok */
2807                         wakeup(&vm_pages_needed);       /* tickle pageout */
2808                 } else if (vm_paging_min()) {
2809                         ++vm_pages_needed;              /* SMP race ok */
2810                         /* a wakeup() would be wasted here */
2811                 }
2812         }
2813 }
2814
2815 #if !defined(NO_SWAPPING)
2816
2817 /*
2818  * SMP races ok?
2819  * No requirements.
2820  */
2821 static void
2822 vm_req_vmdaemon(void)
2823 {
2824         static int lastrun = 0;
2825
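        /*
         * Rate-limit wakeups of vm_daemon to roughly once per second
         * (hz ticks); the ticks < lastrun test also catches the tick
         * counter wrapping or jumping backwards.
         */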
2826         if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2827                 wakeup(&vm_daemon_needed);
2828                 lastrun = ticks;
2829         }
2830 }
2831
2832 static int vm_daemon_callback(struct proc *p, void *data __unused);
2833
2834 /*
2835  * No requirements.
2836  *
2837  * Scan processes for RSS rlimit violations and deactivate pages
2838  * when the RSS limit is exceeded.
2839  */
2840 static void
2841 vm_daemon(void)
2842 {
2843         while (TRUE) {
2844                 tsleep(&vm_daemon_needed, 0, "psleep", 0);
2845                 allproc_scan(vm_daemon_callback, NULL, 0);
2846         }
2847 }
2848
2849 static int
2850 vm_daemon_callback(struct proc *p, void *data __unused)
2851 {
2852         struct vmspace *vm;
2853         vm_pindex_t limit, size;
2854
2855         /*
2856          * If this is a system process or a process that is already
2857          * exiting, skip it.
2858          */
2859         lwkt_gettoken(&p->p_token);
2860
2861         if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2862                 lwkt_reltoken(&p->p_token);
2863                 return (0);
2864         }
2865
2866         /*
2867          * If the process is in a non-running state,
2868          * don't touch it.
2869          */
2870         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2871                 lwkt_reltoken(&p->p_token);
2872                 return (0);
2873         }
2874
2875         /*
2876          * Get the RSS rlimit, converted from bytes to pages.
2877          */
2878         limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2879                                 p->p_rlimit[RLIMIT_RSS].rlim_max));
2880
2881         vm = p->p_vmspace;
2882         vmspace_hold(vm);
2883         size = pmap_resident_tlnw_count(&vm->vm_pmap);
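        /*
         * Deactivate pages only when the resident (tlnw) count exceeds
         * the RSS limit by at least a 4096-page slop and
         * vm_pageout_memuse_mode is enabled (>= 1).
         */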
2884         if (limit >= 0 && size > 4096 &&
2885             size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
2886                 vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2887         }
2888         vmspace_drop(vm);
2889
2890         lwkt_reltoken(&p->p_token);
2891
2892         return (0);
2893 }
2894
2895 #endif