sys/vm/vm_pageout.c

   1 /*
   2  * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1991 Regents of the University of California.
  35  * All rights reserved.
  36  * Copyright (c) 1994 John S. Dyson
  37  * All rights reserved.
  38  * Copyright (c) 1994 David Greenman
  39  * All rights reserved.
  40  *
  41  * This code is derived from software contributed to Berkeley by
  42  * The Mach Operating System project at Carnegie-Mellon University.
  43  *
  44  * Redistribution and use in source and binary forms, with or without
  45  * modification, are permitted provided that the following conditions
  46  * are met:
  47  * 1. Redistributions of source code must retain the above copyright
  48  *    notice, this list of conditions and the following disclaimer.
  49  * 2. Redistributions in binary form must reproduce the above copyright
  50  *    notice, this list of conditions and the following disclaimer in the
  51  *    documentation and/or other materials provided with the distribution.
  52  * 3. Neither the name of the University nor the names of its contributors
  53  *    may be used to endorse or promote products derived from this software
  54  *    without specific prior written permission.
  55  *
  56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  66  * SUCH DAMAGE.
  67  *
  68  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
  69  *
  70  *
  71  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  72  * All rights reserved.
  73  *
  74  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  75  *
  76  * Permission to use, copy, modify and distribute this software and
  77  * its documentation is hereby granted, provided that both the copyright
  78  * notice and this permission notice appear in all copies of the
  79  * software, derivative works or modified versions, and any portions
  80  * thereof, and that both notices appear in supporting documentation.
  81  *
  82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  85  *
  86  * Carnegie Mellon requests users of this software to return to
  87  *
  88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  89  *  School of Computer Science
  90  *  Carnegie Mellon University
  91  *  Pittsburgh PA 15213-3890
  92  *
  93  * any improvements or extensions that they make and grant Carnegie the
  94  * rights to redistribute these changes.
  95  */
  96
  97 /*
  98  * The proverbial page-out daemon, rewritten many times over the decades.
  99  */
 100
 101 #include "opt_vm.h"
 102 #include <sys/param.h>
 103 #include <sys/systm.h>
 104 #include <sys/kernel.h>
 105 #include <sys/proc.h>
 106 #include <sys/kthread.h>
 107 #include <sys/resourcevar.h>
 108 #include <sys/signalvar.h>
 109 #include <sys/vnode.h>
 110 #include <sys/vmmeter.h>
 111 #include <sys/conf.h>
 112 #include <sys/sysctl.h>
 113
 114 #include <vm/vm.h>
 115 #include <vm/vm_param.h>
 116 #include <sys/lock.h>
 117 #include <vm/vm_object.h>
 118 #include <vm/vm_page.h>
 119 #include <vm/vm_map.h>
 120 #include <vm/vm_pageout.h>
 121 #include <vm/vm_pager.h>
 122 #include <vm/swap_pager.h>
 123 #include <vm/vm_extern.h>
 124
 125 #include <sys/spinlock2.h>
 126 #include <vm/vm_page2.h>
 127
 128 /*
 129  * System initialization
 130  */
 131
 132 /* the kernel process "vm_pageout"*/
 133 static int vm_pageout_page(vm_page_t m, long *max_launderp,
 134                            long *vnodes_skippedp, struct vnode **vpfailedp,
 135                            int pass, int vmflush_flags, long *counts);
 136 static int vm_pageout_clean_helper (vm_page_t, int);
 137 static void vm_pageout_free_page_calc (vm_size_t count);
 138 static void vm_pageout_page_free(vm_page_t m) ;
 139 __read_frequently struct thread *emergpager;
 140 __read_frequently struct thread *pagethread;
 141 static int sequence_emerg_pager;
 142
 143 #if !defined(NO_SWAPPING)
 144 /* the kernel process "vm_daemon"*/
 145 static void vm_daemon (void);
 146 static struct   thread *vmthread;
 147
 148 static struct kproc_desc vm_kp = {
 149         "vmdaemon",
 150         vm_daemon,
 151         &vmthread
 152 };
 153 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
 154 #endif
 155
 156 __read_mostly int vm_pages_needed = 0;  /* pageout daemon tsleep event */
 157 __read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */
 158 __read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
 159 __read_mostly int vm_page_free_hysteresis = 16;
 160 __read_mostly static int vm_pagedaemon_time;
 161
 162 #if !defined(NO_SWAPPING)
 163 static int vm_daemon_needed;
 164 #endif
 165 __read_mostly static int vm_max_launder = 0;
 166 __read_mostly static int vm_emerg_launder = 100;
 167 __read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
 168 __read_mostly static int vm_pageout_full_stats_interval = 0;
 169 __read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
 170 __read_mostly static int defer_swap_pageouts=0;
 171 __read_mostly static int disable_swap_pageouts=0;
 172 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
 173 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
 174 __read_mostly static int vm_pageout_debug;
 175
 176 #if defined(NO_SWAPPING)
 177 __read_mostly static int vm_swap_enabled=0;
 178 #else
 179 __read_mostly static int vm_swap_enabled=1;
 180 #endif
 181
 182 /* 0-disable, 1-passive, 2-active swp, 3-acive swp + single-queue dirty pages*/
 183 __read_mostly int vm_pageout_memuse_mode=2;
 184 __read_mostly int vm_pageout_allow_active=1;
 185
 186 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
 187         CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
 188
 189 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
 190         CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
 191
 192 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
 193         CTLFLAG_RW, &vm_page_free_hysteresis, 0,
 194         "Free more pages than the minimum required");
 195
 196 SYSCTL_INT(_vm, OID_AUTO, max_launder,
 197         CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
 198 SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
 199         CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");
 200
 201 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
 202         CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
 203
 204 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
 205         CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
 206
 207 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
 208         CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
 209
 210 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
 211         CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
 212 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
 213         CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
 214 SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
 215         CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
 216 SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
 217         CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");
 218
 219
 220 #if defined(NO_SWAPPING)
 221 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
 222         CTLFLAG_RD, &vm_swap_enabled, 0, "");
 223 #else
 224 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
 225         CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
 226 #endif
 227
 228 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
 229         CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
 230
 231 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
 232         CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 233
 234 static int pageout_lock_miss;
 235 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
 236         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 237
 238 int vm_page_max_wired;          /* XXX max # of wired pages system-wide */
 239
 240 #if !defined(NO_SWAPPING)
 241 static void vm_req_vmdaemon (void);
 242 #endif
 243 static void vm_pageout_page_stats(int q);
 244
 245 #define MAXSCAN_DIVIDER         10
 246
 247 /*
 248  * Calculate approximately how many pages on each queue to try to
 249  * clean.  An exact calculation creates an edge condition when the
 250  * queues are unbalanced so add significant slop.  The queue scans
 251  * will stop early when targets are reached and will start where they
 252  * left off on the next pass.
 253  *
 254  * We need to be generous here because there are all sorts of loading
 255  * conditions that can cause edge cases if try to average over all queues.
 256  * In particular, storage subsystems have become so fast that paging
 257  * activity can become quite frantic.  Eventually we will probably need
 258  * two paging threads, one for dirty pages and one for clean, to deal
 259  * with the bandwidth requirements.
 260
 261  * So what we do is calculate a value that can be satisfied nominally by
 262  * only having to scan half the queues.
 263  */
 264 static __inline long
 265 PQAVERAGE(long n)
 266 {
 267         long avg;
 268
 269         if (n >= 0) {
 270                 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
 271         } else {
 272                 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
 273         }
 274         return avg;
 275 }
 276
 277 /*
 278  * vm_pageout_clean_helper:
 279  *
 280  * Clean the page and remove it from the laundry.  The page must be busied
 281  * by the caller and will be disposed of (put away, flushed) by this routine.
 282  */
 283 static int
 284 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
 285 {
 286         vm_object_t object;
 287         vm_page_t mc[BLIST_MAX_ALLOC];
 288         int error;
 289         int ib, is, page_base;
 290         vm_pindex_t pindex = m->pindex;
 291
 292         object = m->object;
 293
 294         /*
 295          * Don't mess with the page if it's held or special.  Theoretically
 296          * we can pageout held pages but there is no real need to press our
 297          * luck, so don't.
 298          */
 299         if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
 300                 vm_page_wakeup(m);
 301                 return 0;
 302         }
 303
 304         /*
 305          * Place page in cluster.  Align cluster for optimal swap space
 306          * allocation (whether it is swap or not).  This is typically ~16-32
 307          * pages, which also tends to align the cluster to multiples of the
 308          * filesystem block size if backed by a filesystem.
 309          */
 310         page_base = pindex % BLIST_MAX_ALLOC;
 311         mc[page_base] = m;
 312         ib = page_base - 1;
 313         is = page_base + 1;
 314
 315         /*
 316          * Scan object for clusterable pages.
 317          *
 318          * We can cluster ONLY if: ->> the page is NOT
 319          * clean, wired, busy, held, or mapped into a
 320          * buffer, and one of the following:
 321          * 1) The page is inactive, or a seldom used
 322          *    active page.
 323          * -or-
 324          * 2) we force the issue.
 325          *
 326          * During heavy mmap/modification loads the pageout
 327          * daemon can really fragment the underlying file
 328          * due to flushing pages out of order and not trying
 329          * align the clusters (which leave sporatic out-of-order
 330          * holes).  To solve this problem we do the reverse scan
 331          * first and attempt to align our cluster, then do a
 332          * forward scan if room remains.
 333          */
 334         vm_object_hold(object);
 335
 336         while (ib >= 0) {
 337                 vm_page_t p;
 338
 339                 p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
 340                                             TRUE, &error);
 341                 if (error || p == NULL)
 342                         break;
 343                 if ((p->queue - p->pc) == PQ_CACHE ||
 344                     (p->flags & PG_UNQUEUED)) {
 345                         vm_page_wakeup(p);
 346                         break;
 347                 }
 348                 vm_page_test_dirty(p);
 349                 if (((p->dirty & p->valid) == 0 &&
 350                      (p->flags & PG_NEED_COMMIT) == 0) ||
 351                     p->wire_count != 0 ||       /* may be held by buf cache */
 352                     p->hold_count != 0) {       /* may be undergoing I/O */
 353                         vm_page_wakeup(p);
 354                         break;
 355                 }
 356                 if (p->queue - p->pc != PQ_INACTIVE) {
 357                         if (p->queue - p->pc != PQ_ACTIVE ||
 358                             (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
 359                                 vm_page_wakeup(p);
 360                                 break;
 361                         }
 362                 }
 363
 364                 /*
 365                  * Try to maintain page groupings in the cluster.
 366                  */
 367                 if (m->flags & PG_WINATCFLS)
 368                         vm_page_flag_set(p, PG_WINATCFLS);
 369                 else
 370                         vm_page_flag_clear(p, PG_WINATCFLS);
 371                 p->act_count = m->act_count;
 372
 373                 mc[ib] = p;
 374                 --ib;
 375         }
 376         ++ib;   /* fixup */
 377
 378         while (is < BLIST_MAX_ALLOC &&
 379                pindex - page_base + is < object->size) {
 380                 vm_page_t p;
 381
 382                 p = vm_page_lookup_busy_try(object, pindex - page_base + is,
 383                                             TRUE, &error);
 384                 if (error || p == NULL)
 385                         break;
 386                 if (((p->queue - p->pc) == PQ_CACHE) ||
 387                     (p->flags & PG_UNQUEUED)) {
 388                         vm_page_wakeup(p);
 389                         break;
 390                 }
 391                 vm_page_test_dirty(p);
 392                 if (((p->dirty & p->valid) == 0 &&
 393                      (p->flags & PG_NEED_COMMIT) == 0) ||
 394                     p->wire_count != 0 ||       /* may be held by buf cache */
 395                     p->hold_count != 0) {       /* may be undergoing I/O */
 396                         vm_page_wakeup(p);
 397                         break;
 398                 }
 399                 if (p->queue - p->pc != PQ_INACTIVE) {
 400                         if (p->queue - p->pc != PQ_ACTIVE ||
 401                             (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
 402                                 vm_page_wakeup(p);
 403                                 break;
 404                         }
 405                 }
 406
 407                 /*
 408                  * Try to maintain page groupings in the cluster.
 409                  */
 410                 if (m->flags & PG_WINATCFLS)
 411                         vm_page_flag_set(p, PG_WINATCFLS);
 412                 else
 413                         vm_page_flag_clear(p, PG_WINATCFLS);
 414                 p->act_count = m->act_count;
 415
 416                 mc[is] = p;
 417                 ++is;
 418         }
 419
 420         vm_object_drop(object);
 421
 422         /*
 423          * we allow reads during pageouts...
 424          */
 425         return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
 426 }
 427
 428 /*
 429  * vm_pageout_flush() - launder the given pages
 430  *
 431  *      The given pages are laundered.  Note that we setup for the start of
 432  *      I/O ( i.e. busy the page ), mark it read-only, and bump the object
 433  *      reference count all in here rather then in the parent.  If we want
 434  *      the parent to do more sophisticated things we may have to change
 435  *      the ordering.
 436  *
 437  *      The pages in the array must be busied by the caller and will be
 438  *      unbusied by this function.
 439  */
 440 int
 441 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
 442 {
 443         vm_object_t object;
 444         int pageout_status[count];
 445         int numpagedout = 0;
 446         int i;
 447
 448         /*
 449          * Initiate I/O.  Bump the vm_page_t->busy counter.
 450          */
 451         for (i = 0; i < count; i++) {
 452                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
 453                         ("vm_pageout_flush page %p index %d/%d: partially "
 454                          "invalid page", mc[i], i, count));
 455                 vm_page_io_start(mc[i]);
 456         }
 457
 458         /*
 459          * We must make the pages read-only.  This will also force the
 460          * modified bit in the related pmaps to be cleared.  The pager
 461          * cannot clear the bit for us since the I/O completion code
 462          * typically runs from an interrupt.  The act of making the page
 463          * read-only handles the case for us.
 464          *
 465          * Then we can unbusy the pages, we still hold a reference by virtue
 466          * of our soft-busy.
 467          */
 468         for (i = 0; i < count; i++) {
 469                 if (vmflush_flags & OBJPC_TRY_TO_CACHE)
 470                         vm_page_protect(mc[i], VM_PROT_NONE);
 471                 else
 472                         vm_page_protect(mc[i], VM_PROT_READ);
 473                 vm_page_wakeup(mc[i]);
 474         }
 475
 476         object = mc[0]->object;
 477         vm_object_pip_add(object, count);
 478
 479         vm_pager_put_pages(object, mc, count,
 480                            (vmflush_flags |
 481                             ((object == &kernel_object) ?
 482                                 OBJPC_SYNC : 0)),
 483                            pageout_status);
 484
 485         for (i = 0; i < count; i++) {
 486                 vm_page_t mt = mc[i];
 487
 488                 switch (pageout_status[i]) {
 489                 case VM_PAGER_OK:
 490                         numpagedout++;
 491                         break;
 492                 case VM_PAGER_PEND:
 493                         numpagedout++;
 494                         break;
 495                 case VM_PAGER_BAD:
 496                         /*
 497                          * Page outside of range of object. Right now we
 498                          * essentially lose the changes by pretending it
 499                          * worked.
 500                          */
 501                         vm_page_busy_wait(mt, FALSE, "pgbad");
 502                         pmap_clear_modify(mt);
 503                         vm_page_undirty(mt);
 504                         vm_page_wakeup(mt);
 505                         break;
 506                 case VM_PAGER_ERROR:
 507                 case VM_PAGER_FAIL:
 508                         /*
 509                          * A page typically cannot be paged out when we
 510                          * have run out of swap.  We leave the page
 511                          * marked inactive and will try to page it out
 512                          * again later.
 513                          *
 514                          * Starvation of the active page list is used to
 515                          * determine when the system is massively memory
 516                          * starved.
 517                          */
 518                         break;
 519                 case VM_PAGER_AGAIN:
 520                         break;
 521                 }
 522
 523                 /*
 524                  * If not PENDing this was a synchronous operation and we
 525                  * clean up after the I/O.  If it is PENDing the mess is
 526                  * cleaned up asynchronously.
 527                  *
 528                  * Also nominally act on the caller's wishes if the caller
 529                  * wants to try to really clean (cache or free) the page.
 530                  *
 531                  * Also nominally deactivate the page if the system is
 532                  * memory-stressed.
 533                  */
 534                 if (pageout_status[i] != VM_PAGER_PEND) {
 535                         vm_page_busy_wait(mt, FALSE, "pgouw");
 536                         vm_page_io_finish(mt);
 537                         if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
 538                                 vm_page_try_to_cache(mt);
 539                         } else if (vm_page_count_severe()) {
 540                                 vm_page_deactivate(mt);
 541                                 vm_page_wakeup(mt);
 542                         } else {
 543                                 vm_page_wakeup(mt);
 544                         }
 545                         vm_object_pip_wakeup(object);
 546                 }
 547         }
 548         return numpagedout;
 549 }
 550
 551 #if !defined(NO_SWAPPING)
 552
 553 /*
 554  * Callback function, page busied for us.  We must dispose of the busy
 555  * condition.  Any related pmap pages may be held but will not be locked.
 556  */
 557 static
 558 int
 559 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
 560                         vm_page_t p)
 561 {
 562         int actcount;
 563         int cleanit = 0;
 564
 565         /*
 566          * Basic tests - There should never be a marker, and we can stop
 567          *               once the RSS is below the required level.
 568          */
 569         KKASSERT((p->flags & PG_MARKER) == 0);
 570         if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
 571                 vm_page_wakeup(p);
 572                 return(-1);
 573         }
 574
 575         mycpu->gd_cnt.v_pdpages++;
 576
 577         if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
 578                 vm_page_wakeup(p);
 579                 goto done;
 580         }
 581
 582         ++info->actioncount;
 583
 584         /*
 585          * Check if the page has been referened recently.  If it has,
 586          * activate it and skip.
 587          */
 588         actcount = pmap_ts_referenced(p);
 589         if (actcount) {
 590                 vm_page_flag_set(p, PG_REFERENCED);
 591         } else if (p->flags & PG_REFERENCED) {
 592                 actcount = 1;
 593         }
 594
 595         if (actcount) {
 596                 if (p->queue - p->pc != PQ_ACTIVE) {
 597                         vm_page_and_queue_spin_lock(p);
 598                         if (p->queue - p->pc != PQ_ACTIVE) {
 599                                 vm_page_and_queue_spin_unlock(p);
 600                                 vm_page_activate(p);
 601                         } else {
 602                                 vm_page_and_queue_spin_unlock(p);
 603                         }
 604                 } else {
 605                         p->act_count += actcount;
 606                         if (p->act_count > ACT_MAX)
 607                                 p->act_count = ACT_MAX;
 608                 }
 609                 vm_page_flag_clear(p, PG_REFERENCED);
 610                 vm_page_wakeup(p);
 611                 goto done;
 612         }
 613
 614         /*
 615          * Remove the page from this particular pmap.  Once we do this, our
 616          * pmap scans will not see it again (unless it gets faulted in), so
 617          * we must actively dispose of or deal with the page.
 618          */
 619         pmap_remove_specific(info->pmap, p);
 620
 621         /*
 622          * If the page is not mapped to another process (i.e. as would be
 623          * typical if this were a shared page from a library) then deactivate
 624          * the page and clean it in two passes only.
 625          *
 626          * If the page hasn't been referenced since the last check, remove it
 627          * from the pmap.  If it is no longer mapped, deactivate it
 628          * immediately, accelerating the normal decline.
 629          *
 630          * Once the page has been removed from the pmap the RSS code no
 631          * longer tracks it so we have to make sure that it is staged for
 632          * potential flush action.
 633          *
 634          * XXX
 635          */
 636         if ((p->flags & PG_MAPPED) == 0 ||
 637             (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
 638                 if (p->queue - p->pc == PQ_ACTIVE) {
 639                         vm_page_deactivate(p);
 640                 }
 641                 if (p->queue - p->pc == PQ_INACTIVE) {
 642                         cleanit = 1;
 643                 }
 644         }
 645
 646         /*
 647          * Ok, try to fully clean the page and any nearby pages such that at
 648          * least the requested page is freed or moved to the cache queue.
 649          *
 650          * We usually do this synchronously to allow us to get the page into
 651          * the CACHE queue quickly, which will prevent memory exhaustion if
 652          * a process with a memoryuse limit is running away.  However, the
 653          * sysadmin may desire to set vm.swap_user_async which relaxes this
 654          * and improves write performance.
 655          */
 656         if (cleanit) {
 657                 long max_launder = 0x7FFF;
 658                 long vnodes_skipped = 0;
 659                 long counts[4] = { 0, 0, 0, 0 };
 660                 int vmflush_flags;
 661                 struct vnode *vpfailed = NULL;
 662
 663                 info->offset = va;
 664
 665                 if (vm_pageout_memuse_mode >= 2) {
 666                         vmflush_flags = OBJPC_TRY_TO_CACHE |
 667                                         OBJPC_ALLOW_ACTIVE;
 668                         if (swap_user_async == 0)
 669                                 vmflush_flags |= OBJPC_SYNC;
 670                         vm_page_flag_set(p, PG_WINATCFLS);
 671                         info->cleancount +=
 672                                 vm_pageout_page(p, &max_launder,
 673                                                 &vnodes_skipped,
 674                                                 &vpfailed, 1, vmflush_flags,
 675                                                 counts);
 676                 } else {
 677                         vm_page_wakeup(p);
 678                         ++info->cleancount;
 679                 }
 680         } else {
 681                 vm_page_wakeup(p);
 682         }
 683
 684         /*
 685          * Must be at end to avoid SMP races.
 686          */
 687 done:
 688         lwkt_user_yield();
 689         return 0;
 690 }
 691
 692 /*
 693  * Deactivate some number of pages in a map due to set RLIMIT_RSS limits.
 694  * that is relatively difficult to do.  We try to keep track of where we
 695  * left off last time to reduce scan overhead.
 696  *
 697  * Called when vm_pageout_memuse_mode is >= 1.
 698  */
 699 void
 700 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
 701 {
 702         vm_offset_t pgout_offset;
 703         struct pmap_pgscan_info info;
 704         int retries = 3;
 705
 706         pgout_offset = map->pgout_offset;
 707 again:
 708 #if 0
 709         kprintf("%016jx ", pgout_offset);
 710 #endif
 711         if (pgout_offset < VM_MIN_USER_ADDRESS)
 712                 pgout_offset = VM_MIN_USER_ADDRESS;
 713         if (pgout_offset >= VM_MAX_USER_ADDRESS)
 714                 pgout_offset = 0;
 715         info.pmap = vm_map_pmap(map);
 716         info.limit = limit;
 717         info.beg_addr = pgout_offset;
 718         info.end_addr = VM_MAX_USER_ADDRESS;
 719         info.callback = vm_pageout_mdp_callback;
 720         info.cleancount = 0;
 721         info.actioncount = 0;
 722         info.busycount = 0;
 723
 724         pmap_pgscan(&info);
 725         pgout_offset = info.offset;
 726 #if 0
 727         kprintf("%016jx %08lx %08lx\n", pgout_offset,
 728                 info.cleancount, info.actioncount);
 729 #endif
 730
 731         if (pgout_offset != VM_MAX_USER_ADDRESS &&
 732             pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
 733                 goto again;
 734         } else if (retries &&
 735                    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
 736                 --retries;
 737                 goto again;
 738         }
 739         map->pgout_offset = pgout_offset;
 740 }
 741 #endif
 742
 743 /*
 744  * Called when the pageout scan wants to free a page.  We no longer
 745  * try to cycle the vm_object here with a reference & dealloc, which can
 746  * cause a non-trivial object collapse in a critical path.
 747  *
 748  * It is unclear why we cycled the ref_count in the past, perhaps to try
 749  * to optimize shadow chain collapses but I don't quite see why it would
 750  * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 751  * synchronously and not have to be kicked-start.
 752  */
 753 static void
 754 vm_pageout_page_free(vm_page_t m)
 755 {
 756         vm_page_protect(m, VM_PROT_NONE);
 757         vm_page_free(m);
 758 }
 759
 760 /*
 761  * vm_pageout_scan does the dirty work for the pageout daemon.
 762  */
 763 struct vm_pageout_scan_info {
 764         struct proc *bigproc;
 765         vm_offset_t bigsize;
 766 };
 767
 768 static int vm_pageout_scan_callback(struct proc *p, void *data);
 769
 770 /*
 771  * Scan inactive queue
 772  *
 773  * WARNING! Can be called from two pagedaemon threads simultaneously.
 774  */
 775 static int
 776 vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
 777                          long *vnodes_skipped, long *counts)
 778 {
 779         vm_page_t m;
 780         struct vm_page marker;
 781         struct vnode *vpfailed;         /* warning, allowed to be stale */
 782         long maxscan;
 783         long delta = 0;
 784         long max_launder;
 785         int isep;
 786         int vmflush_flags;
 787
 788         isep = (curthread == emergpager);
 789         if ((unsigned)pass > 1000)
 790                 pass = 1000;
 791
 792         /*
 793          * This routine is called for each of PQ_L2_SIZE inactive queues.
 794          * We want the vm_max_launder parameter to apply to the whole
 795          * queue (i.e. per-whole-queue pass, not per-sub-queue).
 796          *
 797          * In each successive full-pass when the page target is not met we
 798          * allow the per-queue max_launder to increase up to a maximum of
 799          * vm_max_launder / 16.
 800          */
 801         if (pass)
 802                 max_launder = (long)vm_max_launder * (pass + 1) / PQ_L2_SIZE;
 803         else
 804                 max_launder = (long)vm_max_launder / PQ_L2_SIZE;
 805         max_launder /= MAXSCAN_DIVIDER;
 806
 807         if (max_launder <= 1)
 808                 max_launder = 1;
 809         if (max_launder >= vm_max_launder / 16)
 810                 max_launder = vm_max_launder / 16 + 1;
 811
 812         /*
 813          * Start scanning the inactive queue for pages we can move to the
 814          * cache or free.  The scan will stop when the target is reached or
 815          * we have scanned the entire inactive queue.  Note that m->act_count
 816          * is not used to form decisions for the inactive queue, only for the
 817          * active queue.
 818          *
 819          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
 820          *        PAGES.
 821          */
 822
 823         /*
 824          * Initialize our marker
 825          */
 826         bzero(&marker, sizeof(marker));
 827         marker.flags = PG_FICTITIOUS | PG_MARKER;
 828         marker.busy_count = PBUSY_LOCKED;
 829         marker.queue = PQ_INACTIVE + q;
 830         marker.pc = q;
 831         marker.wire_count = 1;
 832
 833         /*
 834          * Inactive queue scan.
 835          *
 836          * We pick off approximately 1/10 of each queue.  Each queue is
 837          * effectively organized LRU so scanning the entire queue would
 838          * improperly pick up pages that might still be in regular use.
 839          *
 840          * NOTE: The vm_page must be spinlocked before the queue to avoid
 841          *       deadlocks, so it is easiest to simply iterate the loop
 842          *       with the queue unlocked at the top.
 843          */
 844         vpfailed = NULL;
 845
 846         vm_page_queues_spin_lock(PQ_INACTIVE + q);
 847         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
 848         maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;
 849
 850         /*
 851          * Queue locked at top of loop to avoid stack marker issues.
 852          */
 853         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
 854                maxscan-- > 0 && avail_shortage - delta > 0)
 855         {
 856                 int count;
 857
 858                 KKASSERT(m->queue == PQ_INACTIVE + q);
 859                 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
 860                              &marker, pageq);
 861                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
 862                                    &marker, pageq);
 863                 mycpu->gd_cnt.v_pdpages++;
 864
 865                 /*
 866                  * Skip marker pages (atomic against other markers to avoid
 867                  * infinite hop-over scans).
 868                  */
 869                 if (m->flags & PG_MARKER)
 870                         continue;
 871
 872                 /*
 873                  * Try to busy the page.  Don't mess with pages which are
 874                  * already busy or reorder them in the queue.
 875                  */
 876                 if (vm_page_busy_try(m, TRUE))
 877                         continue;
 878
 879                 /*
 880                  * Remaining operations run with the page busy and neither
 881                  * the page or the queue will be spin-locked.
 882                  */
 883                 KKASSERT(m->queue == PQ_INACTIVE + q);
 884                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 885
 886                 /*
 887                  * The emergency pager runs when the primary pager gets
 888                  * stuck, which typically means the primary pager deadlocked
 889                  * on a vnode-backed page.  Therefore, the emergency pager
 890                  * must skip any complex objects.
 891                  *
 892                  * We disallow VNODEs unless they are VCHR whos device ops
 893                  * does not flag D_NOEMERGPGR.
 894                  */
 895                 if (isep && m->object) {
 896                         struct vnode *vp;
 897
 898                         switch(m->object->type) {
 899                         case OBJT_DEFAULT:
 900                         case OBJT_SWAP:
 901                                 /*
 902                                  * Allow anonymous memory and assume that
 903                                  * swap devices are not complex, since its
 904                                  * kinda worthless if we can't swap out dirty
 905                                  * anonymous pages.
 906                                  */
 907                                 break;
 908                         case OBJT_VNODE:
 909                                 /*
 910                                  * Allow VCHR device if the D_NOEMERGPGR
 911                                  * flag is not set, deny other vnode types
 912                                  * as being too complex.
 913                                  */
 914                                 vp = m->object->handle;
 915                                 if (vp && vp->v_type == VCHR &&
 916                                     vp->v_rdev && vp->v_rdev->si_ops &&
 917                                     (vp->v_rdev->si_ops->head.flags &
 918                                      D_NOEMERGPGR) == 0) {
 919                                         break;
 920                                 }
 921                                 /* Deny - fall through */
 922                         default:
 923                                 /*
 924                                  * Deny
 925                                  */
 926                                 vm_page_wakeup(m);
 927                                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
 928                                 lwkt_yield();
 929                                 continue;
 930                         }
 931                 }
 932
 933                 /*
 934                  * Try to pageout the page and perhaps other nearby pages.
 935                  * We want to get the pages into the cache eventually (
 936                  * first or second pass).  Otherwise the pages can wind up
 937                  * just cycling in the inactive queue, getting flushed over
 938                  * and over again.
 939                  *
 940                  * Generally speaking we recycle dirty pages within PQ_INACTIVE
 941                  * twice (double LRU) before paging them out.  If the
 942                  * memuse_mode is >= 3 we run them single-LRU like we do clean
 943                  * pages.
 944                  */
 945                 if (vm_pageout_memuse_mode >= 3)
 946                         vm_page_flag_set(m, PG_WINATCFLS);
 947
 948                 vmflush_flags = 0;
 949                 if (vm_pageout_allow_active)
 950                         vmflush_flags |= OBJPC_ALLOW_ACTIVE;
 951                 if (m->flags & PG_WINATCFLS)
 952                         vmflush_flags |= OBJPC_TRY_TO_CACHE;
 953                 count = vm_pageout_page(m, &max_launder, vnodes_skipped,
 954                                         &vpfailed, pass, vmflush_flags, counts);
 955                 delta += count;
 956
 957                 /*
 958                  * Systems with a ton of memory can wind up with huge
 959                  * deactivation counts.  Because the inactive scan is
 960                  * doing a lot of flushing, the combination can result
 961                  * in excessive paging even in situations where other
 962                  * unrelated threads free up sufficient VM.
 963                  *
 964                  * To deal with this we abort the nominal active->inactive
 965                  * scan before we hit the inactive target when free+cache
 966                  * levels have reached a reasonable target.
 967                  *
 968                  * When deciding to stop early we need to add some slop to
 969                  * the test and we need to return full completion to the caller
 970                  * to prevent the caller from thinking there is something
 971                  * wrong and issuing a low-memory+swap warning or pkill.
 972                  *
 973                  * A deficit forces paging regardless of the state of the
 974                  * VM page queues (used for RSS enforcement).
 975                  */
 976                 lwkt_yield();
 977                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
 978                 if (vm_paging_target() < -vm_max_launder) {
 979                         /*
 980                          * Stopping early, return full completion to caller.
 981                          */
 982                         if (delta < avail_shortage)
 983                                 delta = avail_shortage;
 984                         break;
 985                 }
 986         }
 987
 988         /* page queue still spin-locked */
 989         TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
 990         vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 991
 992         return (delta);
 993 }
 994
 995 /*
 996  * Pageout the specified page, return the total number of pages paged out
 997  * (this routine may cluster).
 998  *
 999  * The page must be busied and soft-busied by the caller and will be disposed
1000  * of by this function.
1001  */
1002 static int
1003 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
1004                 struct vnode **vpfailedp, int pass, int vmflush_flags,
1005                 long *counts)
1006 {
1007         vm_object_t object;
1008         int actcount;
1009         int count = 0;
1010
1011         /*
1012          * Wiring no longer removes a page from its queue.  The last unwiring
1013          * will requeue the page.  Obviously wired pages cannot be paged out
1014          * so unqueue it and return.
1015          */
1016         if (m->wire_count) {
1017                 vm_page_unqueue_nowakeup(m);
1018                 vm_page_wakeup(m);
1019                 return 0;
1020         }
1021
1022         /*
1023          * A held page may be undergoing I/O, so skip it.
1024          */
1025         if (m->hold_count) {
1026                 vm_page_and_queue_spin_lock(m);
1027                 if (m->queue - m->pc == PQ_INACTIVE) {
1028                         TAILQ_REMOVE(
1029                                 &vm_page_queues[m->queue].pl, m, pageq);
1030                         TAILQ_INSERT_TAIL(
1031                                 &vm_page_queues[m->queue].pl, m, pageq);
1032                 }
1033                 vm_page_and_queue_spin_unlock(m);
1034                 vm_page_wakeup(m);
1035                 return 0;
1036         }
1037
1038         if (m->object == NULL || m->object->ref_count == 0) {
1039                 /*
1040                  * If the object is not being used, we ignore previous
1041                  * references.
1042                  */
1043                 vm_page_flag_clear(m, PG_REFERENCED);
1044                 pmap_clear_reference(m);
1045                 /* fall through to end */
1046         } else if (((m->flags & PG_REFERENCED) == 0) &&
1047                     (actcount = pmap_ts_referenced(m))) {
1048                 /*
1049                  * Otherwise, if the page has been referenced while
1050                  * in the inactive queue, we bump the "activation
1051                  * count" upwards, making it less likely that the
1052                  * page will be added back to the inactive queue
1053                  * prematurely again.  Here we check the page tables
1054                  * (or emulated bits, if any), given the upper level
1055                  * VM system not knowing anything about existing
1056                  * references.
1057                  */
1058                 ++counts[3];
1059                 vm_page_activate(m);
1060                 m->act_count += (actcount + ACT_ADVANCE);
1061                 vm_page_wakeup(m);
1062                 return 0;
1063         }
1064
1065         /*
1066          * (m) is still busied.
1067          *
1068          * If the upper level VM system knows about any page
1069          * references, we activate the page.  We also set the
1070          * "activation count" higher than normal so that we will less
1071          * likely place pages back onto the inactive queue again.
1072          */
1073         if ((m->flags & PG_REFERENCED) != 0) {
1074                 vm_page_flag_clear(m, PG_REFERENCED);
1075                 actcount = pmap_ts_referenced(m);
1076                 vm_page_activate(m);
1077                 m->act_count += (actcount + ACT_ADVANCE + 1);
1078                 vm_page_wakeup(m);
1079                 ++counts[3];
1080                 return 0;
1081         }
1082
1083         /*
1084          * If the upper level VM system doesn't know anything about
1085          * the page being dirty, we have to check for it again.  As
1086          * far as the VM code knows, any partially dirty pages are
1087          * fully dirty.
1088          *
1089          * Pages marked PG_WRITEABLE may be mapped into the user
1090          * address space of a process running on another cpu.  A
1091          * user process (without holding the MP lock) running on
1092          * another cpu may be able to touch the page while we are
1093          * trying to remove it.  vm_page_cache() will handle this
1094          * case for us.
1095          */
1096         if (m->dirty == 0) {
1097                 vm_page_test_dirty(m);
1098         } else {
1099                 vm_page_dirty(m);
1100         }
1101
1102         if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1103                 /*
1104                  * Invalid pages can be easily freed
1105                  */
1106                 vm_pageout_page_free(m);
1107                 mycpu->gd_cnt.v_dfree++;
1108                 ++count;
1109                 ++counts[1];
1110         } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1111                 /*
1112                  * Clean pages can be placed onto the cache queue.
1113                  * This effectively frees them.
1114                  */
1115                 vm_page_cache(m);
1116                 ++count;
1117                 ++counts[1];
1118         } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1119                 /*
1120                  * Dirty pages need to be paged out, but flushing
1121                  * a page is extremely expensive verses freeing
1122                  * a clean page.  Rather then artificially limiting
1123                  * the number of pages we can flush, we instead give
1124                  * dirty pages extra priority on the inactive queue
1125                  * by forcing them to be cycled through the queue
1126                  * twice before being flushed, after which the
1127                  * (now clean) page will cycle through once more
1128                  * before being freed.  This significantly extends
1129                  * the thrash point for a heavily loaded machine.
1130                  */
1131                 ++counts[2];
1132                 vm_page_flag_set(m, PG_WINATCFLS);
1133                 vm_page_and_queue_spin_lock(m);
1134                 if (m->queue - m->pc == PQ_INACTIVE) {
1135                         TAILQ_REMOVE(
1136                                 &vm_page_queues[m->queue].pl, m, pageq);
1137                         TAILQ_INSERT_TAIL(
1138                                 &vm_page_queues[m->queue].pl, m, pageq);
1139                 }
1140                 vm_page_and_queue_spin_unlock(m);
1141                 vm_page_wakeup(m);
1142         } else if (*max_launderp > 0) {
1143                 /*
1144                  * We always want to try to flush some dirty pages if
1145                  * we encounter them, to keep the system stable.
1146                  * Normally this number is small, but under extreme
1147                  * pressure where there are insufficient clean pages
1148                  * on the inactive queue, we may have to go all out.
1149                  */
1150                 int swap_pageouts_ok;
1151                 struct vnode *vp = NULL;
1152
1153                 if ((m->flags & PG_WINATCFLS) == 0)
1154                         vm_page_flag_set(m, PG_WINATCFLS);
1155                 swap_pageouts_ok = 0;
1156                 object = m->object;
1157                 if (object &&
1158                     (object->type != OBJT_SWAP) &&
1159                     (object->type != OBJT_DEFAULT)) {
1160                         swap_pageouts_ok = 1;
1161                 } else {
1162                         swap_pageouts_ok = !(defer_swap_pageouts ||
1163                                              disable_swap_pageouts);
1164                         swap_pageouts_ok |= (!disable_swap_pageouts &&
1165                                              defer_swap_pageouts &&
1166                                              vm_page_count_min(0));
1167                 }
1168
1169                 /*
1170                  * We don't bother paging objects that are "dead".
1171                  * Those objects are in a "rundown" state.
1172                  */
1173                 if (!swap_pageouts_ok ||
1174                     (object == NULL) ||
1175                     (object->flags & OBJ_DEAD)) {
1176                         vm_page_and_queue_spin_lock(m);
1177                         if (m->queue - m->pc == PQ_INACTIVE) {
1178                                 TAILQ_REMOVE(
1179                                     &vm_page_queues[m->queue].pl,
1180                                     m, pageq);
1181                                 TAILQ_INSERT_TAIL(
1182                                     &vm_page_queues[m->queue].pl,
1183                                     m, pageq);
1184                         }
1185                         vm_page_and_queue_spin_unlock(m);
1186                         vm_page_wakeup(m);
1187                         return 0;
1188                 }
1189
1190                 /*
1191                  * (m) is still busied.
1192                  *
1193                  * The object is already known NOT to be dead.   It
1194                  * is possible for the vget() to block the whole
1195                  * pageout daemon, but the new low-memory handling
1196                  * code should prevent it.
1197                  *
1198                  * The previous code skipped locked vnodes and, worse,
1199                  * reordered pages in the queue.  This results in
1200                  * completely non-deterministic operation because,
1201                  * quite often, a vm_fault has initiated an I/O and
1202                  * is holding a locked vnode at just the point where
1203                  * the pageout daemon is woken up.
1204                  *
1205                  * We can't wait forever for the vnode lock, we might
1206                  * deadlock due to a vn_read() getting stuck in
1207                  * vm_wait while holding this vnode.  We skip the
1208                  * vnode if we can't get it in a reasonable amount
1209                  * of time.
1210                  *
1211                  * vpfailed is used to (try to) avoid the case where
1212                  * a large number of pages are associated with a
1213                  * locked vnode, which could cause the pageout daemon
1214                  * to stall for an excessive amount of time.
1215                  */
1216                 if (object->type == OBJT_VNODE) {
1217                         int flags;
1218
1219                         vp = object->handle;
1220                         flags = LK_EXCLUSIVE;
1221                         if (vp == *vpfailedp)
1222                                 flags |= LK_NOWAIT;
1223                         else
1224                                 flags |= LK_TIMELOCK;
1225                         vm_page_hold(m);
1226                         vm_page_wakeup(m);
1227
1228                         /*
1229                          * We have unbusied (m) temporarily so we can
1230                          * acquire the vp lock without deadlocking.
1231                          * (m) is held to prevent destruction.
1232                          */
1233                         if (vget(vp, flags) != 0) {
1234                                 *vpfailedp = vp;
1235                                 ++pageout_lock_miss;
1236                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1237                                             ++*vnodes_skippedp;
1238                                 vm_page_unhold(m);
1239                                 return 0;
1240                         }
1241
1242                         /*
1243                          * The page might have been moved to another
1244                          * queue during potential blocking in vget()
1245                          * above.  The page might have been freed and
1246                          * reused for another vnode.  The object might
1247                          * have been reused for another vnode.
1248                          */
1249                         if (m->queue - m->pc != PQ_INACTIVE ||
1250                             m->object != object ||
1251                             object->handle != vp) {
1252                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1253                                         ++*vnodes_skippedp;
1254                                 vput(vp);
1255                                 vm_page_unhold(m);
1256                                 return 0;
1257                         }
1258
1259                         /*
1260                          * The page may have been busied during the
1261                          * blocking in vput();  We don't move the
1262                          * page back onto the end of the queue so that
1263                          * statistics are more correct if we don't.
1264                          */
1265                         if (vm_page_busy_try(m, TRUE)) {
1266                                 vput(vp);
1267                                 vm_page_unhold(m);
1268                                 return 0;
1269                         }
1270                         vm_page_unhold(m);
1271
1272                         /*
1273                          * If it was wired while we didn't own it.
1274                          */
1275                         if (m->wire_count) {
1276                                 vm_page_unqueue_nowakeup(m);
1277                                 vput(vp);
1278                                 vm_page_wakeup(m);
1279                                 return 0;
1280                         }
1281
1282                         /*
1283                          * (m) is busied again
1284                          *
1285                          * We own the busy bit and remove our hold
1286                          * bit.  If the page is still held it
1287                          * might be undergoing I/O, so skip it.
1288                          */
1289                         if (m->hold_count) {
1290 rebusy_failed:
1291                                 vm_page_and_queue_spin_lock(m);
1292                                 if (m->queue - m->pc == PQ_INACTIVE) {
1293                                         TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1294                                         TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1295                                 }
1296                                 vm_page_and_queue_spin_unlock(m);
1297                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1298                                         ++*vnodes_skippedp;
1299                                 vm_page_wakeup(m);
1300                                 vput(vp);
1301                                 return 0;
1302                         }
1303
1304                         /*
1305                          * Recheck queue, object, and vp now that we have
1306                          * rebusied the page.
1307                          */
1308                         if (m->queue - m->pc != PQ_INACTIVE ||
1309                             m->object != object ||
1310                             object->handle != vp) {
1311                                 kprintf("vm_pageout_page: "
1312                                         "rebusy %p failed(A)\n",
1313                                         m);
1314                                 goto rebusy_failed;
1315                         }
1316
1317                         /*
1318                          * Check page validity
1319                          */
1320                         if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1321                                 kprintf("vm_pageout_page: "
1322                                         "rebusy %p failed(B)\n",
1323                                         m);
1324                                 goto rebusy_failed;
1325                         }
1326                         if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1327                                 kprintf("vm_pageout_page: "
1328                                         "rebusy %p failed(C)\n",
1329                                         m);
1330                                 goto rebusy_failed;
1331                         }
1332
1333                         /* (m) is left busied as we fall through */
1334                 }
1335
1336                 /*
1337                  * page is busy and not held here.
1338                  *
1339                  * If a page is dirty, then it is either being washed
1340                  * (but not yet cleaned) or it is still in the
1341                  * laundry.  If it is still in the laundry, then we
1342                  * start the cleaning operation.
1343                  *
1344                  * decrement inactive_shortage on success to account
1345                  * for the (future) cleaned page.  Otherwise we
1346                  * could wind up laundering or cleaning too many
1347                  * pages.
1348                  *
1349                  * NOTE: Cleaning the page here does not cause
1350                  *       force_deficit to be adjusted, because the
1351                  *       page is not being freed or moved to the
1352                  *       cache.
1353                  */
1354                 count = vm_pageout_clean_helper(m, vmflush_flags);
1355                 counts[0] += count;
1356                 *max_launderp -= count;
1357
1358                 /*
1359                  * Clean ate busy, page no longer accessible
1360                  */
1361                 if (vp != NULL)
1362                         vput(vp);
1363         } else {
1364                 vm_page_wakeup(m);
1365         }
1366         return count;
1367 }
1368
1369 /*
1370  * Scan active queue
1371  *
1372  * WARNING! Can be called from two pagedaemon threads simultaneously.
1373  */
1374 static int
1375 vm_pageout_scan_active(int pass, int q,
1376                        long avail_shortage, long inactive_shortage,
1377                        long *recycle_countp)
1378 {
1379         struct vm_page marker;
1380         vm_page_t m;
1381         int actcount;
1382         long delta = 0;
1383         long maxscan;
1384         int isep;
1385
1386         isep = (curthread == emergpager);
1387
1388         /*
1389          * We want to move pages from the active queue to the inactive
1390          * queue to get the inactive queue to the inactive target.  If
1391          * we still have a page shortage from above we try to directly free
1392          * clean pages instead of moving them.
1393          *
1394          * If we do still have a shortage we keep track of the number of
1395          * pages we free or cache (recycle_count) as a measure of thrashing
1396          * between the active and inactive queues.
1397          *
1398          * If we were able to completely satisfy the free+cache targets
1399          * from the inactive pool we limit the number of pages we move
1400          * from the active pool to the inactive pool to 2x the pages we
1401          * had removed from the inactive pool (with a minimum of 1/5 the
1402          * inactive target).  If we were not able to completely satisfy
1403          * the free+cache targets we go for the whole target aggressively.
1404          *
1405          * NOTE: Both variables can end up negative.
1406          * NOTE: We are still in a critical section.
1407          *
1408          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
1409          *        PAGES.
1410          */
1411
1412         bzero(&marker, sizeof(marker));
1413         marker.flags = PG_FICTITIOUS | PG_MARKER;
1414         marker.busy_count = PBUSY_LOCKED;
1415         marker.queue = PQ_ACTIVE + q;
1416         marker.pc = q;
1417         marker.wire_count = 1;
1418
1419         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1420         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1421         maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;
1422
1423         /*
1424          * Queue locked at top of loop to avoid stack marker issues.
1425          */
1426         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1427                maxscan-- > 0 && (avail_shortage - delta > 0 ||
1428                                 inactive_shortage > 0))
1429         {
1430                 KKASSERT(m->queue == PQ_ACTIVE + q);
1431                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1432                              &marker, pageq);
1433                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1434                                    &marker, pageq);
1435
1436                 /*
1437                  * Skip marker pages (atomic against other markers to avoid
1438                  * infinite hop-over scans).
1439                  */
1440                 if (m->flags & PG_MARKER)
1441                         continue;
1442
1443                 /*
1444                  * Try to busy the page.  Don't mess with pages which are
1445                  * already busy or reorder them in the queue.
1446                  */
1447                 if (vm_page_busy_try(m, TRUE))
1448                         continue;
1449
1450                 /*
1451                  * Remaining operations run with the page busy and neither
1452                  * the page or the queue will be spin-locked.
1453                  */
1454                 KKASSERT(m->queue == PQ_ACTIVE + q);
1455                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1456
1457 #if 0
1458                 /*
1459                  * Don't deactivate pages that are held, even if we can
1460                  * busy them.  (XXX why not?)
1461                  */
1462                 if (m->hold_count) {
1463                         vm_page_and_queue_spin_lock(m);
1464                         if (m->queue - m->pc == PQ_ACTIVE) {
1465                                 TAILQ_REMOVE(
1466                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1467                                         m, pageq);
1468                                 TAILQ_INSERT_TAIL(
1469                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1470                                         m, pageq);
1471                         }
1472                         vm_page_and_queue_spin_unlock(m);
1473                         vm_page_wakeup(m);
1474                         goto next;
1475                 }
1476 #endif
1477                 /*
1478                  * We can just remove wired pages from the queue
1479                  */
1480                 if (m->wire_count) {
1481                         vm_page_unqueue_nowakeup(m);
1482                         vm_page_wakeup(m);
1483                         goto next;
1484                 }
1485
1486                 /*
1487                  * The emergency pager ignores vnode-backed pages as these
1488                  * are the pages that probably bricked the main pager.
1489                  */
1490                 if (isep && m->object && m->object->type == OBJT_VNODE) {
1491                         vm_page_and_queue_spin_lock(m);
1492                         if (m->queue - m->pc == PQ_ACTIVE) {
1493                                 TAILQ_REMOVE(
1494                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1495                                         m, pageq);
1496                                 TAILQ_INSERT_TAIL(
1497                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1498                                         m, pageq);
1499                         }
1500                         vm_page_and_queue_spin_unlock(m);
1501                         vm_page_wakeup(m);
1502                         goto next;
1503                 }
1504
1505                 /*
1506                  * The count for pagedaemon pages is done after checking the
1507                  * page for eligibility...
1508                  */
1509                 mycpu->gd_cnt.v_pdpages++;
1510
1511                 /*
1512                  * Check to see "how much" the page has been used and clear
1513                  * the tracking access bits.  If the object has no references
1514                  * don't bother paying the expense.
1515                  */
1516                 actcount = 0;
1517                 if (m->object && m->object->ref_count != 0) {
1518                         if (m->flags & PG_REFERENCED)
1519                                 ++actcount;
1520                         actcount += pmap_ts_referenced(m);
1521                         if (actcount) {
1522                                 m->act_count += ACT_ADVANCE + actcount;
1523                                 if (m->act_count > ACT_MAX)
1524                                         m->act_count = ACT_MAX;
1525                         }
1526                 }
1527                 vm_page_flag_clear(m, PG_REFERENCED);
1528
1529                 /*
1530                  * actcount is only valid if the object ref_count is non-zero.
1531                  * If the page does not have an object, actcount will be zero.
1532                  */
1533                 if (actcount && m->object->ref_count != 0) {
1534                         vm_page_and_queue_spin_lock(m);
1535                         if (m->queue - m->pc == PQ_ACTIVE) {
1536                                 TAILQ_REMOVE(
1537                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1538                                         m, pageq);
1539                                 TAILQ_INSERT_TAIL(
1540                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1541                                         m, pageq);
1542                         }
1543                         vm_page_and_queue_spin_unlock(m);
1544                         vm_page_wakeup(m);
1545                 } else {
1546                         switch(m->object->type) {
1547                         case OBJT_DEFAULT:
1548                         case OBJT_SWAP:
1549                                 m->act_count -= min(m->act_count,
1550                                                     vm_anonmem_decline);
1551                                 break;
1552                         default:
1553                                 m->act_count -= min(m->act_count,
1554                                                     vm_filemem_decline);
1555                                 break;
1556                         }
1557                         if (vm_pageout_algorithm ||
1558                             (m->object == NULL) ||
1559                             (m->object && (m->object->ref_count == 0)) ||
1560                             m->act_count < pass + 1
1561                         ) {
1562                                 /*
1563                                  * Deactivate the page.  If we had a
1564                                  * shortage from our inactive scan try to
1565                                  * free (cache) the page instead.
1566                                  *
1567                                  * Don't just blindly cache the page if
1568                                  * we do not have a shortage from the
1569                                  * inactive scan, that could lead to
1570                                  * gigabytes being moved.
1571                                  */
1572                                 --inactive_shortage;
1573                                 if (avail_shortage - delta > 0 ||
1574                                     (m->object && (m->object->ref_count == 0)))
1575                                 {
1576                                         if (avail_shortage - delta > 0)
1577                                                 ++*recycle_countp;
1578                                         vm_page_protect(m, VM_PROT_NONE);
1579                                         if (m->dirty == 0 &&
1580                                             (m->flags & PG_NEED_COMMIT) == 0 &&
1581                                             avail_shortage - delta > 0) {
1582                                                 vm_page_cache(m);
1583                                         } else {
1584                                                 vm_page_deactivate(m);
1585                                                 vm_page_wakeup(m);
1586                                         }
1587                                 } else {
1588                                         vm_page_deactivate(m);
1589                                         vm_page_wakeup(m);
1590                                 }
1591                                 ++delta;
1592                         } else {
1593                                 vm_page_and_queue_spin_lock(m);
1594                                 if (m->queue - m->pc == PQ_ACTIVE) {
1595                                         TAILQ_REMOVE(
1596                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1597                                             m, pageq);
1598                                         TAILQ_INSERT_TAIL(
1599                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1600                                             m, pageq);
1601                                 }
1602                                 vm_page_and_queue_spin_unlock(m);
1603                                 vm_page_wakeup(m);
1604                         }
1605                 }
1606 next:
1607                 lwkt_yield();
1608                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1609         }
1610
1611         /*
1612          * Clean out our local marker.
1613          *
1614          * Page queue still spin-locked.
1615          */
1616         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1617         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1618
1619         return (delta);
1620 }
1621
1622 /*
1623  * The number of actually free pages can drop down to v_free_reserved,
1624  * we try to build the free count back above v_free_min.  Note that
1625  * vm_paging_needed() also returns TRUE if v_free_count is not at
1626  * least v_free_min so that is the minimum we must build the free
1627  * count to.
1628  *
1629  * We use a slightly higher target to improve hysteresis,
1630  * ((v_free_target + v_free_min) / 2).  Since v_free_target
1631  * is usually the same as v_cache_min this maintains about
1632  * half the pages in the free queue as are in the cache queue,
1633  * providing pretty good pipelining for pageout operation.
1634  *
1635  * The system operator can manipulate vm.v_cache_min and
1636  * vm.v_free_target to tune the pageout demon.  Be sure
1637  * to keep vm.v_free_min < vm.v_free_target.
1638  *
1639  * Note that the original paging target is to get at least
1640  * (free_min + cache_min) into (free + cache).  The slightly
1641  * higher target will shift additional pages from cache to free
1642  * without effecting the original paging target in order to
1643  * maintain better hysteresis and not have the free count always
1644  * be dead-on v_free_min.
1645  *
1646  * NOTE: we are still in a critical section.
1647  *
1648  * Pages moved from PQ_CACHE to totally free are not counted in the
1649  * pages_freed counter.
1650  *
1651  * WARNING! Can be called from two pagedaemon threads simultaneously.
1652  */
1653 static void
1654 vm_pageout_scan_cache(long avail_shortage, int pass,
1655                       long vnodes_skipped, long recycle_count)
1656 {
1657         static int lastkillticks;
1658         struct vm_pageout_scan_info info;
1659         vm_page_t m;
1660         int isep;
1661
1662         isep = (curthread == emergpager);
1663
1664         while (vmstats.v_free_count <
1665                (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1666                 /*
1667                  * This steals some code from vm/vm_page.c
1668                  *
1669                  * Create two rovers and adjust the code to reduce
1670                  * chances of them winding up at the same index (which
1671                  * can cause a lot of contention).
1672                  */
1673                 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };
1674
1675                 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
1676                         goto next_rover;
1677
1678                 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
1679                 if (m == NULL)
1680                         break;
1681                 /*
1682                  * page is returned removed from its queue and spinlocked
1683                  *
1684                  * If the busy attempt fails we can still deactivate the page.
1685                  */
1686                 if (vm_page_busy_try(m, TRUE)) {
1687                         vm_page_deactivate_locked(m);
1688                         vm_page_spin_unlock(m);
1689                         continue;
1690                 }
1691                 vm_page_spin_unlock(m);
1692                 pagedaemon_wakeup();
1693                 lwkt_yield();
1694
1695                 /*
1696                  * Remaining operations run with the page busy and neither
1697                  * the page or the queue will be spin-locked.
1698                  */
1699                 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
1700                     m->hold_count ||
1701                     m->wire_count) {
1702                         vm_page_deactivate(m);
1703                         vm_page_wakeup(m);
1704                         continue;
1705                 }
1706
1707                 /*
1708                  * Because the page is in the cache, it shouldn't be mapped.
1709                  */
1710                 pmap_mapped_sync(m);
1711                 KKASSERT((m->flags & PG_MAPPED) == 0);
1712                 KKASSERT(m->dirty == 0);
1713                 vm_pageout_page_free(m);
1714                 mycpu->gd_cnt.v_dfree++;
1715 next_rover:
1716                 if (isep)
1717                         cache_rover[1] -= PQ_PRIME2;
1718                 else
1719                         cache_rover[0] += PQ_PRIME2;
1720         }
1721
1722         /*
1723          * If we didn't get enough free pages, and we have skipped a vnode
1724          * in a writeable object, wakeup the sync daemon.  And kick swapout
1725          * if we did not get enough free pages.
1726          */
1727         if (vm_paging_target() > 0) {
1728                 if (vnodes_skipped && vm_page_count_min(0))
1729                         speedup_syncer(NULL);
1730 #if !defined(NO_SWAPPING)
1731                 if (vm_swap_enabled && vm_page_count_target())
1732                         vm_req_vmdaemon();
1733 #endif
1734         }
1735
1736         /*
1737          * Handle catastrophic conditions.  Under good conditions we should
1738          * be at the target, well beyond our minimum.  If we could not even
1739          * reach our minimum the system is under heavy stress.  But just being
1740          * under heavy stress does not trigger process killing.
1741          *
1742          * We consider ourselves to have run out of memory if the swap pager
1743          * is full and avail_shortage is still positive.  The secondary check
1744          * ensures that we do not kill processes if the instantanious
1745          * availability is good, even if the pageout demon pass says it
1746          * couldn't get to the target.
1747          *
1748          * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
1749          *        SITUATIONS.
1750          */
1751         if (swap_pager_almost_full &&
1752             pass > 0 &&
1753             isep == 0 &&
1754             (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
1755                 kprintf("Warning: system low on memory+swap "
1756                         "shortage %ld for %d ticks!\n",
1757                         avail_shortage, ticks - swap_fail_ticks);
1758                 if (bootverbose)
1759                 kprintf("Metrics: spaf=%d spf=%d pass=%d "
1760                         "avail=%ld target=%ld last=%u\n",
1761                         swap_pager_almost_full,
1762                         swap_pager_full,
1763                         pass,
1764                         avail_shortage,
1765                         vm_paging_target(),
1766                         (unsigned int)(ticks - lastkillticks));
1767         }
1768         if (swap_pager_full &&
1769             pass > 1 &&
1770             isep == 0 &&
1771             avail_shortage > 0 &&
1772             vm_paging_target() > 0 &&
1773             (unsigned int)(ticks - lastkillticks) >= hz) {
1774                 /*
1775                  * Kill something, maximum rate once per second to give
1776                  * the process time to free up sufficient memory.
1777                  */
1778                 lastkillticks = ticks;
1779                 info.bigproc = NULL;
1780                 info.bigsize = 0;
1781                 allproc_scan(vm_pageout_scan_callback, &info, 0);
1782                 if (info.bigproc != NULL) {
1783                         kprintf("Try to kill process %d %s\n",
1784                                 info.bigproc->p_pid, info.bigproc->p_comm);
1785                         info.bigproc->p_nice = PRIO_MIN;
1786                         info.bigproc->p_usched->resetpriority(
1787                                 FIRST_LWP_IN_PROC(info.bigproc));
1788                         atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1789                         killproc(info.bigproc, "out of swap space");
1790                         wakeup(&vmstats.v_free_count);
1791                         PRELE(info.bigproc);
1792                 }
1793         }
1794 }
1795
1796 static int
1797 vm_pageout_scan_callback(struct proc *p, void *data)
1798 {
1799         struct vm_pageout_scan_info *info = data;
1800         vm_offset_t size;
1801
1802         /*
1803          * Never kill system processes or init.  If we have configured swap
1804          * then try to avoid killing low-numbered pids.
1805          */
1806         if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1807             ((p->p_pid < 48) && (vm_swap_size != 0))) {
1808                 return (0);
1809         }
1810
1811         lwkt_gettoken(&p->p_token);
1812
1813         /*
1814          * if the process is in a non-running type state,
1815          * don't touch it.
1816          */
1817         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1818                 lwkt_reltoken(&p->p_token);
1819                 return (0);
1820         }
1821
1822         /*
1823          * Get the approximate process size.  Note that anonymous pages
1824          * with backing swap will be counted twice, but there should not
1825          * be too many such pages due to the stress the VM system is
1826          * under at this point.
1827          */
1828         size = vmspace_anonymous_count(p->p_vmspace) +
1829                 vmspace_swap_count(p->p_vmspace);
1830
1831         /*
1832          * If the this process is bigger than the biggest one
1833          * remember it.
1834          */
1835         if (info->bigsize < size) {
1836                 if (info->bigproc)
1837                         PRELE(info->bigproc);
1838                 PHOLD(p);
1839                 info->bigproc = p;
1840                 info->bigsize = size;
1841         }
1842         lwkt_reltoken(&p->p_token);
1843         lwkt_yield();
1844
1845         return(0);
1846 }
1847
1848 /*
1849  * This old guy slowly walks PQ_HOLD looking for pages which need to be
1850  * moved back to PQ_FREE.  It is possible for pages to accumulate here
1851  * when vm_page_free() races against vm_page_unhold(), resulting in a
1852  * page being left on a PQ_HOLD queue with hold_count == 0.
1853  *
1854  * It is easier to handle this edge condition here, in non-critical code,
1855  * rather than enforce a spin-lock for every 1->0 transition in
1856  * vm_page_unhold().
1857  *
1858  * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
1859  */
1860 static void
1861 vm_pageout_scan_hold(int q)
1862 {
1863         vm_page_t m;
1864
1865         vm_page_queues_spin_lock(PQ_HOLD + q);
1866         TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
1867                 if (m->flags & PG_MARKER)
1868                         continue;
1869
1870                 /*
1871                  * Process one page and return
1872                  */
1873                 if (m->hold_count)
1874                         break;
1875                 kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
1876                 vm_page_hold(m);
1877                 vm_page_queues_spin_unlock(PQ_HOLD + q);
1878                 vm_page_unhold(m);      /* reprocess */
1879                 return;
1880         }
1881         vm_page_queues_spin_unlock(PQ_HOLD + q);
1882 }
1883
1884 /*
1885  * This routine tries to maintain the pseudo LRU active queue,
1886  * so that during long periods of time where there is no paging,
1887  * that some statistic accumulation still occurs.  This code
1888  * helps the situation where paging just starts to occur.
1889  */
1890 static void
1891 vm_pageout_page_stats(int q)
1892 {
1893         static int fullintervalcount = 0;
1894         struct vm_page marker;
1895         vm_page_t m;
1896         long pcount, tpcount;           /* Number of pages to check */
1897         long page_shortage;
1898
1899         page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1900                          vmstats.v_free_min) -
1901                         (vmstats.v_free_count + vmstats.v_inactive_count +
1902                          vmstats.v_cache_count);
1903
1904         if (page_shortage <= 0)
1905                 return;
1906
1907         pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1908         fullintervalcount += vm_pageout_stats_interval;
1909         if (fullintervalcount < vm_pageout_full_stats_interval) {
1910                 tpcount = (vm_pageout_stats_max * pcount) /
1911                           vmstats.v_page_count + 1;
1912                 if (pcount > tpcount)
1913                         pcount = tpcount;
1914         } else {
1915                 fullintervalcount = 0;
1916         }
1917
1918         bzero(&marker, sizeof(marker));
1919         marker.flags = PG_FICTITIOUS | PG_MARKER;
1920         marker.busy_count = PBUSY_LOCKED;
1921         marker.queue = PQ_ACTIVE + q;
1922         marker.pc = q;
1923         marker.wire_count = 1;
1924
1925         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1926         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1927
1928         /*
1929          * Queue locked at top of loop to avoid stack marker issues.
1930          */
1931         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1932                pcount-- > 0)
1933         {
1934                 int actcount;
1935
1936                 KKASSERT(m->queue == PQ_ACTIVE + q);
1937                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1938                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1939                                    &marker, pageq);
1940
1941                 /*
1942                  * Skip marker pages (atomic against other markers to avoid
1943                  * infinite hop-over scans).
1944                  */
1945                 if (m->flags & PG_MARKER)
1946                         continue;
1947
1948                 /*
1949                  * Ignore pages we can't busy
1950                  */
1951                 if (vm_page_busy_try(m, TRUE))
1952                         continue;
1953
1954                 /*
1955                  * Remaining operations run with the page busy and neither
1956                  * the page or the queue will be spin-locked.
1957                  */
1958                 KKASSERT(m->queue == PQ_ACTIVE + q);
1959                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1960
1961                 /*
1962                  * We can just remove wired pages from the queue
1963                  */
1964                 if (m->wire_count) {
1965                         vm_page_unqueue_nowakeup(m);
1966                         vm_page_wakeup(m);
1967                         goto next;
1968                 }
1969
1970
1971                 /*
1972                  * We now have a safely busied page, the page and queue
1973                  * spinlocks have been released.
1974                  *
1975                  * Ignore held and wired pages
1976                  */
1977                 if (m->hold_count || m->wire_count) {
1978                         vm_page_wakeup(m);
1979                         goto next;
1980                 }
1981
1982                 /*
1983                  * Calculate activity
1984                  */
1985                 actcount = 0;
1986                 if (m->flags & PG_REFERENCED) {
1987                         vm_page_flag_clear(m, PG_REFERENCED);
1988                         actcount += 1;
1989                 }
1990                 actcount += pmap_ts_referenced(m);
1991
1992                 /*
1993                  * Update act_count and move page to end of queue.
1994                  */
1995                 if (actcount) {
1996                         m->act_count += ACT_ADVANCE + actcount;
1997                         if (m->act_count > ACT_MAX)
1998                                 m->act_count = ACT_MAX;
1999                         vm_page_and_queue_spin_lock(m);
2000                         if (m->queue - m->pc == PQ_ACTIVE) {
2001                                 TAILQ_REMOVE(
2002                                         &vm_page_queues[PQ_ACTIVE + q].pl,
2003                                         m, pageq);
2004                                 TAILQ_INSERT_TAIL(
2005                                         &vm_page_queues[PQ_ACTIVE + q].pl,
2006                                         m, pageq);
2007                         }
2008                         vm_page_and_queue_spin_unlock(m);
2009                         vm_page_wakeup(m);
2010                         goto next;
2011                 }
2012
2013                 if (m->act_count == 0) {
2014                         /*
2015                          * We turn off page access, so that we have
2016                          * more accurate RSS stats.  We don't do this
2017                          * in the normal page deactivation when the
2018                          * system is loaded VM wise, because the
2019                          * cost of the large number of page protect
2020                          * operations would be higher than the value
2021                          * of doing the operation.
2022                          *
2023                          * We use the marker to save our place so
2024                          * we can release the spin lock.  both (m)
2025                          * and (next) will be invalid.
2026                          */
2027                         vm_page_protect(m, VM_PROT_NONE);
2028                         vm_page_deactivate(m);
2029                 } else {
2030                         m->act_count -= min(m->act_count, ACT_DECLINE);
2031                         vm_page_and_queue_spin_lock(m);
2032                         if (m->queue - m->pc == PQ_ACTIVE) {
2033                                 TAILQ_REMOVE(
2034                                         &vm_page_queues[PQ_ACTIVE + q].pl,
2035                                         m, pageq);
2036                                 TAILQ_INSERT_TAIL(
2037                                         &vm_page_queues[PQ_ACTIVE + q].pl,
2038                                         m, pageq);
2039                         }
2040                         vm_page_and_queue_spin_unlock(m);
2041                 }
2042                 vm_page_wakeup(m);
2043 next:
2044                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2045         }
2046
2047         /*
2048          * Remove our local marker
2049          *
2050          * Page queue still spin-locked.
2051          */
2052         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
2053         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2054 }
2055
2056 static void
2057 vm_pageout_free_page_calc(vm_size_t count)
2058 {
2059         /*
2060          * v_free_min           normal allocations
2061          * v_free_reserved      system allocations
2062          * v_pageout_free_min   allocations by pageout daemon
2063          * v_interrupt_free_min low level allocations (e.g swap structures)
2064          *
2065          * v_free_min is used to generate several other baselines, and they
2066          * can get pretty silly on systems with a lot of memory.
2067          */
2068         vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
2069         vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
2070         vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
2071         vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
2072         vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
2073 }
2074
2075
2076 /*
2077  * vm_pageout is the high level pageout daemon.  TWO kernel threads run
2078  * this daemon, the primary pageout daemon and the emergency pageout daemon.
2079  *
2080  * The emergency pageout daemon takes over when the primary pageout daemon
2081  * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
2082  * avoiding the many low-memory deadlocks which can occur when paging out
2083  * to VFS's.
2084  */
2085 static void
2086 vm_pageout_thread(void)
2087 {
2088         int pass;
2089         int q;
2090         int q1iterator = 0;
2091         int q2iterator = 0;
2092         int q3iterator = 0;
2093         int isep;
2094
2095         curthread->td_flags |= TDF_SYSTHREAD;
2096
2097         /*
2098          * We only need to setup once.
2099          */
2100         isep = 0;
2101         if (curthread == emergpager) {
2102                 isep = 1;
2103                 goto skip_setup;
2104         }
2105
2106         /*
2107          * Initialize vm_max_launder per pageout pass to be 1/16
2108          * of total physical memory, plus a little slop.
2109          */
2110         if (vm_max_launder == 0)
2111                 vm_max_launder = physmem / 256 + 16;
2112
2113         /*
2114          * Initialize some paging parameters.
2115          */
2116         vm_pageout_free_page_calc(vmstats.v_page_count);
2117
2118         /*
2119          * v_free_target and v_cache_min control pageout hysteresis.  Note
2120          * that these are more a measure of the VM cache queue hysteresis
2121          * then the VM free queue.  Specifically, v_free_target is the
2122          * high water mark (free+cache pages).
2123          *
2124          * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
2125          * low water mark, while v_free_min is the stop.  v_cache_min must
2126          * be big enough to handle memory needs while the pageout daemon
2127          * is signalled and run to free more pages.
2128          */
2129         vmstats.v_free_target = 4 * vmstats.v_free_min +
2130                                 vmstats.v_free_reserved;
2131
2132         /*
2133          * NOTE: With the new buffer cache b_act_count we want the default
2134          *       inactive target to be a percentage of available memory.
2135          *
2136          *       The inactive target essentially determines the minimum
2137          *       number of 'temporary' pages capable of caching one-time-use
2138          *       files when the VM system is otherwise full of pages
2139          *       belonging to multi-time-use files or active program data.
2140          *
2141          * NOTE: The inactive target is aggressively persued only if the
2142          *       inactive queue becomes too small.  If the inactive queue
2143          *       is large enough to satisfy page movement to free+cache
2144          *       then it is repopulated more slowly from the active queue.
2145          *       This allows a general inactive_target default to be set.
2146          *
2147          *       There is an issue here for processes which sit mostly idle
2148          *       'overnight', such as sshd, tcsh, and X.  Any movement from
2149          *       the active queue will eventually cause such pages to
2150          *       recycle eventually causing a lot of paging in the morning.
2151          *       To reduce the incidence of this pages cycled out of the
2152          *       buffer cache are moved directly to the inactive queue if
2153          *       they were only used once or twice.
2154          *
2155          *       The vfs.vm_cycle_point sysctl can be used to adjust this.
2156          *       Increasing the value (up to 64) increases the number of
2157          *       buffer recyclements which go directly to the inactive queue.
2158          */
2159         if (vmstats.v_free_count > 2048) {
2160                 vmstats.v_cache_min = vmstats.v_free_target;
2161                 vmstats.v_cache_max = 2 * vmstats.v_cache_min;
2162         } else {
2163                 vmstats.v_cache_min = 0;
2164                 vmstats.v_cache_max = 0;
2165         }
2166         vmstats.v_inactive_target = vmstats.v_free_count / 4;
2167
2168         /* XXX does not really belong here */
2169         if (vm_page_max_wired == 0)
2170                 vm_page_max_wired = vmstats.v_free_count / 3;
2171
2172         if (vm_pageout_stats_max == 0)
2173                 vm_pageout_stats_max = vmstats.v_free_target;
2174
2175         /*
2176          * Set interval in seconds for stats scan.
2177          */
2178         if (vm_pageout_stats_interval == 0)
2179                 vm_pageout_stats_interval = 5;
2180         if (vm_pageout_full_stats_interval == 0)
2181                 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
2182
2183
2184         /*
2185          * Set maximum free per pass
2186          */
2187         if (vm_pageout_stats_free_max == 0)
2188                 vm_pageout_stats_free_max = 5;
2189
2190         swap_pager_swap_init();
2191         pass = 0;
2192
2193         atomic_swap_int(&sequence_emerg_pager, 1);
2194         wakeup(&sequence_emerg_pager);
2195
2196 skip_setup:
2197         /*
2198          * Sequence emergency pager startup
2199          */
2200         if (isep) {
2201                 while (sequence_emerg_pager == 0)
2202                         tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
2203         }
2204
2205         /*
2206          * The pageout daemon is never done, so loop forever.
2207          *
2208          * WARNING!  This code is being executed by two kernel threads
2209          *           potentially simultaneously.
2210          */
2211         while (TRUE) {
2212                 int error;
2213                 long avail_shortage;
2214                 long inactive_shortage;
2215                 long vnodes_skipped = 0;
2216                 long recycle_count = 0;
2217                 long tmp;
2218
2219                 /*
2220                  * Wait for an action request.  If we timeout check to
2221                  * see if paging is needed (in case the normal wakeup
2222                  * code raced us).
2223                  */
2224                 if (isep) {
2225                         /*
2226                          * Emergency pagedaemon monitors the primary
2227                          * pagedaemon while vm_pages_needed != 0.
2228                          *
2229                          * The emergency pagedaemon only runs if VM paging
2230                          * is needed and the primary pagedaemon has not
2231                          * updated vm_pagedaemon_time for more than 2 seconds.
2232                          */
2233                         if (vm_pages_needed)
2234                                 tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
2235                         else
2236                                 tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
2237                         if (vm_pages_needed == 0) {
2238                                 pass = 0;
2239                                 continue;
2240                         }
2241                         if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
2242                                 pass = 0;
2243                                 continue;
2244                         }
2245                 } else {
2246                         /*
2247                          * Primary pagedaemon
2248                          *
2249                          * NOTE: We unconditionally cleanup PQ_HOLD even
2250                          *       when there is no work to do.
2251                          */
2252                         vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
2253                         ++q3iterator;
2254
2255                         if (vm_pages_needed == 0) {
2256                                 error = tsleep(&vm_pages_needed,
2257                                                0, "psleep",
2258                                                vm_pageout_stats_interval * hz);
2259                                 if (error &&
2260                                     vm_paging_needed(0) == 0 &&
2261                                     vm_pages_needed == 0) {
2262                                         for (q = 0; q < PQ_L2_SIZE; ++q)
2263                                                 vm_pageout_page_stats(q);
2264                                         continue;
2265                                 }
2266                                 vm_pagedaemon_time = ticks;
2267                                 vm_pages_needed = 1;
2268
2269                                 /*
2270                                  * Wake the emergency pagedaemon up so it
2271                                  * can monitor us.  It will automatically
2272                                  * go back into a long sleep when
2273                                  * vm_pages_needed returns to 0.
2274                                  */
2275                                 wakeup(&vm_pagedaemon_time);
2276                         }
2277                 }
2278
2279                 mycpu->gd_cnt.v_pdwakeups++;
2280
2281                 /*
2282                  * Scan for INACTIVE->CLEAN/PAGEOUT
2283                  *
2284                  * This routine tries to avoid thrashing the system with
2285                  * unnecessary activity.
2286                  *
2287                  * Calculate our target for the number of free+cache pages we
2288                  * want to get to.  This is higher then the number that causes
2289                  * allocations to stall (severe) in order to provide hysteresis,
2290                  * and if we don't make it all the way but get to the minimum
2291                  * we're happy.  Goose it a bit if there are multiple requests
2292                  * for memory.
2293                  *
2294                  * Don't reduce avail_shortage inside the loop or the
2295                  * PQAVERAGE() calculation will break.
2296                  *
2297                  * NOTE! deficit is differentiated from avail_shortage as
2298                  *       REQUIRING at least (deficit) pages to be cleaned,
2299                  *       even if the page queues are in good shape.  This
2300                  *       is used primarily for handling per-process
2301                  *       RLIMIT_RSS and may also see small values when
2302                  *       processes block due to low memory.
2303                  */
2304                 vmstats_rollup();
2305                 if (isep == 0)
2306                         vm_pagedaemon_time = ticks;
2307                 avail_shortage = vm_paging_target() + vm_pageout_deficit;
2308                 vm_pageout_deficit = 0;
2309
2310                 if (avail_shortage > 0) {
2311                         long delta = 0;
2312                         long counts[4] = { 0, 0, 0, 0 };
2313                         int qq;
2314
2315                         if (vm_pageout_debug) {
2316                                 kprintf("scan_inactive pass %d isep=%d\t",
2317                                         pass / MAXSCAN_DIVIDER, isep);
2318                         }
2319
2320                         qq = q1iterator;
2321                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2322                                 delta += vm_pageout_scan_inactive(
2323                                             pass / MAXSCAN_DIVIDER,
2324                                             qq & PQ_L2_MASK,
2325                                             PQAVERAGE(avail_shortage),
2326                                             &vnodes_skipped, counts);
2327                                 if (isep)
2328                                         --qq;
2329                                 else
2330                                         ++qq;
2331                                 if (avail_shortage - delta <= 0)
2332                                         break;
2333
2334                                 /*
2335                                  * It is possible for avail_shortage to be
2336                                  * very large.  If a large program exits or
2337                                  * frees a ton of memory all at once, we do
2338                                  * not have to continue deactivations.
2339                                  *
2340                                  * (We will still run the active->inactive
2341                                  * target, however).
2342                                  */
2343                                 if (!vm_page_count_target() &&
2344                                     !vm_page_count_min(
2345                                                 vm_page_free_hysteresis)) {
2346                                         avail_shortage = 0;
2347                                         break;
2348                                 }
2349                         }
2350                         if (vm_pageout_debug) {
2351                                 kprintf("flushed %ld cleaned %ld "
2352                                         "lru2 %ld react %ld "
2353                                         "delta %ld\n",
2354                                         counts[0], counts[1],
2355                                         counts[2], counts[3],
2356                                         delta);
2357                         }
2358                         avail_shortage -= delta;
2359                         q1iterator = qq;
2360                 }
2361
2362                 /*
2363                  * Figure out how many active pages we must deactivate.  If
2364                  * we were able to reach our target with just the inactive
2365                  * scan above we limit the number of active pages we
2366                  * deactivate to reduce unnecessary work.
2367                  */
2368                 vmstats_rollup();
2369                 if (isep == 0)
2370                         vm_pagedaemon_time = ticks;
2371                 inactive_shortage = vmstats.v_inactive_target -
2372                                     vmstats.v_inactive_count;
2373
2374                 /*
2375                  * If we were unable to free sufficient inactive pages to
2376                  * satisfy the free/cache queue requirements then simply
2377                  * reaching the inactive target may not be good enough.
2378                  * Try to deactivate pages in excess of the target based
2379                  * on the shortfall.
2380                  *
2381                  * However to prevent thrashing the VM system do not
2382                  * deactivate more than an additional 1/10 the inactive
2383                  * target's worth of active pages.
2384                  */
2385                 if (avail_shortage > 0) {
2386                         tmp = avail_shortage * 2;
2387                         if (tmp > vmstats.v_inactive_target / 10)
2388                                 tmp = vmstats.v_inactive_target / 10;
2389                         inactive_shortage += tmp;
2390                 }
2391
2392                 /*
2393                  * Only trigger a pmap cleanup on inactive shortage.
2394                  */
2395                 if (isep == 0 && inactive_shortage > 0) {
2396                         pmap_collect();
2397                 }
2398
2399                 /*
2400                  * Scan for ACTIVE->INACTIVE
2401                  *
2402                  * Only trigger on inactive shortage.  Triggering on
2403                  * avail_shortage can starve the active queue with
2404                  * unnecessary active->inactive transitions and destroy
2405                  * performance.
2406                  *
2407                  * If this is the emergency pager, always try to move
2408                  * a few pages from active to inactive because the inactive
2409                  * queue might have enough pages, but not enough anonymous
2410                  * pages.
2411                  */
2412                 if (isep && inactive_shortage < vm_emerg_launder)
2413                         inactive_shortage = vm_emerg_launder;
2414
2415                 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2416                         long delta = 0;
2417                         int qq;
2418
2419                         qq = q2iterator;
2420                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2421                                 delta += vm_pageout_scan_active(
2422                                                 pass / MAXSCAN_DIVIDER,
2423                                                 qq & PQ_L2_MASK,
2424                                                 PQAVERAGE(avail_shortage),
2425                                                 PQAVERAGE(inactive_shortage),
2426                                                 &recycle_count);
2427                                 if (isep)
2428                                         --qq;
2429                                 else
2430                                         ++qq;
2431                                 if (inactive_shortage - delta <= 0 &&
2432                                     avail_shortage - delta <= 0) {
2433                                         break;
2434                                 }
2435
2436                                 /*
2437                                  * inactive_shortage can be a very large
2438                                  * number.  This is intended to break out
2439                                  * early if our inactive_target has been
2440                                  * reached due to other system activity.
2441                                  */
2442                                 if (vmstats.v_inactive_count >
2443                                     vmstats.v_inactive_target) {
2444                                         inactive_shortage = 0;
2445                                         break;
2446                                 }
2447                         }
2448                         inactive_shortage -= delta;
2449                         avail_shortage -= delta;
2450                         q2iterator = qq;
2451                 }
2452
2453                 /*
2454                  * Scan for CACHE->FREE
2455                  *
2456                  * Finally free enough cache pages to meet our free page
2457                  * requirement and take more drastic measures if we are
2458                  * still in trouble.
2459                  */
2460                 vmstats_rollup();
2461                 if (isep == 0)
2462                         vm_pagedaemon_time = ticks;
2463                 vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
2464                                       vnodes_skipped, recycle_count);
2465
2466                 /*
2467                  * This is a bit sophisticated because we do not necessarily
2468                  * want to force paging until our targets are reached if we
2469                  * were able to successfully retire the shortage we calculated.
2470                  */
2471                 if (avail_shortage > 0) {
2472                         /*
2473                          * If we did not retire enough pages continue the
2474                          * pageout operation until we are able to.  It
2475                          * takes MAXSCAN_DIVIDER passes to cover the entire
2476                          * inactive list.
2477                          */
2478                         ++pass;
2479
2480                         if (pass / MAXSCAN_DIVIDER < 10 &&
2481                             vm_pages_needed > 1) {
2482                                 /*
2483                                  * Normal operation, additional processes
2484                                  * have already kicked us.  Retry immediately
2485                                  * unless swap space is completely full in
2486                                  * which case delay a bit.
2487                                  */
2488                                 if (swap_pager_full) {
2489                                         tsleep(&vm_pages_needed, 0, "pdelay",
2490                                                 hz / 5);
2491                                 } /* else immediate retry */
2492                         } else if (pass / MAXSCAN_DIVIDER < 10) {
2493                                 /*
2494                                  * Do a short sleep for the first 10 passes,
2495                                  * allow the sleep to be woken up by resetting
2496                                  * vm_pages_needed to 1 (NOTE: we are still
2497                                  * active paging!).
2498                                  */
2499                                 if (isep == 0)
2500                                         vm_pages_needed = 1;
2501                                 tsleep(&vm_pages_needed, 0, "pdelay", 2);
2502                         } else if (swap_pager_full == 0) {
2503                                 /*
2504                                  * We've taken too many passes, force a
2505                                  * longer delay.
2506                                  */
2507                                 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2508                         } else {
2509                                 /*
2510                                  * Running out of memory, catastrophic
2511                                  * back-off to one-second intervals.
2512                                  */
2513                                 tsleep(&vm_pages_needed, 0, "pdelay", hz);
2514                         }
2515                 } else if (vm_pages_needed) {
2516                         /*
2517                          * We retired our calculated shortage but we may have
2518                          * to continue paging if threads drain memory too far
2519                          * below our target.
2520                          *
2521                          * Similar to vm_page_free_wakeup() in vm_page.c.
2522                          */
2523                         pass = 0;
2524                         if (!vm_paging_needed(0)) {
2525                                 /* still more than half-way to our target */
2526                                 vm_pages_needed = 0;
2527                                 wakeup(&vmstats.v_free_count);
2528                         } else
2529                         if (!vm_page_count_min(vm_page_free_hysteresis)) {
2530                                 /*
2531                                  * Continue operations with wakeup
2532                                  * (set variable to avoid overflow)
2533                                  */
2534                                 vm_pages_needed = 2;
2535                                 wakeup(&vmstats.v_free_count);
2536                         } else {
2537                                 /*
2538                                  * No wakeup() needed, continue operations.
2539                                  * (set variable to avoid overflow)
2540                                  */
2541                                 vm_pages_needed = 2;
2542                         }
2543                 } else {
2544                         /*
2545                          * Turn paging back on immediately if we are under
2546                          * minimum.
2547                          */
2548                         pass = 0;
2549                 }
2550         }
2551 }
2552
2553 static struct kproc_desc pg1_kp = {
2554         "pagedaemon",
2555         vm_pageout_thread,
2556         &pagethread
2557 };
2558 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);
2559
2560 static struct kproc_desc pg2_kp = {
2561         "emergpager",
2562         vm_pageout_thread,
2563         &emergpager
2564 };
2565 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);
2566
2567
2568 /*
2569  * Called after allocating a page out of the cache or free queue
2570  * to possibly wake the pagedaemon up to replentish our supply.
2571  *
2572  * We try to generate some hysteresis by waking the pagedaemon up
2573  * when our free+cache pages go below the free_min+cache_min level.
2574  * The pagedaemon tries to get the count back up to at least the
2575  * minimum, and through to the target level if possible.
2576  *
2577  * If the pagedaemon is already active bump vm_pages_needed as a hint
2578  * that there are even more requests pending.
2579  *
2580  * SMP races ok?
2581  * No requirements.
2582  */
2583 void
2584 pagedaemon_wakeup(void)
2585 {
2586         if (vm_paging_needed(0) && curthread != pagethread) {
2587                 if (vm_pages_needed <= 1) {
2588                         vm_pages_needed = 1;            /* SMP race ok */
2589                         wakeup(&vm_pages_needed);       /* tickle pageout */
2590                 } else if (vm_page_count_min(0)) {
2591                         ++vm_pages_needed;              /* SMP race ok */
2592                         /* a wakeup() would be wasted here */
2593                 }
2594         }
2595 }
2596
2597 #if !defined(NO_SWAPPING)
2598
2599 /*
2600  * SMP races ok?
2601  * No requirements.
2602  */
2603 static void
2604 vm_req_vmdaemon(void)
2605 {
2606         static int lastrun = 0;
2607
2608         if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2609                 wakeup(&vm_daemon_needed);
2610                 lastrun = ticks;
2611         }
2612 }
2613
2614 static int vm_daemon_callback(struct proc *p, void *data __unused);
2615
2616 /*
2617  * No requirements.
2618  *
2619  * Scan processes for exceeding their rlimits, deactivate pages
2620  * when RSS is exceeded.
2621  */
2622 static void
2623 vm_daemon(void)
2624 {
2625         while (TRUE) {
2626                 tsleep(&vm_daemon_needed, 0, "psleep", 0);
2627                 allproc_scan(vm_daemon_callback, NULL, 0);
2628         }
2629 }
2630
2631 static int
2632 vm_daemon_callback(struct proc *p, void *data __unused)
2633 {
2634         struct vmspace *vm;
2635         vm_pindex_t limit, size;
2636
2637         /*
2638          * if this is a system process or if we have already
2639          * looked at this process, skip it.
2640          */
2641         lwkt_gettoken(&p->p_token);
2642
2643         if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2644                 lwkt_reltoken(&p->p_token);
2645                 return (0);
2646         }
2647
2648         /*
2649          * if the process is in a non-running type state,
2650          * don't touch it.
2651          */
2652         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2653                 lwkt_reltoken(&p->p_token);
2654                 return (0);
2655         }
2656
2657         /*
2658          * get a limit
2659          */
2660         limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2661                                 p->p_rlimit[RLIMIT_RSS].rlim_max));
2662
2663         vm = p->p_vmspace;
2664         vmspace_hold(vm);
2665         size = pmap_resident_tlnw_count(&vm->vm_pmap);
2666         if (limit >= 0 && size > 4096 &&
2667             size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
2668                 vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2669         }
2670         vmspace_drop(vm);
2671
2672         lwkt_reltoken(&p->p_token);
2673
2674         return (0);
2675 }
2676
2677 #endif