kernel - Add flexibility to the RSS rlimit
[dragonfly.git] / sys / vm / vm_pageout.c
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * The Mach Operating System project at Carnegie-Mellon University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      from: @(#)vm_pageout.c  7.4 (Berkeley) 5/7/91
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
65  */
66
67 /*
68  *      The proverbial page-out daemon.
69  */
70
71 #include "opt_vm.h"
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/kernel.h>
75 #include <sys/proc.h>
76 #include <sys/kthread.h>
77 #include <sys/resourcevar.h>
78 #include <sys/signalvar.h>
79 #include <sys/vnode.h>
80 #include <sys/vmmeter.h>
81 #include <sys/sysctl.h>
82
83 #include <vm/vm.h>
84 #include <vm/vm_param.h>
85 #include <sys/lock.h>
86 #include <vm/vm_object.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_map.h>
89 #include <vm/vm_pageout.h>
90 #include <vm/vm_pager.h>
91 #include <vm/swap_pager.h>
92 #include <vm/vm_extern.h>
93
94 #include <sys/thread2.h>
95 #include <sys/spinlock2.h>
96 #include <vm/vm_page2.h>
97
98 /*
99  * System initialization
100  */
101
102 /* the kernel process "vm_pageout"*/
103 static int vm_pageout_page(vm_page_t m, int *max_launderp,
104                            int *vnodes_skippedp, struct vnode **vpfailedp,
105                            int pass, int vmflush_flags);
106 static int vm_pageout_clean_helper (vm_page_t, int);
107 static int vm_pageout_free_page_calc (vm_size_t count);
108 static void vm_pageout_page_free(vm_page_t m);
109 struct thread *pagethread;
110
111 #if !defined(NO_SWAPPING)
112 /* the kernel process "vm_daemon"*/
113 static void vm_daemon (void);
114 static struct   thread *vmthread;
115
116 static struct kproc_desc vm_kp = {
117         "vmdaemon",
118         vm_daemon,
119         &vmthread
120 };
121 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
122 #endif
123
124 int vm_pages_needed = 0;        /* Event on which pageout daemon sleeps */
125 int vm_pageout_deficit = 0;     /* Estimated number of pages deficit */
126 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
127 int vm_page_free_hysteresis = 16;
128
129 #if !defined(NO_SWAPPING)
130 static int vm_pageout_req_swapout;
131 static int vm_daemon_needed;
132 #endif
133 static int vm_max_launder = 4096;
134 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
135 static int vm_pageout_full_stats_interval = 0;
136 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
137 static int defer_swap_pageouts=0;
138 static int disable_swap_pageouts=0;
139 static u_int vm_anonmem_decline = ACT_DECLINE;
140 static u_int vm_filemem_decline = ACT_DECLINE * 2;
141
142 #if defined(NO_SWAPPING)
143 static int vm_swap_enabled=0;
144 static int vm_swap_idle_enabled=0;
145 #else
146 static int vm_swap_enabled=1;
147 static int vm_swap_idle_enabled=0;
148 #endif
149 int vm_pageout_memuse_mode=1;   /* 0-disable, 1-passive, 2-active swp*/
150
151 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
152         CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
153
154 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
155         CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
156
157 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
158         CTLFLAG_RW, &vm_page_free_hysteresis, 0,
159         "Free more pages than the minimum required");
160
161 SYSCTL_INT(_vm, OID_AUTO, max_launder,
162         CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
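/*
 * Runtime tunable, e.g. "sysctl vm.max_launder=32" caps the number of
 * dirty-page flushes attempted during an inactive-queue scan pass.
 */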
163
164 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
165         CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
166
167 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
168         CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
169
170 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
171         CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
172
173 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
174         CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
175 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
176         CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
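/*
 * vm.pageout_memuse_mode selects how RLIMIT_RSS overage is handled:
 * 0 disables enforcement, 1 passively unmaps pages from the offending
 * pmap, and 2 additionally pages them out (see
 * vm_pageout_object_deactivate_pages_callback()).
 */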
177
178 #if defined(NO_SWAPPING)
179 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
180         CTLFLAG_RD, &vm_swap_enabled, 0, "");
181 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
182         CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
183 #else
184 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
185         CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
186 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
187         CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
188 #endif
189
190 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
191         CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
192
193 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
194         CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
195
196 static int pageout_lock_miss;
197 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
198         CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
199
200 int vm_page_max_wired;          /* XXX max # of wired pages system-wide */
201
202 #if !defined(NO_SWAPPING)
203 static vm_pindex_t vm_pageout_object_deactivate_pages(vm_map_t map,
204                         vm_object_t object, vm_pindex_t limit,
205                         vm_pindex_t obj_beg, vm_pindex_t obj_end);
206 static void vm_req_vmdaemon (void);
207 #endif
208 static void vm_pageout_page_stats(int q);
209
210 /*
211  * Calculate approximately how many pages on each queue to try to
212  * clean.  An exact calculation creates an edge condition when the
213  * queues are unbalanced so add significant slop.  The queue scans
214  * will stop early when targets are reached and will start where they
215  * left off on the next pass.
216  *
217  * We need to be generous here because there are all sorts of loading
218  * conditions that can cause edge cases if we try to average over all queues.
219  * In particular, storage subsystems have become so fast that paging
220  * activity can become quite frantic.  Eventually we will probably need
221  * two paging threads, one for dirty pages and one for clean, to deal
222  * with the bandwidth requirements.
223  *
224  * So what we do is calculate a value that can be satisfied nominally by
225  * only having to scan half the queues.
226  */
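/*
 * For example, assuming PQ_L2_SIZE is 256: a shortage of n = 1000 yields
 * (1000 + 255) / 128 + 1 = 10 pages per queue, so scanning roughly half
 * of the 256 queues (128 * 10 = 1280) nominally covers the shortage.
 */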
227 static __inline int
228 PQAVERAGE(int n)
229 {
230         int avg;
231
232         if (n >= 0) {
233                 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
234         } else {
235                 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
236         }
237         return avg;
238 }
239
240 /*
241  * vm_pageout_clean_helper:
242  *
243  * Clean the page and remove it from the laundry.  The page must be
244  * busied by the caller and will be disposed of by this function.
245  *
246  * The busy bit causes potential page faults on this page to block while
247  * we build the flush cluster.  Note the careful timing: we cannot do
248  * anything that might otherwise mess with the page.
249  */
250 static int
251 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
252 {
253         vm_object_t object;
254         vm_page_t mc[BLIST_MAX_ALLOC];
255         int error;
256         int ib, is, page_base;
257         vm_pindex_t pindex = m->pindex;
258
259         object = m->object;
260
261         /*
262          * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
263          * with the new swapper, but we could have serious problems paging
264          * out other object types if there is insufficient memory.  
265          *
266          * Unfortunately, checking free memory here is far too late, so the
267          * check has been moved up a procedural level.
268          */
269
270         /*
271          * Don't mess with the page if it's busy, held, or special
272          *
273          * XXX do we really need to check hold_count here?  hold_count
274          * isn't supposed to mess with vm_page ops except prevent the
275          * page from being reused.
276          */
277         if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
278                 vm_page_wakeup(m);
279                 return 0;
280         }
281
282         /*
283          * Place page in cluster.  Align cluster for optimal swap space
284          * allocation (whether it is swap or not).  This is typically ~16-32
285          * pages, which also tends to align the cluster to multiples of the
286          * filesystem block size if backed by a filesystem.
287          */
288         page_base = pindex % BLIST_MAX_ALLOC;
289         mc[page_base] = m;
290         ib = page_base - 1;
291         is = page_base + 1;
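	/*
	 * Example, assuming BLIST_MAX_ALLOC is 16: a page at pindex 37 is
	 * placed in slot 37 % 16 = 5, so the reverse scan below covers
	 * pindexes 36..32 and the forward scan covers 38..47, keeping the
	 * flush aligned to a 16-page swap allocation boundary.
	 */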
292
293         /*
294          * Scan object for clusterable pages.
295          *
296          * We can cluster ONLY if the page is NOT
297          * clean, wired, busy, held, or mapped into a
298          * buffer, and one of the following:
299          * 1) The page is inactive, or a seldom used
300          *    active page.
301          * -or-
302          * 2) we force the issue.
303          *
304          * During heavy mmap/modification loads the pageout
305          * daemon can really fragment the underlying file
306          * due to flushing pages out of order and not trying to
307          * align the clusters (which leaves sporadic out-of-order
308          * holes).  To solve this problem we do the reverse scan
309          * first and attempt to align our cluster, then do a 
310          * forward scan if room remains.
311          */
312         vm_object_hold(object);
313
314         while (ib >= 0) {
315                 vm_page_t p;
316
317                 p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
318                                             TRUE, &error);
319                 if (error || p == NULL)
320                         break;
321                 if ((p->queue - p->pc) == PQ_CACHE ||
322                     (p->flags & PG_UNMANAGED)) {
323                         vm_page_wakeup(p);
324                         break;
325                 }
326                 vm_page_test_dirty(p);
327                 if (((p->dirty & p->valid) == 0 &&
328                      (p->flags & PG_NEED_COMMIT) == 0) ||
329                     p->wire_count != 0 ||       /* may be held by buf cache */
330                     p->hold_count != 0) {       /* may be undergoing I/O */
331                         vm_page_wakeup(p);
332                         break;
333                 }
334                 if (p->queue - p->pc != PQ_INACTIVE) {
335                         if (p->queue - p->pc != PQ_ACTIVE ||
336                             (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
337                                 vm_page_wakeup(p);
338                                 break;
339                         }
340                 }
341
342                 /*
343                  * Try to maintain page groupings in the cluster.
344                  */
345                 if (m->flags & PG_WINATCFLS)
346                         vm_page_flag_set(p, PG_WINATCFLS);
347                 else
348                         vm_page_flag_clear(p, PG_WINATCFLS);
349                 p->act_count = m->act_count;
350
351                 mc[ib] = p;
352                 --ib;
353         }
354         ++ib;   /* fixup */
355
356         while (is < BLIST_MAX_ALLOC &&
357                pindex - page_base + is < object->size) {
358                 vm_page_t p;
359
360                 p = vm_page_lookup_busy_try(object, pindex - page_base + is,
361                                             TRUE, &error);
362                 if (error || p == NULL)
363                         break;
364                 if (((p->queue - p->pc) == PQ_CACHE) ||
365                     (p->flags & PG_UNMANAGED)) {
366                         vm_page_wakeup(p);
367                         break;
368                 }
369                 vm_page_test_dirty(p);
370                 if (((p->dirty & p->valid) == 0 &&
371                      (p->flags & PG_NEED_COMMIT) == 0) ||
372                     p->wire_count != 0 ||       /* may be held by buf cache */
373                     p->hold_count != 0) {       /* may be undergoing I/O */
374                         vm_page_wakeup(p);
375                         break;
376                 }
377                 if (p->queue - p->pc != PQ_INACTIVE) {
378                         if (p->queue - p->pc != PQ_ACTIVE ||
379                             (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
380                                 vm_page_wakeup(p);
381                                 break;
382                         }
383                 }
384
385                 /*
386                  * Try to maintain page groupings in the cluster.
387                  */
388                 if (m->flags & PG_WINATCFLS)
389                         vm_page_flag_set(p, PG_WINATCFLS);
390                 else
391                         vm_page_flag_clear(p, PG_WINATCFLS);
392                 p->act_count = m->act_count;
393
394                 mc[is] = p;
395                 ++is;
396         }
397
398         vm_object_drop(object);
399
400         /*
401          * we allow reads during pageouts...
402          */
403         return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
404 }
405
406 /*
407  * vm_pageout_flush() - launder the given pages
408  *
409  *      The given pages are laundered.  Note that we set up for the start of
410  *      I/O (i.e. busy the page), mark it read-only, and bump the object
411  *      reference count all in here rather than in the parent.  If we want
412  *      the parent to do more sophisticated things we may have to change
413  *      the ordering.
414  *
415  *      The pages in the array must be busied by the caller and will be
416  *      unbusied by this function.
417  */
418 int
419 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
420 {
421         vm_object_t object;
422         int pageout_status[count];
423         int numpagedout = 0;
424         int i;
425
426         /*
427          * Initiate I/O.  Bump the vm_page_t->busy counter.
428          */
429         for (i = 0; i < count; i++) {
430                 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
431                         ("vm_pageout_flush page %p index %d/%d: partially "
432                          "invalid page", mc[i], i, count));
433                 vm_page_io_start(mc[i]);
434         }
435
436         /*
437          * We must make the pages read-only.  This will also force the
438          * modified bit in the related pmaps to be cleared.  The pager
439          * cannot clear the bit for us since the I/O completion code
440          * typically runs from an interrupt.  The act of making the page
441          * read-only handles the case for us.
442          *
443          * Then we can unbusy the pages, we still hold a reference by virtue
444          * of our soft-busy.
445          */
446         for (i = 0; i < count; i++) {
447                 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
448                         vm_page_protect(mc[i], VM_PROT_NONE);
449                 else
450                         vm_page_protect(mc[i], VM_PROT_READ);
451                 vm_page_wakeup(mc[i]);
452         }
453
454         object = mc[0]->object;
455         vm_object_pip_add(object, count);
456
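	/*
	 * Pageouts of kernel_object pages are forced synchronous; everything
	 * else honors the caller's vmflush_flags.
	 */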
457         vm_pager_put_pages(object, mc, count,
458             (vmflush_flags |
459              ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
460             pageout_status);
461
462         for (i = 0; i < count; i++) {
463                 vm_page_t mt = mc[i];
464
465                 switch (pageout_status[i]) {
466                 case VM_PAGER_OK:
467                         numpagedout++;
468                         break;
469                 case VM_PAGER_PEND:
470                         numpagedout++;
471                         break;
472                 case VM_PAGER_BAD:
473                         /*
474                          * Page outside of range of object. Right now we
475                          * essentially lose the changes by pretending it
476                          * worked.
477                          */
478                         vm_page_busy_wait(mt, FALSE, "pgbad");
479                         pmap_clear_modify(mt);
480                         vm_page_undirty(mt);
481                         vm_page_wakeup(mt);
482                         break;
483                 case VM_PAGER_ERROR:
484                 case VM_PAGER_FAIL:
485                         /*
486                          * A page typically cannot be paged out when we
487                          * have run out of swap.  We leave the page
488                          * marked inactive and will try to page it out
489                          * again later.
490                          *
491                          * Starvation of the active page list is used to
492                          * determine when the system is massively memory
493                          * starved.
494                          */
495                         break;
496                 case VM_PAGER_AGAIN:
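			/*
			 * Transient pager failure; the page is left alone
			 * and will be revisited on a later pass.
			 */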
497                         break;
498                 }
499
500                 /*
501                  * If not PENDing this was a synchronous operation and we
502                  * clean up after the I/O.  If it is PENDing the mess is
503                  * cleaned up asynchronously.
504                  *
505                  * Also nominally act on the caller's wishes if the caller
506                  * wants to try to really clean (cache or free) the page.
507                  *
508                  * Also nominally deactivate the page if the system is
509                  * memory-stressed.
510                  */
511                 if (pageout_status[i] != VM_PAGER_PEND) {
512                         vm_page_busy_wait(mt, FALSE, "pgouw");
513                         vm_page_io_finish(mt);
514                         if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
515                                 vm_page_try_to_cache(mt);
516                         } else if (vm_page_count_severe()) {
517                                 vm_page_deactivate(mt);
518                                 vm_page_wakeup(mt);
519                         } else {
520                                 vm_page_wakeup(mt);
521                         }
522                         vm_object_pip_wakeup(object);
523                 }
524         }
525         return numpagedout;
526 }
527
528 #if !defined(NO_SWAPPING)
529
530 /*
531  * Deactivate pages until the map RSS falls below the specified limit.
532  *
533  * This code is part of the process rlimit and vm_daemon handler and not
534  * part of the normal demand-paging code.  We only check the top-level
535  * object.
536  *
537  * The map must be locked.
538  * The caller must hold the vm_object.
539  */
540 static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *);
541 static int vm_pageout_object_deactivate_pages_cmp(vm_page_t, void *);
542
543 static vm_pindex_t
544 vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
545                                    vm_pindex_t limit,
546                                    vm_pindex_t obj_beg,
547                                    vm_pindex_t obj_end)
548 {
549         struct rb_vm_page_scan_info info;
550         int remove_mode;
551
552         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
553
554         info.count = 0;
555         info.backing_offset_index = obj_beg;
556         info.backing_object = object;
557
558         for (;;) {
559                 vm_pindex_t advance;
560
561                 if (pmap_resident_tlnw_count(vm_map_pmap(map)) <= limit)
562                         break;
563                 if (object->type == OBJT_DEVICE ||
564                     object->type == OBJT_MGTDEVICE ||
565                     object->type == OBJT_PHYS) {
566                         break;
567                 }
568 #if 0
569                 if (object->paging_in_progress)
570                         break;
571 #endif
572
573                 remove_mode = 0;
574                 if (object->shadow_count > 1)
575                         remove_mode = 1;
576
577                 /*
578                  * Scan the object's entire memory queue.  We hold the
579                  * object's token so the scan should not race anything.
580                  *
581                  * The callback will adjust backing_offset_index past the
582                  * last index scanned.  This value only matters if we
583                  * terminate early.
584                  */
585                 info.limit = remove_mode;
586                 info.map = map;
587                 info.desired = limit;
588                 info.start_pindex = obj_beg;
589                 info.end_pindex = obj_end;
590                 info.object = object;
591
592                 vm_page_rb_tree_RB_SCAN(&object->rb_memq,
593                                 vm_pageout_object_deactivate_pages_cmp,
594                                 vm_pageout_object_deactivate_pages_callback,
595                                 &info);
596
597                 /*
598                  * Backing object recursion (we will loop up).
599                  */
600                 while ((object = info.object->backing_object) != NULL) {
601                         vm_object_hold(object);
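			/*
			 * vm_object_hold() can block, so the backing object
			 * may have changed underneath us; if so, drop the
			 * hold and re-resolve it.
			 */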
602                         if (object != info.object->backing_object) {
603                                 vm_object_drop(object);
604                                 continue;
605                         }
606                         break;
607                 }
608                 if (object == NULL) {
609                         if (info.object != info.backing_object)
610                                 vm_object_drop(info.object);
611                         break;
612                 }
613                 advance = OFF_TO_IDX(info.object->backing_object_offset);
614                 info.start_pindex += advance;
615                 info.end_pindex += advance;
616                 info.backing_offset_index += advance;
617                 if (info.object != info.backing_object) {
618                         vm_object_lock_swap();
619                         vm_object_drop(info.object);
620                 }
621                 info.object = object;
622         }
623
624         /*
625          * Return how far we want the caller to advance.  The caller will
626          * ignore this value and use obj_end if the RSS limit is still not
627          * satisfied.
628          */
629         return (info.backing_offset_index - info.start_pindex);
630 }
631
632 /*
633  * Only page indices above start_pindex
634  */
635 static
636 int
637 vm_pageout_object_deactivate_pages_cmp(vm_page_t p, void *data)
638 {
639         struct rb_vm_page_scan_info *info = data;
640
641         if (p->pindex < info->start_pindex)
642                 return -1;
643         if (p->pindex >= info->end_pindex)
644                 return +1;
645         return 0;
646 }
647
648 /*
649  * The caller must hold the vm_object.
650  *
651  * info->count is bumped for every page removed from the process pmap.
652  *
653  * info->backing_offset_index is updated past the last scanned page.
654  * This value will be ignored and the scan forced to the mapent boundary
655  * by the caller if the resident count remains too high.
656  */
657 static int
658 vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
659 {
660         struct rb_vm_page_scan_info *info = data;
661         int actcount;
662         int cleanit = 0;
663
664         /*
665          * Basic tests - There should never be a marker, and we can stop
666          *               once the RSS is below the required level.
667          */
668         KKASSERT((p->flags & PG_MARKER) == 0);
669         if (pmap_resident_tlnw_count(vm_map_pmap(info->map)) <=
670             info->desired) {
671                 return(-1);
672         }
673
674         mycpu->gd_cnt.v_pdpages++;
675         info->backing_offset_index = p->pindex + 1;
676
677         if (vm_page_busy_try(p, TRUE))
678                 return(0);
679
680         if (p->object != info->object) {
681                 vm_page_wakeup(p);
682                 return(0);
683         }
684         if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
685                 vm_page_wakeup(p);
686                 goto done;
687         }
688         if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) {
689                 vm_page_wakeup(p);
690                 goto done;
691         }
692
693         actcount = pmap_ts_referenced(p);
694         if (actcount) {
695                 vm_page_flag_set(p, PG_REFERENCED);
696         } else if (p->flags & PG_REFERENCED) {
697                 actcount = 1;
698         }
699
700         vm_page_and_queue_spin_lock(p);
701         if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) {
702                 vm_page_and_queue_spin_unlock(p);
703                 vm_page_activate(p);
704                 p->act_count += actcount;
705                 vm_page_flag_clear(p, PG_REFERENCED);
706         } else if (p->queue - p->pc == PQ_ACTIVE) {
707                 if ((p->flags & PG_REFERENCED) == 0) {
708                         /* use ACT_ADVANCE for a faster decline */
709                         p->act_count -= min(p->act_count, ACT_ADVANCE);
710                         if (!info->limit &&
711                             (vm_pageout_algorithm || (p->act_count == 0))) {
712                                 vm_page_and_queue_spin_unlock(p);
713                                 vm_page_deactivate(p);
714                                 cleanit = 1;
715                         } else {
716                                 TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
717                                              p, pageq);
718                                 TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
719                                                   p, pageq);
720                                 vm_page_and_queue_spin_unlock(p);
721                         }
722                 } else {
723                         vm_page_and_queue_spin_unlock(p);
724                         vm_page_activate(p);
725                         vm_page_flag_clear(p, PG_REFERENCED);
726
727                         vm_page_and_queue_spin_lock(p);
728                         if (p->queue - p->pc == PQ_ACTIVE) {
729                                 if (p->act_count < (ACT_MAX - ACT_ADVANCE))
730                                         p->act_count += ACT_ADVANCE;
731                                 TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
732                                              p, pageq);
733                                 TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
734                                                   p, pageq);
735                         }
736                         vm_page_and_queue_spin_unlock(p);
737                 }
738         } else if (p->queue - p->pc == PQ_INACTIVE) {
739 #if 0
740                 TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
741                              p, pageq);
742                 TAILQ_INSERT_HEAD(&vm_page_queues[p->queue].pl,
743                                   p, pageq);
744 #endif
745                 /* use ACT_ADVANCE for a faster decline */
746                 p->act_count -= min(p->act_count, ACT_ADVANCE);
747                 vm_page_and_queue_spin_unlock(p);
748                 if (p->act_count == 0) {
749                         cleanit = 1;
750                 }
751         } else {
752                 vm_page_and_queue_spin_unlock(p);
753         }
754
755         /*
756          * Ok, try to fully clean the page and any nearby pages such that at
757          * least the requested page is freed or moved to the cache queue.
758          *
759          * We usually do this synchronously to allow us to get the page into
760          * the CACHE queue quickly, which will prevent memory exhaustion if
761          * a process with a memoryuse limit is running away.  However, the
762          * sysadmin may desire to set vm.swap_user_async which relaxes this
763          * and improves write performance.
764          */
765         if (cleanit) {
766                 int max_launder = 0x7FFF;
767                 int vnodes_skipped = 0;
768                 int vmflush_flags;
769                 struct vnode *vpfailed = NULL;
770
771                 vmflush_flags = VM_PAGER_TRY_TO_CACHE | VM_PAGER_ALLOW_ACTIVE;
772                 if (swap_user_async == 0)
773                         vmflush_flags |= VM_PAGER_PUT_SYNC;
774
775                 if (vm_pageout_memuse_mode >= 1)
776                         vm_page_protect(p, VM_PROT_NONE);
777                 if (vm_pageout_memuse_mode >= 2) {
778                         vm_page_flag_set(p, PG_WINATCFLS);
779                         info->count += vm_pageout_page(p, &max_launder,
780                                                        &vnodes_skipped,
781                                                        &vpfailed, 1, vmflush_flags);
782                 } else {
783                         ++info->count;
784                         vm_page_wakeup(p);
785                 }
786         } else {
787                 vm_page_wakeup(p);
788         }
789
790 done:
791         lwkt_user_yield();
792         return(0);
793 }
794
795 /*
796  * Deactivate some number of pages in a map due to set RLIMIT_RSS limits;
797  * this is relatively difficult to do.
798  *
799  * Called when vm_pageout_memuse_mode is >= 1.
800  */
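/*
 * Illustrative userland usage (not part of this file): a process typically
 * becomes subject to this enforcement by setting an RSS limit, e.g.
 *
 *	struct rlimit rl = { .rlim_cur = 64UL << 20, .rlim_max = 64UL << 20 };
 *	setrlimit(RLIMIT_RSS, &rl);
 *
 * after which the vm_daemon may invoke this routine on the process' map
 * once its resident set exceeds the limit and vm.pageout_memuse_mode >= 1.
 */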
801 void
802 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
803 {
804         vm_map_entry_t tmpe;
805         vm_object_t obj;
806         vm_ooffset_t pgout_offset;
807         vm_ooffset_t tmpe_end;
808         vm_pindex_t obj_beg;
809         vm_pindex_t obj_end;
810         vm_pindex_t count;
811         int retries = 3;
812
813         lockmgr(&map->lock, LK_EXCLUSIVE);
814
815         /*
816          * Scan the map incrementally.
817          */
818         pgout_offset = map->pgout_offset;
819 again:
820         tmpe = map->header.next;
821         obj_beg = 0;
822         obj_end = 0;
823         tmpe_end = 0;
824         obj = NULL;
825
826         while (tmpe != &map->header) {
827                 if (tmpe->end <= pgout_offset) {
828                         tmpe = tmpe->next;
829                         continue;
830                 }
831                 if (tmpe->maptype == VM_MAPTYPE_NORMAL ||
832                     tmpe->maptype == VM_MAPTYPE_VPAGETABLE) {
833                         obj = tmpe->object.vm_object;
834                         if (obj && obj->shadow_count <= 1) {
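				/*
				 * Convert the map entry's address range,
				 * clipped to the scan resume point, into page
				 * indices within the backing object using the
				 * entry's offset into that object.
				 */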
835                                 if (pgout_offset < tmpe->start) {
836                                         obj_beg = tmpe->offset >> PAGE_SHIFT;
837                                         obj_end = ((tmpe->end - tmpe->start) +
838                                                    tmpe->offset) >> PAGE_SHIFT;
839                                 } else {
840                                         obj_beg = (pgout_offset - tmpe->start +
841                                                    tmpe->offset) >> PAGE_SHIFT;
842                                         obj_end = (tmpe->end - tmpe->start +
843                                                    tmpe->offset) >> PAGE_SHIFT;
844                                 }
845                                 tmpe_end = tmpe->end;
846                                 break;
847                         }
848                         obj = NULL;
849                 }
850                 tmpe = tmpe->next;
851         }
852
853         /*
854          * Attempt to continue where we left off until the RLIMIT is
855          * satisfied or we run out of retries.  Note that the map remains
856          * locked, so the program is not going to be taking any faults
857          * while we are doing this.
858          *
859          * Only circle around in this particular function when the
860          * memuse_mode is >= 2.
861          */
862         if (obj)  {
863                 vm_object_hold(obj);
864                 count = vm_pageout_object_deactivate_pages(map, obj, limit,
865                                                    obj_beg, obj_end);
866                 vm_object_drop(obj);
867                 if (pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
868                         pgout_offset = tmpe_end;
869                         goto again;
870                 }
871                 pgout_offset += count << PAGE_SHIFT;
872         } else {
873                 pgout_offset = 0;
874                 if (pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
875                         if (retries && vm_pageout_memuse_mode >= 2) {
876                                 --retries;
877                                 goto again;
878                         }
879                 }
880         }
881
882         map->pgout_offset = pgout_offset;
883
884         vm_map_unlock(map);
885 }
886 #endif
887
888 /*
889  * Called when the pageout scan wants to free a page.  We no longer
890  * try to cycle the vm_object here with a reference & dealloc, which can
891  * cause a non-trivial object collapse in a critical path.
892  *
893  * It is unclear why we cycled the ref_count in the past, perhaps to try
894  * to optimize shadow chain collapses, but I don't quite see why it would
895  * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
896  * synchronously and not have to be kick-started.
897  */
898 static void
899 vm_pageout_page_free(vm_page_t m) 
900 {
901         vm_page_protect(m, VM_PROT_NONE);
902         vm_page_free(m);
903 }
904
905 /*
906  * vm_pageout_scan does the dirty work for the pageout daemon.
907  */
908 struct vm_pageout_scan_info {
909         struct proc *bigproc;
910         vm_offset_t bigsize;
911 };
912
913 static int vm_pageout_scan_callback(struct proc *p, void *data);
914
915 static int
916 vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
917                          int *vnodes_skipped)
918 {
919         vm_page_t m;
920         struct vm_page marker;
921         struct vnode *vpfailed;         /* warning, allowed to be stale */
922         int maxscan;
923         int delta = 0;
924         int max_launder;
925
926         /*
927          * Start scanning the inactive queue for pages we can move to the
928          * cache or free.  The scan will stop when the target is reached or
929          * we have scanned the entire inactive queue.  Note that m->act_count
930          * is not used to form decisions for the inactive queue, only for the
931          * active queue.
932          *
933          * max_launder limits the number of dirty pages we flush per scan.
934          * For most systems a smaller value (16 or 32) is more robust under
935          * extreme memory and disk pressure because any unnecessary writes
936          * to disk can result in extreme performance degradation.  However,
937          * systems with excessive dirty pages (especially when MAP_NOSYNC is
938          * used) will die horribly with limited laundering.  If the pageout
939          * daemon cannot clean enough pages in the first pass, we let it go
940          * all out in succeeding passes.
941          */
942         if ((max_launder = vm_max_launder) <= 1)
943                 max_launder = 1;
944         if (pass)
945                 max_launder = 10000;
946
947         /*
948          * Initialize our marker
949          */
950         bzero(&marker, sizeof(marker));
951         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
952         marker.queue = PQ_INACTIVE + q;
953         marker.pc = q;
954         marker.wire_count = 1;
955
956         /*
957          * Inactive queue scan.
958          *
959          * NOTE: The vm_page must be spinlocked before the queue to avoid
960          *       deadlocks, so it is easiest to simply iterate the loop
961          *       with the queue unlocked at the top.
962          */
963         vpfailed = NULL;
964
965         vm_page_queues_spin_lock(PQ_INACTIVE + q);
966         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
967         maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;
968
969         /*
970          * Queue locked at top of loop to avoid stack marker issues.
971          */
972         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
973                maxscan-- > 0 && avail_shortage - delta > 0)
974         {
975                 int count;
976
977                 KKASSERT(m->queue == PQ_INACTIVE + q);
978                 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
979                              &marker, pageq);
980                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
981                                    &marker, pageq);
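		/*
		 * The marker now trails (m); if we block below, the next
		 * iteration resumes the scan right after the page we are
		 * about to process.
		 */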
982                 mycpu->gd_cnt.v_pdpages++;
983
984                 /*
985                  * Skip marker pages (atomic against other markers to avoid
986                  * infinite hop-over scans).
987                  */
988                 if (m->flags & PG_MARKER)
989                         continue;
990
991                 /*
992                  * Try to busy the page.  Don't mess with pages which are
993                  * already busy or reorder them in the queue.
994                  */
995                 if (vm_page_busy_try(m, TRUE))
996                         continue;
997
998                 /*
999                  * Remaining operations run with the page busy and neither
1000                  * the page or the queue will be spin-locked.
1001                  */
1002                 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
1003                 KKASSERT(m->queue == PQ_INACTIVE + q);
1004
1005                 count = vm_pageout_page(m, &max_launder, vnodes_skipped,
1006                                         &vpfailed, pass, 0);
1007                 delta += count;
1008
1009                 /*
1010                  * Systems with a ton of memory can wind up with huge
1011                  * deactivation counts.  Because the inactive scan is
1012                  * doing a lot of flushing, the combination can result
1013                  * in excessive paging even in situations where other
1014                  * unrelated threads free up sufficient VM.
1015                  *
1016                  * To deal with this we abort the nominal active->inactive
1017                  * scan before we hit the inactive target when free+cache
1018                  * levels have reached a reasonable target.
1019                  *
1020                  * When deciding to stop early we need to add some slop to
1021                  * the test and we need to return full completion to the caller
1022                  * to prevent the caller from thinking there is something
1023                  * wrong and issuing a low-memory+swap warning or pkill.
1024                  *
1025                  * A deficit forces paging regardless of the state of the
1026                  * VM page queues (used for RSS enforcement).
1027                  */
1028                 lwkt_yield();
1029                 vm_page_queues_spin_lock(PQ_INACTIVE + q);
1030                 if (vm_paging_target() < -vm_max_launder) {
1031                         /*
1032                          * Stopping early, return full completion to caller.
1033                          */
1034                         if (delta < avail_shortage)
1035                                 delta = avail_shortage;
1036                         break;
1037                 }
1038         }
1039
1040         /* page queue still spin-locked */
1041         TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
1042         vm_page_queues_spin_unlock(PQ_INACTIVE + q);
1043
1044         return (delta);
1045 }
1046
1047 /*
1048  * Pageout the specified page, return the total number of pages paged out
1049  * (this routine may cluster).
1050  *
1051  * The page must be busied and soft-busied by the caller and will be disposed
1052  * of by this function.
1053  */
1054 static int
1055 vm_pageout_page(vm_page_t m, int *max_launderp, int *vnodes_skippedp,
1056                 struct vnode **vpfailedp, int pass, int vmflush_flags)
1057 {
1058         vm_object_t object;
1059         int actcount;
1060         int count = 0;
1061
1062         /*
1063          * It is possible for a page to be busied ad-hoc (e.g. the
1064          * pmap_collect() code) and wired and race against the
1065          * allocation of a new page.  vm_page_alloc() may be forced
1066          * to deactivate the wired page in which case it winds up
1067          * on the inactive queue and must be handled here.  We
1068          * correct the problem simply by unqueuing the page.
1069          */
1070         if (m->wire_count) {
1071                 vm_page_unqueue_nowakeup(m);
1072                 vm_page_wakeup(m);
1073                 kprintf("WARNING: pagedaemon: wired page on "
1074                         "inactive queue %p\n", m);
1075                 return 0;
1076         }
1077
1078         /*
1079          * A held page may be undergoing I/O, so skip it.
1080          */
1081         if (m->hold_count) {
1082                 vm_page_and_queue_spin_lock(m);
1083                 if (m->queue - m->pc == PQ_INACTIVE) {
1084                         TAILQ_REMOVE(
1085                                 &vm_page_queues[m->queue].pl, m, pageq);
1086                         TAILQ_INSERT_TAIL(
1087                                 &vm_page_queues[m->queue].pl, m, pageq);
1088                         ++vm_swapcache_inactive_heuristic;
1089                 }
1090                 vm_page_and_queue_spin_unlock(m);
1091                 vm_page_wakeup(m);
1092                 return 0;
1093         }
1094
1095         if (m->object == NULL || m->object->ref_count == 0) {
1096                 /*
1097                  * If the object is not being used, we ignore previous
1098                  * references.
1099                  */
1100                 vm_page_flag_clear(m, PG_REFERENCED);
1101                 pmap_clear_reference(m);
1102                 /* fall through to end */
1103         } else if (((m->flags & PG_REFERENCED) == 0) &&
1104                     (actcount = pmap_ts_referenced(m))) {
1105                 /*
1106                  * Otherwise, if the page has been referenced while
1107                  * in the inactive queue, we bump the "activation
1108                  * count" upwards, making it less likely that the
1109                  * page will be added back to the inactive queue
1110                  * prematurely again.  Here we check the page tables
1111                  * (or emulated bits, if any), since the upper level
1112                  * VM system knows nothing about existing
1113                  * references.
1114                  */
1115                 vm_page_activate(m);
1116                 m->act_count += (actcount + ACT_ADVANCE);
1117                 vm_page_wakeup(m);
1118                 return 0;
1119         }
1120
1121         /*
1122          * (m) is still busied.
1123          *
1124          * If the upper level VM system knows about any page
1125          * references, we activate the page.  We also set the
1126                  * "activation count" higher than normal so that we will be less
1127                  * likely to place pages back onto the inactive queue again.
1128          */
1129         if ((m->flags & PG_REFERENCED) != 0) {
1130                 vm_page_flag_clear(m, PG_REFERENCED);
1131                 actcount = pmap_ts_referenced(m);
1132                 vm_page_activate(m);
1133                 m->act_count += (actcount + ACT_ADVANCE + 1);
1134                 vm_page_wakeup(m);
1135                 return 0;
1136         }
1137
1138         /*
1139          * If the upper level VM system doesn't know anything about
1140          * the page being dirty, we have to check for it again.  As
1141          * far as the VM code knows, any partially dirty pages are
1142          * fully dirty.
1143          *
1144          * Pages marked PG_WRITEABLE may be mapped into the user
1145          * address space of a process running on another cpu.  A
1146          * user process (without holding the MP lock) running on
1147          * another cpu may be able to touch the page while we are
1148          * trying to remove it.  vm_page_cache() will handle this
1149          * case for us.
1150          */
1151         if (m->dirty == 0) {
1152                 vm_page_test_dirty(m);
1153         } else {
1154                 vm_page_dirty(m);
1155         }
1156
1157         if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1158                 /*
1159                  * Invalid pages can be easily freed
1160                  */
1161                 vm_pageout_page_free(m);
1162                 mycpu->gd_cnt.v_dfree++;
1163                 ++count;
1164         } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1165                 /*
1166                  * Clean pages can be placed onto the cache queue.
1167                  * This effectively frees them.
1168                  */
1169                 vm_page_cache(m);
1170                 ++count;
1171         } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1172                 /*
1173                  * Dirty pages need to be paged out, but flushing
1174                  * a page is extremely expensive versus freeing
1175                  * a clean page.  Rather than artificially limiting
1176                  * the number of pages we can flush, we instead give
1177                  * dirty pages extra priority on the inactive queue
1178                  * by forcing them to be cycled through the queue
1179                  * twice before being flushed, after which the
1180                  * (now clean) page will cycle through once more
1181                  * before being freed.  This significantly extends
1182                  * the thrash point for a heavily loaded machine.
1183                  */
1184                 vm_page_flag_set(m, PG_WINATCFLS);
1185                 vm_page_and_queue_spin_lock(m);
1186                 if (m->queue - m->pc == PQ_INACTIVE) {
1187                         TAILQ_REMOVE(
1188                                 &vm_page_queues[m->queue].pl, m, pageq);
1189                         TAILQ_INSERT_TAIL(
1190                                 &vm_page_queues[m->queue].pl, m, pageq);
1191                         ++vm_swapcache_inactive_heuristic;
1192                 }
1193                 vm_page_and_queue_spin_unlock(m);
1194                 vm_page_wakeup(m);
1195         } else if (*max_launderp > 0) {
1196                 /*
1197                  * We always want to try to flush some dirty pages if
1198                  * we encounter them, to keep the system stable.
1199                  * Normally this number is small, but under extreme
1200                  * pressure where there are insufficient clean pages
1201                  * on the inactive queue, we may have to go all out.
1202                  */
1203                 int swap_pageouts_ok;
1204                 struct vnode *vp = NULL;
1205
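		/*
		 * Swap-backed objects (OBJT_SWAP/OBJT_DEFAULT) honor the
		 * defer/disable sysctls; when deferred, swap pageouts are
		 * still permitted once free memory becomes critically low.
		 * Other pagers are always eligible.
		 */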
1206                 swap_pageouts_ok = 0;
1207                 object = m->object;
1208                 if (object &&
1209                     (object->type != OBJT_SWAP) &&
1210                     (object->type != OBJT_DEFAULT)) {
1211                         swap_pageouts_ok = 1;
1212                 } else {
1213                         swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
1214                         swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
1215                                              vm_page_count_min(0));
1216                 }
1217
1218                 /*
1219                  * We don't bother paging objects that are "dead".
1220                  * Those objects are in a "rundown" state.
1221                  */
1222                 if (!swap_pageouts_ok ||
1223                     (object == NULL) ||
1224                     (object->flags & OBJ_DEAD)) {
1225                         vm_page_and_queue_spin_lock(m);
1226                         if (m->queue - m->pc == PQ_INACTIVE) {
1227                                 TAILQ_REMOVE(
1228                                     &vm_page_queues[m->queue].pl,
1229                                     m, pageq);
1230                                 TAILQ_INSERT_TAIL(
1231                                     &vm_page_queues[m->queue].pl,
1232                                     m, pageq);
1233                                 ++vm_swapcache_inactive_heuristic;
1234                         }
1235                         vm_page_and_queue_spin_unlock(m);
1236                         vm_page_wakeup(m);
1237                         return 0;
1238                 }
1239
1240                 /*
1241                  * (m) is still busied.
1242                  *
1243                  * The object is already known NOT to be dead.   It
1244                  * is possible for the vget() to block the whole
1245                  * pageout daemon, but the new low-memory handling
1246                  * code should prevent it.
1247                  *
1248                  * The previous code skipped locked vnodes and, worse,
1249                  * reordered pages in the queue.  This results in
1250                  * completely non-deterministic operation because,
1251                  * quite often, a vm_fault has initiated an I/O and
1252                  * is holding a locked vnode at just the point where
1253                  * the pageout daemon is woken up.
1254                  *
1255                  * We can't wait forever for the vnode lock, we might
1256                  * deadlock due to a vn_read() getting stuck in
1257                  * vm_wait while holding this vnode.  We skip the
1258                  * vnode if we can't get it in a reasonable amount
1259                  * of time.
1260                  *
1261                  * vpfailed is used to (try to) avoid the case where
1262                  * a large number of pages are associated with a
1263                  * locked vnode, which could cause the pageout daemon
1264                  * to stall for an excessive amount of time.
1265                  */
1266                 if (object->type == OBJT_VNODE) {
1267                         int flags;
1268
1269                         vp = object->handle;
1270                         flags = LK_EXCLUSIVE;
1271                         if (vp == *vpfailedp)
1272                                 flags |= LK_NOWAIT;
1273                         else
1274                                 flags |= LK_TIMELOCK;
1275                         vm_page_hold(m);
1276                         vm_page_wakeup(m);
1277
1278                         /*
1279                          * We have unbusied (m) temporarily so we can
1280                          * acquire the vp lock without deadlocking.
1281                          * (m) is held to prevent destruction.
1282                          */
1283                         if (vget(vp, flags) != 0) {
1284                                 *vpfailedp = vp;
1285                                 ++pageout_lock_miss;
1286                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1287                                             ++*vnodes_skippedp;
1288                                 vm_page_unhold(m);
1289                                 return 0;
1290                         }
1291
1292                         /*
1293                          * The page might have been moved to another
1294                          * queue during potential blocking in vget()
1295                          * above.  The page might have been freed and
1296                          * reused for another vnode.  The object might
1297                          * have been reused for another vnode.
1298                          */
1299                         if (m->queue - m->pc != PQ_INACTIVE ||
1300                             m->object != object ||
1301                             object->handle != vp) {
1302                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1303                                         ++*vnodes_skippedp;
1304                                 vput(vp);
1305                                 vm_page_unhold(m);
1306                                 return 0;
1307                         }
1308
1309                         /*
1310                          * The page may have been busied during the
1311                          * blocking in vget() above.  We don't move the
1312                          * page back onto the end of the queue;
1313                          * statistics are more correct if we don't.
1314                          */
1315                         if (vm_page_busy_try(m, TRUE)) {
1316                                 vput(vp);
1317                                 vm_page_unhold(m);
1318                                 return 0;
1319                         }
1320                         vm_page_unhold(m);
1321
1322                         /*
1323                          * (m) is busied again
1324                          *
1325                          * We own the busy bit and remove our hold
1326                          * bit.  If the page is still held it
1327                          * might be undergoing I/O, so skip it.
1328                          */
1329                         if (m->hold_count) {
1330                                 vm_page_and_queue_spin_lock(m);
1331                                 if (m->queue - m->pc == PQ_INACTIVE) {
1332                                         TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1333                                         TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1334                                         ++vm_swapcache_inactive_heuristic;
1335                                 }
1336                                 vm_page_and_queue_spin_unlock(m);
1337                                 if (object->flags & OBJ_MIGHTBEDIRTY)
1338                                         ++*vnodes_skippedp;
1339                                 vm_page_wakeup(m);
1340                                 vput(vp);
1341                                 return 0;
1342                         }
1343                         /* (m) is left busied as we fall through */
1344                 }
1345
1346                 /*
1347                  * page is busy and not held here.
1348                  *
1349                  * If a page is dirty, then it is either being washed
1350                  * (but not yet cleaned) or it is still in the
1351                  * laundry.  If it is still in the laundry, then we
1352                  * start the cleaning operation.
1353                  *
1354                  * Decrement the launder budget (*max_launderp) on
1355                  * success to account for the (future) cleaned page.
1356                  * Otherwise we could wind up laundering or cleaning
1357                  * too many pages.
1358                  *
1359                  * NOTE: Cleaning the page here does not cause
1360                  *       force_deficit to be adjusted, because the
1361                  *       page is not being freed or moved to the
1362                  *       cache.
1363                  */
1364                 count = vm_pageout_clean_helper(m, vmflush_flags);
1365                 *max_launderp -= count;
1366
1367                 /*
1368                  * The clean helper consumed the busy state; (m) is no longer accessible here.
1369                  */
1370                 if (vp != NULL)
1371                         vput(vp);
1372         } else {
1373                 vm_page_wakeup(m);
1374         }
1375         return count;
1376 }
1377
1378 static int
1379 vm_pageout_scan_active(int pass, int q,
1380                        int avail_shortage, int inactive_shortage,
1381                        int *recycle_countp)
1382 {
1383         struct vm_page marker;
1384         vm_page_t m;
1385         int actcount;
1386         int delta = 0;
1387         int maxscan;
1388
1389         /*
1390          * We want to move pages from the active queue to the inactive
1391          * queue to get the inactive queue to the inactive target.  If
1392          * we still have a page shortage from above we try to directly free
1393          * clean pages instead of moving them.
1394          *
1395          * If we do still have a shortage we keep track of the number of
1396          * pages we free or cache (recycle_count) as a measure of thrashing
1397          * between the active and inactive queues.
1398          *
1399          * If we were able to completely satisfy the free+cache targets
1400          * from the inactive pool we limit the number of pages we move
1401          * from the active pool to the inactive pool to 2x the pages we
1402          * had removed from the inactive pool (with a minimum of 1/5 the
1403          * inactive target).  If we were not able to completely satisfy
1404          * the free+cache targets we go for the whole target aggressively.
1405          *
1406          * NOTE: Both variables can end up negative.
1407          * NOTE: We are still in a critical section.
1408          */
1409
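        /*
         * Set up a dummy marker page for this scan.  PG_MARKER causes
         * other scans to hop over it, PG_FICTITIOUS | PG_BUSY keep it
         * from being treated as a real page, and the wire_count of 1
         * keeps it from ever being freed.  The marker holds our place
         * in the queue so the queue spinlock can be dropped while we
         * operate on the current page.
         */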
1410         bzero(&marker, sizeof(marker));
1411         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1412         marker.queue = PQ_ACTIVE + q;
1413         marker.pc = q;
1414         marker.wire_count = 1;
1415
1416         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1417         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1418         maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;
1419
1420         /*
1421          * Queue locked at top of loop to avoid stack marker issues.
1422          */
1423         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1424                maxscan-- > 0 && (avail_shortage - delta > 0 ||
1425                                 inactive_shortage > 0))
1426         {
1427                 KKASSERT(m->queue == PQ_ACTIVE + q);
1428                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1429                              &marker, pageq);
1430                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1431                                    &marker, pageq);
1432
1433                 /*
1434                  * Skip marker pages (atomic against other markers to avoid
1435                  * infinite hop-over scans).
1436                  */
1437                 if (m->flags & PG_MARKER)
1438                         continue;
1439
1440                 /*
1441                  * Try to busy the page.  Don't mess with pages which are
1442                  * already busy or reorder them in the queue.
1443                  */
1444                 if (vm_page_busy_try(m, TRUE))
1445                         continue;
1446
1447                 /*
1448                  * Remaining operations run with the page busy and neither
1449                  * the page or the queue will be spin-locked.
1450                  */
1451                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1452                 KKASSERT(m->queue == PQ_ACTIVE + q);
1453
1454                 /*
1455                  * Don't deactivate pages that are held, even if we can
1456                  * busy them.  (XXX why not?)
1457                  */
1458                 if (m->hold_count != 0) {
1459                         vm_page_and_queue_spin_lock(m);
1460                         if (m->queue - m->pc == PQ_ACTIVE) {
1461                                 TAILQ_REMOVE(
1462                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1463                                         m, pageq);
1464                                 TAILQ_INSERT_TAIL(
1465                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1466                                         m, pageq);
1467                         }
1468                         vm_page_and_queue_spin_unlock(m);
1469                         vm_page_wakeup(m);
1470                         goto next;
1471                 }
1472
1473                 /*
1474                  * The count for pagedaemon pages is done after checking the
1475                  * page for eligibility...
1476                  */
1477                 mycpu->gd_cnt.v_pdpages++;
1478
1479                 /*
1480                  * Check to see "how much" the page has been used and clear
1481                  * the tracking access bits.  If the object has no references
1482                  * don't bother paying the expense.
1483                  */
1484                 actcount = 0;
1485                 if (m->object && m->object->ref_count != 0) {
1486                         if (m->flags & PG_REFERENCED)
1487                                 ++actcount;
1488                         actcount += pmap_ts_referenced(m);
1489                         if (actcount) {
1490                                 m->act_count += ACT_ADVANCE + actcount;
1491                                 if (m->act_count > ACT_MAX)
1492                                         m->act_count = ACT_MAX;
1493                         }
1494                 }
1495                 vm_page_flag_clear(m, PG_REFERENCED);
1496
1497                 /*
1498                  * actcount is only valid if the object ref_count is non-zero.
1499                  * If the page does not have an object, actcount will be zero.
1500                  */
1501                 if (actcount && m->object->ref_count != 0) {
1502                         vm_page_and_queue_spin_lock(m);
1503                         if (m->queue - m->pc == PQ_ACTIVE) {
1504                                 TAILQ_REMOVE(
1505                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1506                                         m, pageq);
1507                                 TAILQ_INSERT_TAIL(
1508                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1509                                         m, pageq);
1510                         }
1511                         vm_page_and_queue_spin_unlock(m);
1512                         vm_page_wakeup(m);
1513                 } else {
1514                         switch(m->object->type) {
1515                         case OBJT_DEFAULT:
1516                         case OBJT_SWAP:
1517                                 m->act_count -= min(m->act_count,
1518                                                     vm_anonmem_decline);
1519                                 break;
1520                         default:
1521                                 m->act_count -= min(m->act_count,
1522                                                     vm_filemem_decline);
1523                                 break;
1524                         }
1525                         if (vm_pageout_algorithm ||
1526                             (m->object == NULL) ||
1527                             (m->object && (m->object->ref_count == 0)) ||
1528                             m->act_count < pass + 1
1529                         ) {
1530                                 /*
1531                                  * Deactivate the page.  If we had a
1532                                  * shortage from our inactive scan try to
1533                                  * free (cache) the page instead.
1534                                  *
1535                                  * Don't just blindly cache the page if
1536                                  * we do not have a shortage from the
1537                                  * inactive scan, that could lead to
1538                                  * gigabytes being moved.
1539                                  */
1540                                 --inactive_shortage;
1541                                 if (avail_shortage - delta > 0 ||
1542                                     (m->object && (m->object->ref_count == 0)))
1543                                 {
1544                                         if (avail_shortage - delta > 0)
1545                                                 ++*recycle_countp;
1546                                         vm_page_protect(m, VM_PROT_NONE);
1547                                         if (m->dirty == 0 &&
1548                                             (m->flags & PG_NEED_COMMIT) == 0 &&
1549                                             avail_shortage - delta > 0) {
1550                                                 vm_page_cache(m);
1551                                         } else {
1552                                                 vm_page_deactivate(m);
1553                                                 vm_page_wakeup(m);
1554                                         }
1555                                 } else {
1556                                         vm_page_deactivate(m);
1557                                         vm_page_wakeup(m);
1558                                 }
1559                                 ++delta;
1560                         } else {
1561                                 vm_page_and_queue_spin_lock(m);
1562                                 if (m->queue - m->pc == PQ_ACTIVE) {
1563                                         TAILQ_REMOVE(
1564                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1565                                             m, pageq);
1566                                         TAILQ_INSERT_TAIL(
1567                                             &vm_page_queues[PQ_ACTIVE + q].pl,
1568                                             m, pageq);
1569                                 }
1570                                 vm_page_and_queue_spin_unlock(m);
1571                                 vm_page_wakeup(m);
1572                         }
1573                 }
1574 next:
1575                 lwkt_yield();
1576                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1577         }
1578
1579         /*
1580          * Clean out our local marker.
1581          *
1582          * Page queue still spin-locked.
1583          */
1584         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1585         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1586
1587         return (delta);
1588 }
1589
1590 /*
1591  * The number of actually free pages can drop down to v_free_reserved,
1592  * we try to build the free count back above v_free_min.  Note that
1593  * vm_paging_needed() also returns TRUE if v_free_count is not at
1594  * least v_free_min so that is the minimum we must build the free
1595  * count to.
1596  *
1597  * We use a slightly higher target to improve hysteresis,
1598  * ((v_free_target + v_free_min) / 2).  Since v_free_target
1599  * is usually the same as v_cache_min, this maintains about
1600  * half the pages in the free queue as are in the cache queue,
1601  * providing pretty good pipelining for pageout operation.
1602  *
1603  * The system operator can manipulate vm.v_cache_min and
1604  * vm.v_free_target to tune the pageout daemon.  Be sure
1605  * to keep vm.v_free_min < vm.v_free_target.
1606  *
1607  * Note that the original paging target is to get at least
1608  * (free_min + cache_min) into (free + cache).  The slightly
1609  * higher target will shift additional pages from cache to free
1610  * without affecting the original paging target in order to
1611  * maintain better hysteresis and not have the free count always
1612  * be dead-on v_free_min.
1613  *
1614  * NOTE: we are still in a critical section.
1615  *
1616  * Pages moved from PQ_CACHE to totally free are not counted in the
1617  * pages_freed counter.
1618  */
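/*
 * Illustrative example with hypothetical numbers (not taken from a real
 * configuration): with v_free_min = 1365 pages and v_free_target = 5461
 * pages, the loop below keeps moving pages from PQ_CACHE to the free list
 * until v_free_count reaches (1365 + 5461) / 2 = 3413 pages, roughly half
 * way between the hard minimum and the free+cache target, which is what
 * provides the hysteresis described above.  Raising vm.v_free_target
 * (see above) raises this refill point correspondingly.
 */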
1619 static void
1620 vm_pageout_scan_cache(int avail_shortage, int pass,
1621                       int vnodes_skipped, int recycle_count)
1622 {
1623         static int lastkillticks;
1624         struct vm_pageout_scan_info info;
1625         vm_page_t m;
1626
1627         while (vmstats.v_free_count <
1628                (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1629                 /*
1630                  * This steals some code from vm/vm_page.c
1631                  */
1632                 static int cache_rover = 0;
1633
1634                 m = vm_page_list_find(PQ_CACHE,
1635                                       cache_rover & PQ_L2_MASK, FALSE);
1636                 if (m == NULL)
1637                         break;
1638                 /* page is returned removed from its queue and spinlocked */
1639                 if (vm_page_busy_try(m, TRUE)) {
1640                         vm_page_deactivate_locked(m);
1641                         vm_page_spin_unlock(m);
1642                         continue;
1643                 }
1644                 vm_page_spin_unlock(m);
1645                 pagedaemon_wakeup();
1646                 lwkt_yield();
1647
1648                 /*
1649                  * Remaining operations run with the page busy and neither
1650                  * the page or the queue will be spin-locked.
1651                  */
1652                 if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1653                     m->hold_count ||
1654                     m->wire_count) {
1655                         vm_page_deactivate(m);
1656                         vm_page_wakeup(m);
1657                         continue;
1658                 }
1659                 KKASSERT((m->flags & PG_MAPPED) == 0);
1660                 KKASSERT(m->dirty == 0);
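                /*
                 * Advance the rover by a prime-sized stride (PQ_PRIME2)
                 * so that successive frees are taken from scattered
                 * PQ_CACHE sub-queues rather than hammering one queue.
                 */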
1661                 cache_rover += PQ_PRIME2;
1662                 vm_pageout_page_free(m);
1663                 mycpu->gd_cnt.v_dfree++;
1664         }
1665
1666 #if !defined(NO_SWAPPING)
1667         /*
1668          * Idle process swapout -- run once per second.
1669          */
1670         if (vm_swap_idle_enabled) {
1671                 static time_t lsec;
1672                 if (time_uptime != lsec) {
1673                         atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
1674                         vm_req_vmdaemon();
1675                         lsec = time_uptime;
1676                 }
1677         }
1678 #endif
1679
1680         /*
1681          * If we didn't get enough free pages, and we have skipped a vnode
1682          * in a writeable object, wakeup the sync daemon.  And kick swapout
1683          * if we did not get enough free pages.
1684          */
1685         if (vm_paging_target() > 0) {
1686                 if (vnodes_skipped && vm_page_count_min(0))
1687                         speedup_syncer(NULL);
1688 #if !defined(NO_SWAPPING)
1689                 if (vm_swap_enabled && vm_page_count_target()) {
1690                         atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
1691                         vm_req_vmdaemon();
1692                 }
1693 #endif
1694         }
1695
1696         /*
1697          * Handle catastrophic conditions.  Under good conditions we should
1698          * be at the target, well beyond our minimum.  If we could not even
1699          * reach our minimum the system is under heavy stress.  But just being
1700          * under heavy stress does not trigger process killing.
1701          *
1702          * We consider ourselves to have run out of memory if the swap pager
1703          * is full and avail_shortage is still positive.  The secondary check
1704          * ensures that we do not kill processes if the instantaneous
1705          * availability is good, even if the pageout daemon pass says it
1706          * couldn't get to the target.
1707          */
1708         if (swap_pager_almost_full &&
1709             pass > 0 &&
1710             (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
1711                 kprintf("Warning: system low on memory+swap "
1712                         "shortage %d for %d ticks!\n",
1713                         avail_shortage, ticks - swap_fail_ticks);
1714         }
1715         if (swap_pager_full &&
1716             pass > 1 &&
1717             avail_shortage > 0 &&
1718             vm_paging_target() > 0 &&
1719             (unsigned int)(ticks - lastkillticks) >= hz) {
1720                 /*
1721                  * Kill something, maximum rate once per second to give
1722                  * the process time to free up sufficient memory.
1723                  */
1724                 lastkillticks = ticks;
1725                 info.bigproc = NULL;
1726                 info.bigsize = 0;
1727                 allproc_scan(vm_pageout_scan_callback, &info);
1728                 if (info.bigproc != NULL) {
1729                         info.bigproc->p_nice = PRIO_MIN;
1730                         info.bigproc->p_usched->resetpriority(
1731                                 FIRST_LWP_IN_PROC(info.bigproc));
1732                         atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1733                         killproc(info.bigproc, "out of swap space");
1734                         wakeup(&vmstats.v_free_count);
1735                         PRELE(info.bigproc);
1736                 }
1737         }
1738 }
1739
1740 static int
1741 vm_pageout_scan_callback(struct proc *p, void *data)
1742 {
1743         struct vm_pageout_scan_info *info = data;
1744         vm_offset_t size;
1745
1746         /*
1747          * Never kill system processes or init.  If we have configured swap
1748          * then try to avoid killing low-numbered pids.
1749          */
1750         if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1751             ((p->p_pid < 48) && (vm_swap_size != 0))) {
1752                 return (0);
1753         }
1754
1755         lwkt_gettoken(&p->p_token);
1756
1757         /*
1758          * if the process is in a non-running type state,
1759          * don't touch it.
1760          */
1761         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1762                 lwkt_reltoken(&p->p_token);
1763                 return (0);
1764         }
1765
1766         /*
1767          * Get the approximate process size.  Note that anonymous pages
1768          * with backing swap will be counted twice, but there should not
1769          * be too many such pages due to the stress the VM system is
1770          * under at this point.
1771          */
1772         size = vmspace_anonymous_count(p->p_vmspace) +
1773                 vmspace_swap_count(p->p_vmspace);
1774
1775         /*
1776          * If this process is bigger than the biggest one
1777          * remember it.
1778          */
1779         if (info->bigsize < size) {
1780                 if (info->bigproc)
1781                         PRELE(info->bigproc);
1782                 PHOLD(p);
1783                 info->bigproc = p;
1784                 info->bigsize = size;
1785         }
1786         lwkt_reltoken(&p->p_token);
1787         lwkt_yield();
1788
1789         return(0);
1790 }
1791
1792 /*
1793  * This routine tries to maintain the pseudo-LRU active queue so that
1794  * some statistics accumulation still occurs during long periods when
1795  * there is no paging.  This code helps the situation where paging
1796  * just starts to occur.
1797  */
1798 static void
1799 vm_pageout_page_stats(int q)
1800 {
1801         static int fullintervalcount = 0;
1802         struct vm_page marker;
1803         vm_page_t m;
1804         int pcount, tpcount;            /* Number of pages to check */
1805         int page_shortage;
1806
1807         page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1808                          vmstats.v_free_min) -
1809                         (vmstats.v_free_count + vmstats.v_inactive_count +
1810                          vmstats.v_cache_count);
1811
1812         if (page_shortage <= 0)
1813                 return;
1814
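        /*
         * Decide how much of the active queue to scan this round.  Most
         * rounds only a slice proportional to vm_pageout_stats_max is
         * examined; once fullintervalcount accumulates past
         * vm_pageout_full_stats_interval the entire queue is scanned.
         * Rough example with hypothetical numbers: with
         * vm_pageout_stats_max = 5461, v_page_count = 1048576 and an
         * active queue of 20000 pages, tpcount = 5461 * 20000 / 1048576
         * + 1 = 105, so only ~105 pages would be checked this round.
         */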
1815         pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1816         fullintervalcount += vm_pageout_stats_interval;
1817         if (fullintervalcount < vm_pageout_full_stats_interval) {
1818                 tpcount = (vm_pageout_stats_max * pcount) /
1819                           vmstats.v_page_count + 1;
1820                 if (pcount > tpcount)
1821                         pcount = tpcount;
1822         } else {
1823                 fullintervalcount = 0;
1824         }
1825
1826         bzero(&marker, sizeof(marker));
1827         marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1828         marker.queue = PQ_ACTIVE + q;
1829         marker.pc = q;
1830         marker.wire_count = 1;
1831
1832         vm_page_queues_spin_lock(PQ_ACTIVE + q);
1833         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1834
1835         /*
1836          * Queue locked at top of loop to avoid stack marker issues.
1837          */
1838         while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1839                pcount-- > 0)
1840         {
1841                 int actcount;
1842
1843                 KKASSERT(m->queue == PQ_ACTIVE + q);
1844                 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1845                 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1846                                    &marker, pageq);
1847
1848                 /*
1849                  * Skip marker pages (atomic against other markers to avoid
1850                  * infinite hop-over scans).
1851                  */
1852                 if (m->flags & PG_MARKER)
1853                         continue;
1854
1855                 /*
1856                  * Ignore pages we can't busy
1857                  */
1858                 if (vm_page_busy_try(m, TRUE))
1859                         continue;
1860
1861                 /*
1862                  * Remaining operations run with the page busy and neither
1863                  * the page or the queue will be spin-locked.
1864                  */
1865                 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1866                 KKASSERT(m->queue == PQ_ACTIVE + q);
1867
1868                 /*
1869                  * We now have a safely busied page, the page and queue
1870                  * spinlocks have been released.
1871                  *
1872                  * Ignore held pages
1873                  */
1874                 if (m->hold_count) {
1875                         vm_page_wakeup(m);
1876                         goto next;
1877                 }
1878
1879                 /*
1880                  * Calculate activity
1881                  */
1882                 actcount = 0;
1883                 if (m->flags & PG_REFERENCED) {
1884                         vm_page_flag_clear(m, PG_REFERENCED);
1885                         actcount += 1;
1886                 }
1887                 actcount += pmap_ts_referenced(m);
1888
1889                 /*
1890                  * Update act_count and move page to end of queue.
1891                  */
1892                 if (actcount) {
1893                         m->act_count += ACT_ADVANCE + actcount;
1894                         if (m->act_count > ACT_MAX)
1895                                 m->act_count = ACT_MAX;
1896                         vm_page_and_queue_spin_lock(m);
1897                         if (m->queue - m->pc == PQ_ACTIVE) {
1898                                 TAILQ_REMOVE(
1899                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1900                                         m, pageq);
1901                                 TAILQ_INSERT_TAIL(
1902                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1903                                         m, pageq);
1904                         }
1905                         vm_page_and_queue_spin_unlock(m);
1906                         vm_page_wakeup(m);
1907                         goto next;
1908                 }
1909
1910                 if (m->act_count == 0) {
1911                         /*
1912                          * We turn off page access, so that we have
1913                          * more accurate RSS stats.  We don't do this
1914                          * in the normal page deactivation when the
1915                          * system is under VM load, because the
1916                          * cost of the large number of page protect
1917                          * operations would be higher than the value
1918                          * of doing the operation.
1919                          *
1920                          * We use the marker to save our place so
1921                          * we can release the spin lock.  Both (m)
1922                          * and the page following it may become invalid.
1923                          */
1924                         vm_page_protect(m, VM_PROT_NONE);
1925                         vm_page_deactivate(m);
1926                 } else {
1927                         m->act_count -= min(m->act_count, ACT_DECLINE);
1928                         vm_page_and_queue_spin_lock(m);
1929                         if (m->queue - m->pc == PQ_ACTIVE) {
1930                                 TAILQ_REMOVE(
1931                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1932                                         m, pageq);
1933                                 TAILQ_INSERT_TAIL(
1934                                         &vm_page_queues[PQ_ACTIVE + q].pl,
1935                                         m, pageq);
1936                         }
1937                         vm_page_and_queue_spin_unlock(m);
1938                 }
1939                 vm_page_wakeup(m);
1940 next:
1941                 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1942         }
1943
1944         /*
1945          * Remove our local marker
1946          *
1947          * Page queue still spin-locked.
1948          */
1949         TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1950         vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1951 }
1952
1953 static int
1954 vm_pageout_free_page_calc(vm_size_t count)
1955 {
1956         if (count < vmstats.v_page_count)
1957                 return 0;
1958         /*
1959          * free_reserved needs to include enough for the largest swap pager
1960          * structures plus enough for any pv_entry structs when paging.
1961          *
1962          * v_free_min           normal allocations
1963          * v_free_reserved      system allocations
1964          * v_pageout_free_min   allocations by pageout daemon
1965          * v_interrupt_free_min low level allocations (e.g swap structures)
1966          */
1967         if (vmstats.v_page_count > 1024)
1968                 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
1969         else
1970                 vmstats.v_free_min = 64;
1971         vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
1972         vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
1973         vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
1974         vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
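        /*
         * Rough example with hypothetical numbers: on a machine with
         * 1048576 pages (4GB of 4K pages) the above works out to
         * v_free_min ~ 5301 pages (~20MB), v_free_reserved ~ 2657,
         * v_free_severe ~ 2650, v_pageout_free_min ~ 1332 and
         * v_interrupt_free_min ~ 669 pages.
         */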
1975
1976         return 1;
1977 }
1978
1979
1980 /*
1981  * vm_pageout is the high level pageout daemon.
1982  *
1983  * No requirements.
1984  */
1985 static void
1986 vm_pageout_thread(void)
1987 {
1988         int pass;
1989         int q;
1990         int q1iterator = 0;
1991         int q2iterator = 0;
1992
1993         /*
1994          * Initialize some paging parameters.
1995          */
1996         curthread->td_flags |= TDF_SYSTHREAD;
1997
1998         vm_pageout_free_page_calc(vmstats.v_page_count);
1999
2000         /*
2001          * v_free_target and v_cache_min control pageout hysteresis.  Note
2002          * that these are more a measure of the VM cache queue hysteresis
2003          * then the VM free queue.  Specifically, v_free_target is the
2004          * than the VM free queue.  Specifically, v_free_target is the
2005          *
2006          * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
2007          * low water mark, while v_free_min is the stop.  v_cache_min must
2008          * be big enough to handle memory needs while the pageout daemon
2009          * is signalled and run to free more pages.
2010          */
2011         if (vmstats.v_free_count > 6144)
2012                 vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved;
2013         else
2014                 vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved;
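        /*
         * Rough example, continuing the hypothetical 1048576-page machine
         * above: v_free_target = 4 * 5301 + 2657 = 23861 pages, i.e. the
         * free+cache high water mark would sit near 93MB.
         */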
2015
2016         /*
2017          * NOTE: With the new buffer cache b_act_count we want the default
2018          *       inactive target to be a percentage of available memory.
2019          *
2020          *       The inactive target essentially determines the minimum
2021          *       number of 'temporary' pages capable of caching one-time-use
2022          *       files when the VM system is otherwise full of pages
2023          *       belonging to multi-time-use files or active program data.
2024          *
2025          * NOTE: The inactive target is aggressively pursued only if the
2026          *       inactive queue becomes too small.  If the inactive queue
2027          *       is large enough to satisfy page movement to free+cache
2028          *       then it is repopulated more slowly from the active queue.
2029          *       This allows a general inactive_target default to be set.
2030          *
2031          *       There is an issue here for processes which sit mostly idle
2032          *       'overnight', such as sshd, tcsh, and X.  Any movement from
2033          *       the active queue will eventually cause such pages to
2034          *       recycle, causing a lot of paging in the morning.  To
2035          *       reduce the incidence of this, pages cycled out of the
2036          *       buffer cache are moved directly to the inactive queue
2037          *       if they were only used once or twice.
2038          *
2039          *       The vfs.vm_cycle_point sysctl can be used to adjust this.
2040          *       Increasing the value (up to 64) increases the number of
2041          *       buffer recyclements which go directly to the inactive queue.
2042          */
2043         if (vmstats.v_free_count > 2048) {
2044                 vmstats.v_cache_min = vmstats.v_free_target;
2045                 vmstats.v_cache_max = 2 * vmstats.v_cache_min;
2046         } else {
2047                 vmstats.v_cache_min = 0;
2048                 vmstats.v_cache_max = 0;
2049         }
2050         vmstats.v_inactive_target = vmstats.v_free_count / 4;
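        /*
         * Continuing the same hypothetical example: v_cache_min would be
         * seeded with the 23861-page free target, v_cache_max with twice
         * that, and v_inactive_target with a quarter of whatever is free
         * at this point (most of memory early in boot).
         */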
2051
2052         /* XXX does not really belong here */
2053         if (vm_page_max_wired == 0)
2054                 vm_page_max_wired = vmstats.v_free_count / 3;
2055
2056         if (vm_pageout_stats_max == 0)
2057                 vm_pageout_stats_max = vmstats.v_free_target;
2058
2059         /*
2060          * Set interval in seconds for stats scan.
2061          */
2062         if (vm_pageout_stats_interval == 0)
2063                 vm_pageout_stats_interval = 5;
2064         if (vm_pageout_full_stats_interval == 0)
2065                 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
2066
2067
2068         /*
2069          * Set maximum free per pass
2070          */
2071         if (vm_pageout_stats_free_max == 0)
2072                 vm_pageout_stats_free_max = 5;
2073
2074         swap_pager_swap_init();
2075         pass = 0;
2076
2077         /*
2078          * The pageout daemon is never done, so loop forever.
2079          */
2080         while (TRUE) {
2081                 int error;
2082                 int avail_shortage;
2083                 int inactive_shortage;
2084                 int vnodes_skipped = 0;
2085                 int recycle_count = 0;
2086                 int tmp;
2087
2088                 /*
2089                  * Wait for an action request.  If we time out, check to
2090                  * see if paging is needed (in case the normal wakeup
2091                  * code raced us).
2092                  */
2093                 if (vm_pages_needed == 0) {
2094                         error = tsleep(&vm_pages_needed,
2095                                        0, "psleep",
2096                                        vm_pageout_stats_interval * hz);
2097                         if (error &&
2098                             vm_paging_needed() == 0 &&
2099                             vm_pages_needed == 0) {
2100                                 for (q = 0; q < PQ_L2_SIZE; ++q)
2101                                         vm_pageout_page_stats(q);
2102                                 continue;
2103                         }
2104                         vm_pages_needed = 1;
2105                 }
2106
2107                 mycpu->gd_cnt.v_pdwakeups++;
2108
2109                 /*
2110                  * Scan for INACTIVE->CLEAN/PAGEOUT
2111                  *
2112                  * This routine tries to avoid thrashing the system with
2113                  * unnecessary activity.
2114                  *
2115                  * Calculate our target for the number of free+cache pages we
2116                  * want to get to.  This is higher than the number that causes
2117                  * allocations to stall (severe) in order to provide hysteresis,
2118                  * and if we don't make it all the way but get to the minimum
2119                  * we're happy.  Goose it a bit if there are multiple requests
2120                  * for memory.
2121                  *
2122                  * Don't reduce avail_shortage inside the loop or the
2123                  * PQAVERAGE() calculation will break.
2124                  *
2125                  * NOTE! deficit is differentiated from avail_shortage as
2126                  *       REQUIRING at least (deficit) pages to be cleaned,
2127                  *       even if the page queues are in good shape.  This
2128                  *       is used primarily for handling per-process
2129                  *       RLIMIT_RSS and may also see small values when
2130                  *       processes block due to low memory.
2131                  */
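                /*
                 * The shortage is apportioned across the PQ_L2_SIZE
                 * inactive sub-queues via PQAVERAGE() in the loop below,
                 * and the loop bails out early once the cumulative delta
                 * covers the shortage, so a small shortage touches only
                 * a few sub-queues.
                 */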
2132                 avail_shortage = vm_paging_target() + vm_pageout_deficit;
2133                 vm_pageout_deficit = 0;
2134
2135                 if (avail_shortage > 0) {
2136                         int delta = 0;
2137
2138                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2139                                 delta += vm_pageout_scan_inactive(
2140                                             pass,
2141                                             (q + q1iterator) & PQ_L2_MASK,
2142                                             PQAVERAGE(avail_shortage),
2143                                             &vnodes_skipped);
2144                                 if (avail_shortage - delta <= 0)
2145                                         break;
2146                         }
2147                         avail_shortage -= delta;
2148                         q1iterator = q + 1;
2149                 }
2150
2151                 /*
2152                  * Figure out how many active pages we must deactivate.  If
2153                  * we were able to reach our target with just the inactive
2154                  * scan above we limit the number of active pages we
2155                  * deactivate to reduce unnecessary work.
2156                  */
2157                 inactive_shortage = vmstats.v_inactive_target -
2158                                     vmstats.v_inactive_count;
2159
2160                 /*
2161                  * If we were unable to free sufficient inactive pages to
2162                  * satisfy the free/cache queue requirements then simply
2163                  * reaching the inactive target may not be good enough.
2164                  * Try to deactivate pages in excess of the target based
2165                  * on the shortfall.
2166                  *
2167                  * However to prevent thrashing the VM system do not
2168                  * deactivate more than an additional 1/10 the inactive
2169                  * target's worth of active pages.
2170                  */
2171                 if (avail_shortage > 0) {
2172                         tmp = avail_shortage * 2;
2173                         if (tmp > vmstats.v_inactive_target / 10)
2174                                 tmp = vmstats.v_inactive_target / 10;
2175                         inactive_shortage += tmp;
2176                 }
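                /*
                 * Example with hypothetical numbers: if we are still 300
                 * pages short and v_inactive_target is 20000, the extra
                 * deactivation is min(300 * 2, 20000 / 10) = 600 pages on
                 * top of the normal inactive target.
                 */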
2177
2178                 /*
2179                  * Only trigger a pmap cleanup on inactive shortage.
2180                  */
2181                 if (inactive_shortage > 0) {
2182                         pmap_collect();
2183                 }
2184
2185                 /*
2186                  * Scan for ACTIVE->INACTIVE
2187                  *
2188                  * Only trigger on inactive shortage.  Triggering on
2189                  * avail_shortage can starve the active queue with
2190                  * unnecessary active->inactive transitions and destroy
2191                  * performance.
2192                  */
2193                 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2194                         int delta = 0;
2195
2196                         for (q = 0; q < PQ_L2_SIZE; ++q) {
2197                                 delta += vm_pageout_scan_active(
2198                                                 pass,
2199                                                 (q + q2iterator) & PQ_L2_MASK,
2200                                                 PQAVERAGE(avail_shortage),
2201                                                 PQAVERAGE(inactive_shortage),
2202                                                 &recycle_count);
2203                                 if (inactive_shortage - delta <= 0 &&
2204                                     avail_shortage - delta <= 0) {
2205                                         break;
2206                                 }
2207                         }
2208                         inactive_shortage -= delta;
2209                         avail_shortage -= delta;
2210                         q2iterator = q + 1;
2211                 }
2212
2213                 /*
2214                  * Scan for CACHE->FREE
2215                  *
2216                  * Finally free enough cache pages to meet our free page
2217                  * requirement and take more drastic measures if we are
2218                  * still in trouble.
2219                  */
2220                 vm_pageout_scan_cache(avail_shortage, pass,
2221                                       vnodes_skipped, recycle_count);
2222
2223                 /*
2224                  * Wait for more work.
2225                  */
2226                 if (avail_shortage > 0) {
2227                         ++pass;
2228                         if (pass < 10 && vm_pages_needed > 1) {
2229                                 /*
2230                                  * Normal operation, additional processes
2231                                  * have already kicked us.  Retry immediately
2232                                  * unless swap space is completely full in
2233                                  * unless swap space is completely full, in
2234                                  */
2235                                 if (swap_pager_full) {
2236                                         tsleep(&vm_pages_needed, 0, "pdelay",
2237                                                 hz / 5);
2238                                 } /* else immediate retry */
2239                         } else if (pass < 10) {
2240                                 /*
2241                                  * Normal operation, fewer processes.  Delay
2242                                  * a bit but allow wakeups.
2243                                  */
2244                                 vm_pages_needed = 0;
2245                                 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2246                                 vm_pages_needed = 1;
2247                         } else if (swap_pager_full == 0) {
2248                                 /*
2249                                  * We've taken too many passes, forced delay.
2250                                  */
2251                                 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2252                         } else {
2253                                 /*
2254                                  * Running out of memory, catastrophic
2255                                  * back-off to one-second intervals.
2256                                  */
2257                                 tsleep(&vm_pages_needed, 0, "pdelay", hz);
2258                         }
2259                 } else if (vm_pages_needed) {
2260                         /*
2261                          * Interlocked wakeup of waiters (non-optional).
2262                          *
2263                          * Similar to vm_page_free_wakeup() in vm_page.c,
2264                          * wake any threads sleeping on v_free_count.
2265                          */
2266                         pass = 0;
2267                         if (!vm_page_count_min(vm_page_free_hysteresis) ||
2268                             !vm_page_count_target()) {
2269                                 vm_pages_needed = 0;
2270                                 wakeup(&vmstats.v_free_count);
2271                         }
2272                 } else {
2273                         pass = 0;
2274                 }
2275         }
2276 }
2277
2278 static struct kproc_desc page_kp = {
2279         "pagedaemon",
2280         vm_pageout_thread,
2281         &pagethread
2282 };
2283 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp);
2284
2285
2286 /*
2287  * Called after allocating a page out of the cache or free queue
2288  * to possibly wake the pagedaemon up to replenish our supply.
2289  *
2290  * We try to generate some hysteresis by waking the pagedaemon up
2291  * when our free+cache pages go below the free_min+cache_min level.
2292  * The pagedaemon tries to get the count back up to at least the
2293  * minimum, and through to the target level if possible.
2294  *
2295  * If the pagedaemon is already active bump vm_pages_needed as a hint
2296  * that there are even more requests pending.
2297  *
2298  * SMP races ok?
2299  * No requirements.
2300  */
2301 void
2302 pagedaemon_wakeup(void)
2303 {
2304         if (vm_paging_needed() && curthread != pagethread) {
2305                 if (vm_pages_needed == 0) {
2306                         vm_pages_needed = 1;    /* SMP race ok */
2307                         wakeup(&vm_pages_needed);
2308                 } else if (vm_page_count_min(0)) {
2309                         ++vm_pages_needed;      /* SMP race ok */
2310                 }
2311         }
2312 }
2313
2314 #if !defined(NO_SWAPPING)
2315
2316 /*
2317  * SMP races ok?
2318  * No requirements.
2319  */
2320 static void
2321 vm_req_vmdaemon(void)
2322 {
2323         static int lastrun = 0;
2324
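        /*
         * Rate-limit wakeups of the vm daemon to roughly once per second.
         * The second comparison below catches the (signed) ticks counter
         * wrapping around, so a wrap cannot suppress wakeups indefinitely.
         */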
2325         if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2326                 wakeup(&vm_daemon_needed);
2327                 lastrun = ticks;
2328         }
2329 }
2330
2331 static int vm_daemon_callback(struct proc *p, void *data __unused);
2332
2333 /*
2334  * No requirements.
2335  */
2336 static void
2337 vm_daemon(void)
2338 {
2339         int req_swapout;
2340
2341         while (TRUE) {
2342                 tsleep(&vm_daemon_needed, 0, "psleep", 0);
2343                 req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);
2344
2345                 /*
2346                  * forced swapouts
2347                  */
2348                 if (req_swapout)
2349                         swapout_procs(vm_pageout_req_swapout);
2350
2351                 /*
2352                  * scan processes which exceed their rlimits or which
2353                  * are swapped out -- deactivate their pages
2354                  */
2355                 allproc_scan(vm_daemon_callback, NULL);
2356         }
2357 }
2358
2359 static int
2360 vm_daemon_callback(struct proc *p, void *data __unused)
2361 {
2362         struct vmspace *vm;
2363         vm_pindex_t limit, size;
2364
2365         /*
2366          * skip system processes and processes which are
2367          * already exiting.
2368          */
2369         lwkt_gettoken(&p->p_token);
2370
2371         if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2372                 lwkt_reltoken(&p->p_token);
2373                 return (0);
2374         }
2375
2376         /*
2377          * if the process is in a non-running type state,
2378          * don't touch it.
2379          */
2380         if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2381                 lwkt_reltoken(&p->p_token);
2382                 return (0);
2383         }
2384
2385         /*
2386          * get a limit
2387          */
2388         limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2389                                 p->p_rlimit[RLIMIT_RSS].rlim_max));
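        /*
         * Illustrative example (hypothetical limit): an RLIMIT_RSS of
         * 64MB with 4K pages becomes a limit of 16384 resident pages
         * after the OFF_TO_IDX() conversion above.
         */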
2390
2391         /*
2392          * let processes that are swapped out really be
2393          * swapped out.  Set the limit to nothing to get as
2394          * many pages out to swap as possible.
2395          */
2396         if (p->p_flags & P_SWAPPEDOUT)
2397                 limit = 0;
2398
2399         vm = p->p_vmspace;
2400         vmspace_hold(vm);
2401         size = pmap_resident_tlnw_count(&vm->vm_pmap);
2402         if (limit >= 0 && size >= limit && vm_pageout_memuse_mode >= 1) {
2403                 vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2404         }
2405         vmspace_drop(vm);
2406
2407         lwkt_reltoken(&p->p_token);
2408
2409         return (0);
2410 }
2411
2412 #endif