2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * Copyright (c) 1991 Regents of the University of California.
35 * All rights reserved.
36 * Copyright (c) 1994 John S. Dyson
37 * All rights reserved.
38 * Copyright (c) 1994 David Greenman
39 * All rights reserved.
41 * This code is derived from software contributed to Berkeley by
42 * The Mach Operating System project at Carnegie-Mellon University.
44 * Redistribution and use in source and binary forms, with or without
45 * modification, are permitted provided that the following conditions
47 * 1. Redistributions of source code must retain the above copyright
48 * notice, this list of conditions and the following disclaimer.
49 * 2. Redistributions in binary form must reproduce the above copyright
50 * notice, this list of conditions and the following disclaimer in the
51 * documentation and/or other materials provided with the distribution.
52 * 3. Neither the name of the University nor the names of its contributors
53 * may be used to endorse or promote products derived from this software
54 * without specific prior written permission.
56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
71 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
72 * All rights reserved.
74 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
86 * Carnegie Mellon requests users of this software to return to
88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
98 * The proverbial page-out daemon, rewritten many times over the decades.
102 #include <sys/param.h>
103 #include <sys/systm.h>
104 #include <sys/kernel.h>
105 #include <sys/proc.h>
106 #include <sys/kthread.h>
107 #include <sys/resourcevar.h>
108 #include <sys/signalvar.h>
109 #include <sys/vnode.h>
110 #include <sys/vmmeter.h>
111 #include <sys/conf.h>
112 #include <sys/sysctl.h>
115 #include <vm/vm_param.h>
116 #include <sys/lock.h>
117 #include <vm/vm_object.h>
118 #include <vm/vm_page.h>
119 #include <vm/vm_map.h>
120 #include <vm/vm_pageout.h>
121 #include <vm/vm_pager.h>
122 #include <vm/swap_pager.h>
123 #include <vm/vm_extern.h>
125 #include <sys/spinlock2.h>
126 #include <vm/vm_page2.h>
129 * System initialization
132 /* the kernel process "vm_pageout"*/
133 static int vm_pageout_page(vm_page_t m, long *max_launderp,
134 long *vnodes_skippedp, struct vnode **vpfailedp,
135 int pass, int vmflush_flags, long *counts);
136 static int vm_pageout_clean_helper (vm_page_t, int);
137 static void vm_pageout_free_page_calc (vm_size_t count);
138 static void vm_pageout_page_free(vm_page_t m) ;
139 __read_frequently struct thread *emergpager;
140 __read_frequently struct thread *pagethread;
141 static int sequence_emerg_pager;
143 #if !defined(NO_SWAPPING)
144 /* the kernel process "vm_daemon"*/
145 static void vm_daemon (void);
146 static struct thread *vmthread;
148 static struct kproc_desc vm_kp = {
153 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
156 __read_mostly int vm_pages_needed = 0; /* pageout daemon tsleep event */
157 __read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */
158 __read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
159 __read_mostly int vm_page_free_hysteresis = 16;
160 __read_mostly static int vm_pagedaemon_time;
162 #if !defined(NO_SWAPPING)
163 static int vm_pageout_req_swapout;
164 static int vm_daemon_needed;
166 __read_mostly static int vm_max_launder = 0;
167 __read_mostly static int vm_emerg_launder = 100;
168 __read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
169 __read_mostly static int vm_pageout_full_stats_interval = 0;
170 __read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
171 __read_mostly static int defer_swap_pageouts=0;
172 __read_mostly static int disable_swap_pageouts=0;
173 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
174 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
175 __read_mostly static int vm_pageout_debug;
177 #if defined(NO_SWAPPING)
178 __read_mostly static int vm_swap_enabled=0;
179 __read_mostly static int vm_swap_idle_enabled=0;
181 __read_mostly static int vm_swap_enabled=1;
182 __read_mostly static int vm_swap_idle_enabled=0;
/* 0-disable, 1-passive, 2-active swp, 3-active swp + single-queue dirty pages */
186 __read_mostly int vm_pageout_memuse_mode=2;
187 __read_mostly int vm_pageout_allow_active=1;
189 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
190 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
192 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
193 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
195 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
196 CTLFLAG_RW, &vm_page_free_hysteresis, 0,
197 "Free more pages than the minimum required");
199 SYSCTL_INT(_vm, OID_AUTO, max_launder,
200 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
201 SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
202 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");
204 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
205 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
207 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
208 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
210 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
211 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
213 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
214 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
215 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
216 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
217 SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
218 CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
219 SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
220 CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");
223 #if defined(NO_SWAPPING)
224 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
225 CTLFLAG_RD, &vm_swap_enabled, 0, "");
226 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
227 CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
229 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
230 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
231 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
232 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
235 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
236 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
238 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
239 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
241 static int pageout_lock_miss;
242 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
243 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
245 int vm_page_max_wired; /* XXX max # of wired pages system-wide */
247 #if !defined(NO_SWAPPING)
248 static void vm_req_vmdaemon (void);
250 static void vm_pageout_page_stats(int q);
252 #define MAXSCAN_DIVIDER 10
255 * Calculate approximately how many pages on each queue to try to
256 * clean. An exact calculation creates an edge condition when the
257 * queues are unbalanced so add significant slop. The queue scans
258 * will stop early when targets are reached and will start where they
259 * left off on the next pass.
261 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all queues.
263 * In particular, storage subsystems have become so fast that paging
264 * activity can become quite frantic. Eventually we will probably need
265 * two paging threads, one for dirty pages and one for clean, to deal
266 * with the bandwidth requirements.
268 * So what we do is calculate a value that can be satisfied nominally by
269 * only having to scan half the queues.
277 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
279 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
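/*
 * A minimal sketch (not the verbatim original) of how the two rounding
 * expressions above presumably combine into the per-queue averaging helper
 * referenced later as PQAVERAGE(): positive shortages round up and negative
 * values round down, so that scanning roughly half of the PQ_L2_SIZE
 * sub-queues can satisfy the total.  The exact signature is an assumption
 * inferred from the call sites.
 */
static __inline long
PQAVERAGE(long n)
{
	long avg;

	if (n >= 0)
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	else
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	return avg;
}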
285 * vm_pageout_clean_helper:
287 * Clean the page and remove it from the laundry. The page must be busied
288 * by the caller and will be disposed of (put away, flushed) by this routine.
291 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
294 vm_page_t mc[BLIST_MAX_ALLOC];
296 int ib, is, page_base;
297 vm_pindex_t pindex = m->pindex;
302 * Don't mess with the page if it's held or special. Theoretically
303 * we can pageout held pages but there is no real need to press our
306 if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
312 * Place page in cluster. Align cluster for optimal swap space
313 * allocation (whether it is swap or not). This is typically ~16-32
314 * pages, which also tends to align the cluster to multiples of the
315 * filesystem block size if backed by a filesystem.
317 page_base = pindex % BLIST_MAX_ALLOC;
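	/*
	 * Worked example (BLIST_MAX_ALLOC value of 32 assumed purely for
	 * illustration): a page at pindex 1000 gives page_base =
	 * 1000 % 32 = 8, so the candidate cluster covers pindex 992..1023
	 * and this page occupies slot 8 of mc[], with the reverse scan
	 * filling slots 7..0 and the forward scan slots 9..31.
	 */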
323 * Scan object for clusterable pages.
325 * We can cluster ONLY if: ->> the page is NOT
326 * clean, wired, busy, held, or mapped into a
327 * buffer, and one of the following:
328 * 1) The page is inactive, or a seldom used
331 * 2) we force the issue.
333 * During heavy mmap/modification loads the pageout
334 * daemon can really fragment the underlying file
 * due to flushing pages out of order and not trying to
 * align the clusters (which leaves sporadic out-of-order
337 * holes). To solve this problem we do the reverse scan
338 * first and attempt to align our cluster, then do a
339 * forward scan if room remains.
341 vm_object_hold(object);
346 p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
348 if (error || p == NULL)
350 if ((p->queue - p->pc) == PQ_CACHE ||
351 (p->flags & PG_UNQUEUED)) {
355 vm_page_test_dirty(p);
356 if (((p->dirty & p->valid) == 0 &&
357 (p->flags & PG_NEED_COMMIT) == 0) ||
358 p->wire_count != 0 || /* may be held by buf cache */
359 p->hold_count != 0) { /* may be undergoing I/O */
363 if (p->queue - p->pc != PQ_INACTIVE) {
364 if (p->queue - p->pc != PQ_ACTIVE ||
365 (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
372 * Try to maintain page groupings in the cluster.
374 if (m->flags & PG_WINATCFLS)
375 vm_page_flag_set(p, PG_WINATCFLS);
377 vm_page_flag_clear(p, PG_WINATCFLS);
378 p->act_count = m->act_count;
385 while (is < BLIST_MAX_ALLOC &&
386 pindex - page_base + is < object->size) {
389 p = vm_page_lookup_busy_try(object, pindex - page_base + is,
391 if (error || p == NULL)
393 if (((p->queue - p->pc) == PQ_CACHE) ||
394 (p->flags & PG_UNQUEUED)) {
398 vm_page_test_dirty(p);
399 if (((p->dirty & p->valid) == 0 &&
400 (p->flags & PG_NEED_COMMIT) == 0) ||
401 p->wire_count != 0 || /* may be held by buf cache */
402 p->hold_count != 0) { /* may be undergoing I/O */
406 if (p->queue - p->pc != PQ_INACTIVE) {
407 if (p->queue - p->pc != PQ_ACTIVE ||
408 (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
415 * Try to maintain page groupings in the cluster.
417 if (m->flags & PG_WINATCFLS)
418 vm_page_flag_set(p, PG_WINATCFLS);
420 vm_page_flag_clear(p, PG_WINATCFLS);
421 p->act_count = m->act_count;
427 vm_object_drop(object);
430 * we allow reads during pageouts...
432 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
436 * vm_pageout_flush() - launder the given pages
 * The given pages are laundered. Note that we set up for the start of
 * I/O (i.e. busy the page), mark it read-only, and bump the object
 * reference count all in here rather than in the parent. If we want
441 * the parent to do more sophisticated things we may have to change
444 * The pages in the array must be busied by the caller and will be
445 * unbusied by this function.
448 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
451 int pageout_status[count];
456 * Initiate I/O. Bump the vm_page_t->busy counter.
458 for (i = 0; i < count; i++) {
459 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
460 ("vm_pageout_flush page %p index %d/%d: partially "
461 "invalid page", mc[i], i, count));
462 vm_page_io_start(mc[i]);
466 * We must make the pages read-only. This will also force the
467 * modified bit in the related pmaps to be cleared. The pager
468 * cannot clear the bit for us since the I/O completion code
469 * typically runs from an interrupt. The act of making the page
470 * read-only handles the case for us.
 * Then we can unbusy the pages; we still hold a reference by virtue
475 for (i = 0; i < count; i++) {
476 if (vmflush_flags & OBJPC_TRY_TO_CACHE)
477 vm_page_protect(mc[i], VM_PROT_NONE);
479 vm_page_protect(mc[i], VM_PROT_READ);
480 vm_page_wakeup(mc[i]);
483 object = mc[0]->object;
484 vm_object_pip_add(object, count);
486 vm_pager_put_pages(object, mc, count,
488 ((object == &kernel_object) ?
492 for (i = 0; i < count; i++) {
493 vm_page_t mt = mc[i];
495 switch (pageout_status[i]) {
504 * Page outside of range of object. Right now we
505 * essentially lose the changes by pretending it
508 vm_page_busy_wait(mt, FALSE, "pgbad");
509 pmap_clear_modify(mt);
516 * A page typically cannot be paged out when we
517 * have run out of swap. We leave the page
518 * marked inactive and will try to page it out
521 * Starvation of the active page list is used to
522 * determine when the system is massively memory
531 * If not PENDing this was a synchronous operation and we
532 * clean up after the I/O. If it is PENDing the mess is
533 * cleaned up asynchronously.
535 * Also nominally act on the caller's wishes if the caller
536 * wants to try to really clean (cache or free) the page.
538 * Also nominally deactivate the page if the system is
541 if (pageout_status[i] != VM_PAGER_PEND) {
542 vm_page_busy_wait(mt, FALSE, "pgouw");
543 vm_page_io_finish(mt);
544 if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
545 vm_page_try_to_cache(mt);
546 } else if (vm_page_count_severe()) {
547 vm_page_deactivate(mt);
552 vm_object_pip_wakeup(object);
558 #if !defined(NO_SWAPPING)
561 * Callback function, page busied for us. We must dispose of the busy
562 * condition. Any related pmap pages may be held but will not be locked.
566 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
573 * Basic tests - There should never be a marker, and we can stop
574 * once the RSS is below the required level.
576 KKASSERT((p->flags & PG_MARKER) == 0);
577 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
582 mycpu->gd_cnt.v_pdpages++;
584 if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
 * Check if the page has been referenced recently. If it has,
593 * activate it and skip.
595 actcount = pmap_ts_referenced(p);
597 vm_page_flag_set(p, PG_REFERENCED);
598 } else if (p->flags & PG_REFERENCED) {
603 if (p->queue - p->pc != PQ_ACTIVE) {
604 vm_page_and_queue_spin_lock(p);
605 if (p->queue - p->pc != PQ_ACTIVE) {
606 vm_page_and_queue_spin_unlock(p);
609 vm_page_and_queue_spin_unlock(p);
612 p->act_count += actcount;
613 if (p->act_count > ACT_MAX)
614 p->act_count = ACT_MAX;
616 vm_page_flag_clear(p, PG_REFERENCED);
622 * Remove the page from this particular pmap. Once we do this, our
623 * pmap scans will not see it again (unless it gets faulted in), so
624 * we must actively dispose of or deal with the page.
626 pmap_remove_specific(info->pmap, p);
629 * If the page is not mapped to another process (i.e. as would be
630 * typical if this were a shared page from a library) then deactivate
631 * the page and clean it in two passes only.
633 * If the page hasn't been referenced since the last check, remove it
634 * from the pmap. If it is no longer mapped, deactivate it
635 * immediately, accelerating the normal decline.
637 * Once the page has been removed from the pmap the RSS code no
638 * longer tracks it so we have to make sure that it is staged for
639 * potential flush action.
643 if ((p->flags & PG_MAPPED) == 0 ||
644 (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
645 if (p->queue - p->pc == PQ_ACTIVE) {
646 vm_page_deactivate(p);
648 if (p->queue - p->pc == PQ_INACTIVE) {
654 * Ok, try to fully clean the page and any nearby pages such that at
655 * least the requested page is freed or moved to the cache queue.
657 * We usually do this synchronously to allow us to get the page into
658 * the CACHE queue quickly, which will prevent memory exhaustion if
659 * a process with a memoryuse limit is running away. However, the
660 * sysadmin may desire to set vm.swap_user_async which relaxes this
661 * and improves write performance.
664 long max_launder = 0x7FFF;
665 long vnodes_skipped = 0;
666 long counts[4] = { 0, 0, 0, 0 };
668 struct vnode *vpfailed = NULL;
672 if (vm_pageout_memuse_mode >= 2) {
673 vmflush_flags = OBJPC_TRY_TO_CACHE |
675 if (swap_user_async == 0)
676 vmflush_flags |= OBJPC_SYNC;
677 vm_page_flag_set(p, PG_WINATCFLS);
679 vm_pageout_page(p, &max_launder,
681 &vpfailed, 1, vmflush_flags,
692 * Must be at end to avoid SMP races.
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
 * which is relatively difficult to do. We try to keep track of where we
702 * left off last time to reduce scan overhead.
704 * Called when vm_pageout_memuse_mode is >= 1.
707 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
709 vm_offset_t pgout_offset;
710 struct pmap_pgscan_info info;
713 pgout_offset = map->pgout_offset;
716 kprintf("%016jx ", pgout_offset);
718 if (pgout_offset < VM_MIN_USER_ADDRESS)
719 pgout_offset = VM_MIN_USER_ADDRESS;
720 if (pgout_offset >= VM_MAX_USER_ADDRESS)
722 info.pmap = vm_map_pmap(map);
724 info.beg_addr = pgout_offset;
725 info.end_addr = VM_MAX_USER_ADDRESS;
726 info.callback = vm_pageout_mdp_callback;
728 info.actioncount = 0;
732 pgout_offset = info.offset;
734 kprintf("%016jx %08lx %08lx\n", pgout_offset,
735 info.cleancount, info.actioncount);
738 if (pgout_offset != VM_MAX_USER_ADDRESS &&
739 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
741 } else if (retries &&
742 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
746 map->pgout_offset = pgout_offset;
751 * Called when the pageout scan wants to free a page. We no longer
752 * try to cycle the vm_object here with a reference & dealloc, which can
753 * cause a non-trivial object collapse in a critical path.
755 * It is unclear why we cycled the ref_count in the past, perhaps to try
756 * to optimize shadow chain collapses but I don't quite see why it would
757 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
761 vm_pageout_page_free(vm_page_t m)
763 vm_page_protect(m, VM_PROT_NONE);
768 * vm_pageout_scan does the dirty work for the pageout daemon.
770 struct vm_pageout_scan_info {
771 struct proc *bigproc;
775 static int vm_pageout_scan_callback(struct proc *p, void *data);
778 * Scan inactive queue
780 * WARNING! Can be called from two pagedaemon threads simultaneously.
783 vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
784 long *vnodes_skipped, long *counts)
787 struct vm_page marker;
788 struct vnode *vpfailed; /* warning, allowed to be stale */
795 isep = (curthread == emergpager);
796 if ((unsigned)pass > 1000)
800 * This routine is called for each of PQ_L2_SIZE inactive queues.
801 * We want the vm_max_launder parameter to apply to the whole
802 * queue (i.e. per-whole-queue pass, not per-sub-queue).
804 * In each successive full-pass when the page target is not met we
805 * allow the per-queue max_launder to increase up to a maximum of
806 * vm_max_launder / 16.
809 max_launder = (long)vm_max_launder * (pass + 1) / PQ_L2_SIZE;
811 max_launder = (long)vm_max_launder / PQ_L2_SIZE;
812 max_launder /= MAXSCAN_DIVIDER;
814 if (max_launder <= 1)
816 if (max_launder >= vm_max_launder / 16)
817 max_launder = vm_max_launder / 16 + 1;
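	/*
	 * Illustrative numbers only (vm_max_launder and the sub-queue
	 * count are assumed for this example): with vm_max_launder = 4096,
	 * MAXSCAN_DIVIDER = 10 and 64 sub-queues, the first pass allows
	 * 4096 / 64 / 10 = 6 dirty flushes per sub-queue, and the
	 * vm_max_launder / 16 + 1 = 257 cap is only reached after many
	 * unsuccessful full passes.
	 */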
820 * Start scanning the inactive queue for pages we can move to the
821 * cache or free. The scan will stop when the target is reached or
822 * we have scanned the entire inactive queue. Note that m->act_count
823 * is not used to form decisions for the inactive queue, only for the
826 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
831 * Initialize our marker
833 bzero(&marker, sizeof(marker));
834 marker.flags = PG_FICTITIOUS | PG_MARKER;
835 marker.busy_count = PBUSY_LOCKED;
836 marker.queue = PQ_INACTIVE + q;
838 marker.wire_count = 1;
841 * Inactive queue scan.
843 * We pick off approximately 1/10 of each queue. Each queue is
844 * effectively organized LRU so scanning the entire queue would
845 * improperly pick up pages that might still be in regular use.
847 * NOTE: The vm_page must be spinlocked before the queue to avoid
848 * deadlocks, so it is easiest to simply iterate the loop
849 * with the queue unlocked at the top.
853 vm_page_queues_spin_lock(PQ_INACTIVE + q);
854 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
855 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;
858 * Queue locked at top of loop to avoid stack marker issues.
860 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
861 maxscan-- > 0 && avail_shortage - delta > 0)
865 KKASSERT(m->queue == PQ_INACTIVE + q);
866 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
868 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
870 mycpu->gd_cnt.v_pdpages++;
873 * Skip marker pages (atomic against other markers to avoid
874 * infinite hop-over scans).
876 if (m->flags & PG_MARKER)
880 * Try to busy the page. Don't mess with pages which are
881 * already busy or reorder them in the queue.
883 if (vm_page_busy_try(m, TRUE))
887 * Remaining operations run with the page busy and neither
888 * the page or the queue will be spin-locked.
890 KKASSERT(m->queue == PQ_INACTIVE + q);
891 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
894 * The emergency pager runs when the primary pager gets
895 * stuck, which typically means the primary pager deadlocked
896 * on a vnode-backed page. Therefore, the emergency pager
897 * must skip any complex objects.
 * We disallow VNODEs unless they are VCHR whose device ops
 * do not flag D_NOEMERGPGR.
902 if (isep && m->object) {
905 switch(m->object->type) {
909 * Allow anonymous memory and assume that
 * swap devices are not complex, since it's
911 * kinda worthless if we can't swap out dirty
917 * Allow VCHR device if the D_NOEMERGPGR
918 * flag is not set, deny other vnode types
919 * as being too complex.
921 vp = m->object->handle;
922 if (vp && vp->v_type == VCHR &&
923 vp->v_rdev && vp->v_rdev->si_ops &&
924 (vp->v_rdev->si_ops->head.flags &
925 D_NOEMERGPGR) == 0) {
928 /* Deny - fall through */
934 vm_page_queues_spin_lock(PQ_INACTIVE + q);
941 * Try to pageout the page and perhaps other nearby pages.
 * We want to get the pages into the cache eventually (first
 * or second pass). Otherwise the pages can wind up
944 * just cycling in the inactive queue, getting flushed over
947 * Generally speaking we recycle dirty pages within PQ_INACTIVE
948 * twice (double LRU) before paging them out. If the
949 * memuse_mode is >= 3 we run them single-LRU like we do clean
952 if (vm_pageout_memuse_mode >= 3)
953 vm_page_flag_set(m, PG_WINATCFLS);
956 if (vm_pageout_allow_active)
957 vmflush_flags |= OBJPC_ALLOW_ACTIVE;
958 if (m->flags & PG_WINATCFLS)
959 vmflush_flags |= OBJPC_TRY_TO_CACHE;
960 count = vm_pageout_page(m, &max_launder, vnodes_skipped,
961 &vpfailed, pass, vmflush_flags, counts);
965 * Systems with a ton of memory can wind up with huge
966 * deactivation counts. Because the inactive scan is
967 * doing a lot of flushing, the combination can result
968 * in excessive paging even in situations where other
969 * unrelated threads free up sufficient VM.
971 * To deal with this we abort the nominal active->inactive
972 * scan before we hit the inactive target when free+cache
973 * levels have reached a reasonable target.
975 * When deciding to stop early we need to add some slop to
976 * the test and we need to return full completion to the caller
977 * to prevent the caller from thinking there is something
978 * wrong and issuing a low-memory+swap warning or pkill.
980 * A deficit forces paging regardless of the state of the
981 * VM page queues (used for RSS enforcement).
984 vm_page_queues_spin_lock(PQ_INACTIVE + q);
985 if (vm_paging_target() < -vm_max_launder) {
987 * Stopping early, return full completion to caller.
989 if (delta < avail_shortage)
990 delta = avail_shortage;
995 /* page queue still spin-locked */
996 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
997 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
1003 * Pageout the specified page, return the total number of pages paged out
1004 * (this routine may cluster).
1006 * The page must be busied and soft-busied by the caller and will be disposed
1007 * of by this function.
1010 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
1011 struct vnode **vpfailedp, int pass, int vmflush_flags,
1019 * Wiring no longer removes a page from its queue. The last unwiring
1020 * will requeue the page. Obviously wired pages cannot be paged out
1021 * so unqueue it and return.
1023 if (m->wire_count) {
1024 vm_page_unqueue_nowakeup(m);
1030 * A held page may be undergoing I/O, so skip it.
1032 if (m->hold_count) {
1033 vm_page_and_queue_spin_lock(m);
1034 if (m->queue - m->pc == PQ_INACTIVE) {
1036 &vm_page_queues[m->queue].pl, m, pageq);
1038 &vm_page_queues[m->queue].pl, m, pageq);
1040 vm_page_and_queue_spin_unlock(m);
1045 if (m->object == NULL || m->object->ref_count == 0) {
1047 * If the object is not being used, we ignore previous
1050 vm_page_flag_clear(m, PG_REFERENCED);
1051 pmap_clear_reference(m);
1052 /* fall through to end */
1053 } else if (((m->flags & PG_REFERENCED) == 0) &&
1054 (actcount = pmap_ts_referenced(m))) {
1056 * Otherwise, if the page has been referenced while
1057 * in the inactive queue, we bump the "activation
1058 * count" upwards, making it less likely that the
1059 * page will be added back to the inactive queue
1060 * prematurely again. Here we check the page tables
1061 * (or emulated bits, if any), given the upper level
1062 * VM system not knowing anything about existing
1066 vm_page_activate(m);
1067 m->act_count += (actcount + ACT_ADVANCE);
1073 * (m) is still busied.
1075 * If the upper level VM system knows about any page
1076 * references, we activate the page. We also set the
1077 * "activation count" higher than normal so that we will less
1078 * likely place pages back onto the inactive queue again.
1080 if ((m->flags & PG_REFERENCED) != 0) {
1081 vm_page_flag_clear(m, PG_REFERENCED);
1082 actcount = pmap_ts_referenced(m);
1083 vm_page_activate(m);
1084 m->act_count += (actcount + ACT_ADVANCE + 1);
1091 * If the upper level VM system doesn't know anything about
1092 * the page being dirty, we have to check for it again. As
1093 * far as the VM code knows, any partially dirty pages are
1096 * Pages marked PG_WRITEABLE may be mapped into the user
1097 * address space of a process running on another cpu. A
1098 * user process (without holding the MP lock) running on
1099 * another cpu may be able to touch the page while we are
1100 * trying to remove it. vm_page_cache() will handle this
1103 if (m->dirty == 0) {
1104 vm_page_test_dirty(m);
1109 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1111 * Invalid pages can be easily freed
1113 vm_pageout_page_free(m);
1114 mycpu->gd_cnt.v_dfree++;
1117 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1119 * Clean pages can be placed onto the cache queue.
1120 * This effectively frees them.
1125 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1127 * Dirty pages need to be paged out, but flushing
 * a page is extremely expensive versus freeing
 * a clean page. Rather than artificially limiting
1130 * the number of pages we can flush, we instead give
1131 * dirty pages extra priority on the inactive queue
1132 * by forcing them to be cycled through the queue
1133 * twice before being flushed, after which the
1134 * (now clean) page will cycle through once more
1135 * before being freed. This significantly extends
1136 * the thrash point for a heavily loaded machine.
1139 vm_page_flag_set(m, PG_WINATCFLS);
1140 vm_page_and_queue_spin_lock(m);
1141 if (m->queue - m->pc == PQ_INACTIVE) {
1143 &vm_page_queues[m->queue].pl, m, pageq);
1145 &vm_page_queues[m->queue].pl, m, pageq);
1147 vm_page_and_queue_spin_unlock(m);
1149 } else if (*max_launderp > 0) {
1151 * We always want to try to flush some dirty pages if
1152 * we encounter them, to keep the system stable.
1153 * Normally this number is small, but under extreme
1154 * pressure where there are insufficient clean pages
1155 * on the inactive queue, we may have to go all out.
1157 int swap_pageouts_ok;
1158 struct vnode *vp = NULL;
1160 if ((m->flags & PG_WINATCFLS) == 0)
1161 vm_page_flag_set(m, PG_WINATCFLS);
1162 swap_pageouts_ok = 0;
1165 (object->type != OBJT_SWAP) &&
1166 (object->type != OBJT_DEFAULT)) {
1167 swap_pageouts_ok = 1;
1169 swap_pageouts_ok = !(defer_swap_pageouts ||
1170 disable_swap_pageouts);
1171 swap_pageouts_ok |= (!disable_swap_pageouts &&
1172 defer_swap_pageouts &&
1173 vm_page_count_min(0));
1177 * We don't bother paging objects that are "dead".
1178 * Those objects are in a "rundown" state.
1180 if (!swap_pageouts_ok ||
1182 (object->flags & OBJ_DEAD)) {
1183 vm_page_and_queue_spin_lock(m);
1184 if (m->queue - m->pc == PQ_INACTIVE) {
1186 &vm_page_queues[m->queue].pl,
1189 &vm_page_queues[m->queue].pl,
1192 vm_page_and_queue_spin_unlock(m);
1198 * (m) is still busied.
1200 * The object is already known NOT to be dead. It
1201 * is possible for the vget() to block the whole
1202 * pageout daemon, but the new low-memory handling
1203 * code should prevent it.
1205 * The previous code skipped locked vnodes and, worse,
1206 * reordered pages in the queue. This results in
1207 * completely non-deterministic operation because,
1208 * quite often, a vm_fault has initiated an I/O and
1209 * is holding a locked vnode at just the point where
1210 * the pageout daemon is woken up.
1212 * We can't wait forever for the vnode lock, we might
1213 * deadlock due to a vn_read() getting stuck in
1214 * vm_wait while holding this vnode. We skip the
1215 * vnode if we can't get it in a reasonable amount
1218 * vpfailed is used to (try to) avoid the case where
1219 * a large number of pages are associated with a
1220 * locked vnode, which could cause the pageout daemon
1221 * to stall for an excessive amount of time.
1223 if (object->type == OBJT_VNODE) {
1226 vp = object->handle;
1227 flags = LK_EXCLUSIVE;
1228 if (vp == *vpfailedp)
1231 flags |= LK_TIMELOCK;
1236 * We have unbusied (m) temporarily so we can
1237 * acquire the vp lock without deadlocking.
1238 * (m) is held to prevent destruction.
1240 if (vget(vp, flags) != 0) {
1242 ++pageout_lock_miss;
1243 if (object->flags & OBJ_MIGHTBEDIRTY)
1250 * The page might have been moved to another
1251 * queue during potential blocking in vget()
1252 * above. The page might have been freed and
1253 * reused for another vnode. The object might
1254 * have been reused for another vnode.
1256 if (m->queue - m->pc != PQ_INACTIVE ||
1257 m->object != object ||
1258 object->handle != vp) {
1259 if (object->flags & OBJ_MIGHTBEDIRTY)
1267 * The page may have been busied during the
 * blocking in vget(); we don't move the
1269 * page back onto the end of the queue so that
1270 * statistics are more correct if we don't.
1272 if (vm_page_busy_try(m, TRUE)) {
1280 * If it was wired while we didn't own it.
1282 if (m->wire_count) {
1283 vm_page_unqueue_nowakeup(m);
1290 * (m) is busied again
1292 * We own the busy bit and remove our hold
1293 * bit. If the page is still held it
1294 * might be undergoing I/O, so skip it.
1296 if (m->hold_count) {
1298 vm_page_and_queue_spin_lock(m);
1299 if (m->queue - m->pc == PQ_INACTIVE) {
1300 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1301 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1303 vm_page_and_queue_spin_unlock(m);
1304 if (object->flags & OBJ_MIGHTBEDIRTY)
1312 * Recheck queue, object, and vp now that we have
1313 * rebusied the page.
1315 if (m->queue - m->pc != PQ_INACTIVE ||
1316 m->object != object ||
1317 object->handle != vp) {
1318 kprintf("vm_pageout_page: "
1319 "rebusy %p failed(A)\n",
1325 * Check page validity
1327 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1328 kprintf("vm_pageout_page: "
1329 "rebusy %p failed(B)\n",
1333 if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1334 kprintf("vm_pageout_page: "
1335 "rebusy %p failed(C)\n",
1340 /* (m) is left busied as we fall through */
1344 * page is busy and not held here.
1346 * If a page is dirty, then it is either being washed
1347 * (but not yet cleaned) or it is still in the
1348 * laundry. If it is still in the laundry, then we
1349 * start the cleaning operation.
1351 * decrement inactive_shortage on success to account
1352 * for the (future) cleaned page. Otherwise we
1353 * could wind up laundering or cleaning too many
1356 * NOTE: Cleaning the page here does not cause
1357 * force_deficit to be adjusted, because the
1358 * page is not being freed or moved to the
1361 count = vm_pageout_clean_helper(m, vmflush_flags);
1363 *max_launderp -= count;
1366 * Clean ate busy, page no longer accessible
1379 * WARNING! Can be called from two pagedaemon threads simultaneously.
1382 vm_pageout_scan_active(int pass, int q,
1383 long avail_shortage, long inactive_shortage,
1384 long *recycle_countp)
1386 struct vm_page marker;
1393 isep = (curthread == emergpager);
1396 * We want to move pages from the active queue to the inactive
1397 * queue to get the inactive queue to the inactive target. If
1398 * we still have a page shortage from above we try to directly free
1399 * clean pages instead of moving them.
1401 * If we do still have a shortage we keep track of the number of
1402 * pages we free or cache (recycle_count) as a measure of thrashing
1403 * between the active and inactive queues.
1405 * If we were able to completely satisfy the free+cache targets
1406 * from the inactive pool we limit the number of pages we move
1407 * from the active pool to the inactive pool to 2x the pages we
1408 * had removed from the inactive pool (with a minimum of 1/5 the
1409 * inactive target). If we were not able to completely satisfy
1410 * the free+cache targets we go for the whole target aggressively.
1412 * NOTE: Both variables can end up negative.
1413 * NOTE: We are still in a critical section.
1415 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
1419 bzero(&marker, sizeof(marker));
1420 marker.flags = PG_FICTITIOUS | PG_MARKER;
1421 marker.busy_count = PBUSY_LOCKED;
1422 marker.queue = PQ_ACTIVE + q;
1424 marker.wire_count = 1;
1426 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1427 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1428 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;
1431 * Queue locked at top of loop to avoid stack marker issues.
1433 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1434 maxscan-- > 0 && (avail_shortage - delta > 0 ||
1435 inactive_shortage > 0))
1437 KKASSERT(m->queue == PQ_ACTIVE + q);
1438 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1440 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1444 * Skip marker pages (atomic against other markers to avoid
1445 * infinite hop-over scans).
1447 if (m->flags & PG_MARKER)
1451 * Try to busy the page. Don't mess with pages which are
1452 * already busy or reorder them in the queue.
1454 if (vm_page_busy_try(m, TRUE))
1458 * Remaining operations run with the page busy and neither
1459 * the page or the queue will be spin-locked.
1461 KKASSERT(m->queue == PQ_ACTIVE + q);
1462 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1466 * Don't deactivate pages that are held, even if we can
1467 * busy them. (XXX why not?)
1469 if (m->hold_count) {
1470 vm_page_and_queue_spin_lock(m);
1471 if (m->queue - m->pc == PQ_ACTIVE) {
1473 &vm_page_queues[PQ_ACTIVE + q].pl,
1476 &vm_page_queues[PQ_ACTIVE + q].pl,
1479 vm_page_and_queue_spin_unlock(m);
1485 * We can just remove wired pages from the queue
1487 if (m->wire_count) {
1488 vm_page_unqueue_nowakeup(m);
1494 * The emergency pager ignores vnode-backed pages as these
1495 * are the pages that probably bricked the main pager.
1497 if (isep && m->object && m->object->type == OBJT_VNODE) {
1498 vm_page_and_queue_spin_lock(m);
1499 if (m->queue - m->pc == PQ_ACTIVE) {
1501 &vm_page_queues[PQ_ACTIVE + q].pl,
1504 &vm_page_queues[PQ_ACTIVE + q].pl,
1507 vm_page_and_queue_spin_unlock(m);
1513 * The count for pagedaemon pages is done after checking the
1514 * page for eligibility...
1516 mycpu->gd_cnt.v_pdpages++;
1519 * Check to see "how much" the page has been used and clear
1520 * the tracking access bits. If the object has no references
1521 * don't bother paying the expense.
1524 if (m->object && m->object->ref_count != 0) {
1525 if (m->flags & PG_REFERENCED)
1527 actcount += pmap_ts_referenced(m);
1529 m->act_count += ACT_ADVANCE + actcount;
1530 if (m->act_count > ACT_MAX)
1531 m->act_count = ACT_MAX;
1534 vm_page_flag_clear(m, PG_REFERENCED);
1537 * actcount is only valid if the object ref_count is non-zero.
1538 * If the page does not have an object, actcount will be zero.
1540 if (actcount && m->object->ref_count != 0) {
1541 vm_page_and_queue_spin_lock(m);
1542 if (m->queue - m->pc == PQ_ACTIVE) {
1544 &vm_page_queues[PQ_ACTIVE + q].pl,
1547 &vm_page_queues[PQ_ACTIVE + q].pl,
1550 vm_page_and_queue_spin_unlock(m);
1553 switch(m->object->type) {
1556 m->act_count -= min(m->act_count,
1557 vm_anonmem_decline);
1560 m->act_count -= min(m->act_count,
1561 vm_filemem_decline);
1564 if (vm_pageout_algorithm ||
1565 (m->object == NULL) ||
1566 (m->object && (m->object->ref_count == 0)) ||
1567 m->act_count < pass + 1
1570 * Deactivate the page. If we had a
1571 * shortage from our inactive scan try to
1572 * free (cache) the page instead.
1574 * Don't just blindly cache the page if
1575 * we do not have a shortage from the
 * inactive scan; that could lead to
1577 * gigabytes being moved.
1579 --inactive_shortage;
1580 if (avail_shortage - delta > 0 ||
1581 (m->object && (m->object->ref_count == 0)))
1583 if (avail_shortage - delta > 0)
1585 vm_page_protect(m, VM_PROT_NONE);
1586 if (m->dirty == 0 &&
1587 (m->flags & PG_NEED_COMMIT) == 0 &&
1588 avail_shortage - delta > 0) {
1591 vm_page_deactivate(m);
1595 vm_page_deactivate(m);
1600 vm_page_and_queue_spin_lock(m);
1601 if (m->queue - m->pc == PQ_ACTIVE) {
1603 &vm_page_queues[PQ_ACTIVE + q].pl,
1606 &vm_page_queues[PQ_ACTIVE + q].pl,
1609 vm_page_and_queue_spin_unlock(m);
1615 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1619 * Clean out our local marker.
1621 * Page queue still spin-locked.
1623 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1624 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1630 * The number of actually free pages can drop down to v_free_reserved,
1631 * we try to build the free count back above v_free_min. Note that
1632 * vm_paging_needed() also returns TRUE if v_free_count is not at
1633 * least v_free_min so that is the minimum we must build the free
1636 * We use a slightly higher target to improve hysteresis,
1637 * ((v_free_target + v_free_min) / 2). Since v_free_target
1638 * is usually the same as v_cache_min this maintains about
1639 * half the pages in the free queue as are in the cache queue,
1640 * providing pretty good pipelining for pageout operation.
1642 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon. Be sure
1644 * to keep vm.v_free_min < vm.v_free_target.
1646 * Note that the original paging target is to get at least
1647 * (free_min + cache_min) into (free + cache). The slightly
1648 * higher target will shift additional pages from cache to free
 * without affecting the original paging target in order to
1650 * maintain better hysteresis and not have the free count always
1651 * be dead-on v_free_min.
1653 * NOTE: we are still in a critical section.
1655 * Pages moved from PQ_CACHE to totally free are not counted in the
1656 * pages_freed counter.
1658 * WARNING! Can be called from two pagedaemon threads simultaneously.
1661 vm_pageout_scan_cache(long avail_shortage, int pass,
1662 long vnodes_skipped, long recycle_count)
1664 static int lastkillticks;
1665 struct vm_pageout_scan_info info;
1669 isep = (curthread == emergpager);
1671 while (vmstats.v_free_count <
1672 (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1674 * This steals some code from vm/vm_page.c
1676 * Create two rovers and adjust the code to reduce
1677 * chances of them winding up at the same index (which
1678 * can cause a lot of contention).
1680 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };
1682 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
1685 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
1689 * page is returned removed from its queue and spinlocked
1691 * If the busy attempt fails we can still deactivate the page.
1693 if (vm_page_busy_try(m, TRUE)) {
1694 vm_page_deactivate_locked(m);
1695 vm_page_spin_unlock(m);
1698 vm_page_spin_unlock(m);
1699 pagedaemon_wakeup();
1703 * Remaining operations run with the page busy and neither
1704 * the page or the queue will be spin-locked.
1706 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
1709 vm_page_deactivate(m);
1715 * Because the page is in the cache, it shouldn't be mapped.
1717 pmap_mapped_sync(m);
1718 KKASSERT((m->flags & PG_MAPPED) == 0);
1719 KKASSERT(m->dirty == 0);
1720 vm_pageout_page_free(m);
1721 mycpu->gd_cnt.v_dfree++;
1724 cache_rover[1] -= PQ_PRIME2;
1726 cache_rover[0] += PQ_PRIME2;
1729 #if !defined(NO_SWAPPING)
1731 * Idle process swapout -- run once per second.
1733 if (vm_swap_idle_enabled) {
1735 if (time_uptime != lsec) {
1736 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
1744 * If we didn't get enough free pages, and we have skipped a vnode
1745 * in a writeable object, wakeup the sync daemon. And kick swapout
1746 * if we did not get enough free pages.
1748 if (vm_paging_target() > 0) {
1749 if (vnodes_skipped && vm_page_count_min(0))
1750 speedup_syncer(NULL);
1751 #if !defined(NO_SWAPPING)
1752 if (vm_swap_enabled && vm_page_count_target()) {
1753 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
1760 * Handle catastrophic conditions. Under good conditions we should
1761 * be at the target, well beyond our minimum. If we could not even
1762 * reach our minimum the system is under heavy stress. But just being
1763 * under heavy stress does not trigger process killing.
1765 * We consider ourselves to have run out of memory if the swap pager
1766 * is full and avail_shortage is still positive. The secondary check
 * ensures that we do not kill processes if the instantaneous
 * availability is good, even if the pageout daemon pass says it
1769 * couldn't get to the target.
1771 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
1774 if (swap_pager_almost_full &&
1777 (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
1778 kprintf("Warning: system low on memory+swap "
1779 "shortage %ld for %d ticks!\n",
1780 avail_shortage, ticks - swap_fail_ticks);
1782 kprintf("Metrics: spaf=%d spf=%d pass=%d "
1783 "avail=%ld target=%ld last=%u\n",
1784 swap_pager_almost_full,
1789 (unsigned int)(ticks - lastkillticks));
1791 if (swap_pager_full &&
1794 avail_shortage > 0 &&
1795 vm_paging_target() > 0 &&
1796 (unsigned int)(ticks - lastkillticks) >= hz) {
1798 * Kill something, maximum rate once per second to give
1799 * the process time to free up sufficient memory.
1801 lastkillticks = ticks;
1802 info.bigproc = NULL;
1804 allproc_scan(vm_pageout_scan_callback, &info, 0);
1805 if (info.bigproc != NULL) {
1806 kprintf("Try to kill process %d %s\n",
1807 info.bigproc->p_pid, info.bigproc->p_comm);
1808 info.bigproc->p_nice = PRIO_MIN;
1809 info.bigproc->p_usched->resetpriority(
1810 FIRST_LWP_IN_PROC(info.bigproc));
1811 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1812 killproc(info.bigproc, "out of swap space");
1813 wakeup(&vmstats.v_free_count);
1814 PRELE(info.bigproc);
1820 vm_pageout_scan_callback(struct proc *p, void *data)
1822 struct vm_pageout_scan_info *info = data;
1826 * Never kill system processes or init. If we have configured swap
1827 * then try to avoid killing low-numbered pids.
1829 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1830 ((p->p_pid < 48) && (vm_swap_size != 0))) {
1834 lwkt_gettoken(&p->p_token);
1837 * if the process is in a non-running type state,
1840 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1841 lwkt_reltoken(&p->p_token);
1846 * Get the approximate process size. Note that anonymous pages
1847 * with backing swap will be counted twice, but there should not
1848 * be too many such pages due to the stress the VM system is
1849 * under at this point.
1851 size = vmspace_anonymous_count(p->p_vmspace) +
1852 vmspace_swap_count(p->p_vmspace);
 * If this process is bigger than the biggest one
1858 if (info->bigsize < size) {
1860 PRELE(info->bigproc);
1863 info->bigsize = size;
1865 lwkt_reltoken(&p->p_token);
1872 * This old guy slowly walks PQ_HOLD looking for pages which need to be
1873 * moved back to PQ_FREE. It is possible for pages to accumulate here
1874 * when vm_page_free() races against vm_page_unhold(), resulting in a
1875 * page being left on a PQ_HOLD queue with hold_count == 0.
1877 * It is easier to handle this edge condition here, in non-critical code,
1878 * rather than enforce a spin-lock for every 1->0 transition in
1881 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
1884 vm_pageout_scan_hold(int q)
1888 vm_page_queues_spin_lock(PQ_HOLD + q);
1889 TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
1890 if (m->flags & PG_MARKER)
1894 * Process one page and return
1898 kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
1900 vm_page_queues_spin_unlock(PQ_HOLD + q);
1901 vm_page_unhold(m); /* reprocess */
1904 vm_page_queues_spin_unlock(PQ_HOLD + q);
1908 * This routine tries to maintain the pseudo LRU active queue,
1909 * so that during long periods of time where there is no paging,
 * some statistic accumulation still occurs. This code
1911 * helps the situation where paging just starts to occur.
1914 vm_pageout_page_stats(int q)
1916 static int fullintervalcount = 0;
1917 struct vm_page marker;
1919 long pcount, tpcount; /* Number of pages to check */
1922 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1923 vmstats.v_free_min) -
1924 (vmstats.v_free_count + vmstats.v_inactive_count +
1925 vmstats.v_cache_count);
1927 if (page_shortage <= 0)
1930 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1931 fullintervalcount += vm_pageout_stats_interval;
1932 if (fullintervalcount < vm_pageout_full_stats_interval) {
1933 tpcount = (vm_pageout_stats_max * pcount) /
1934 vmstats.v_page_count + 1;
1935 if (pcount > tpcount)
1938 fullintervalcount = 0;
1941 bzero(&marker, sizeof(marker));
1942 marker.flags = PG_FICTITIOUS | PG_MARKER;
1943 marker.busy_count = PBUSY_LOCKED;
1944 marker.queue = PQ_ACTIVE + q;
1946 marker.wire_count = 1;
1948 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1949 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1952 * Queue locked at top of loop to avoid stack marker issues.
1954 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1959 KKASSERT(m->queue == PQ_ACTIVE + q);
1960 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1961 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1965 * Skip marker pages (atomic against other markers to avoid
1966 * infinite hop-over scans).
1968 if (m->flags & PG_MARKER)
1972 * Ignore pages we can't busy
1974 if (vm_page_busy_try(m, TRUE))
1978 * Remaining operations run with the page busy and neither
1979 * the page or the queue will be spin-locked.
1981 KKASSERT(m->queue == PQ_ACTIVE + q);
1982 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1985 * We can just remove wired pages from the queue
1987 if (m->wire_count) {
1988 vm_page_unqueue_nowakeup(m);
1995 * We now have a safely busied page, the page and queue
1996 * spinlocks have been released.
1998 * Ignore held and wired pages
2000 if (m->hold_count || m->wire_count) {
2006 * Calculate activity
2009 if (m->flags & PG_REFERENCED) {
2010 vm_page_flag_clear(m, PG_REFERENCED);
2013 actcount += pmap_ts_referenced(m);
2016 * Update act_count and move page to end of queue.
2019 m->act_count += ACT_ADVANCE + actcount;
2020 if (m->act_count > ACT_MAX)
2021 m->act_count = ACT_MAX;
2022 vm_page_and_queue_spin_lock(m);
2023 if (m->queue - m->pc == PQ_ACTIVE) {
2025 &vm_page_queues[PQ_ACTIVE + q].pl,
2028 &vm_page_queues[PQ_ACTIVE + q].pl,
2031 vm_page_and_queue_spin_unlock(m);
2036 if (m->act_count == 0) {
2038 * We turn off page access, so that we have
2039 * more accurate RSS stats. We don't do this
2040 * in the normal page deactivation when the
2041 * system is loaded VM wise, because the
2042 * cost of the large number of page protect
2043 * operations would be higher than the value
2044 * of doing the operation.
2046 * We use the marker to save our place so
 * we can release the spin lock. Both (m)
2048 * and (next) will be invalid.
2050 vm_page_protect(m, VM_PROT_NONE);
2051 vm_page_deactivate(m);
2053 m->act_count -= min(m->act_count, ACT_DECLINE);
2054 vm_page_and_queue_spin_lock(m);
2055 if (m->queue - m->pc == PQ_ACTIVE) {
2057 &vm_page_queues[PQ_ACTIVE + q].pl,
2060 &vm_page_queues[PQ_ACTIVE + q].pl,
2063 vm_page_and_queue_spin_unlock(m);
2067 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2071 * Remove our local marker
2073 * Page queue still spin-locked.
2075 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
2076 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2080 vm_pageout_free_page_calc(vm_size_t count)
2083 * v_free_min normal allocations
2084 * v_free_reserved system allocations
2085 * v_pageout_free_min allocations by pageout daemon
 * v_interrupt_free_min low level allocations (e.g. swap structures)
2088 * v_free_min is used to generate several other baselines, and they
2089 * can get pretty silly on systems with a lot of memory.
2091 vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
2092 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
2093 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
2094 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
2095 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
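	/*
	 * Worked example (machine size assumed purely for illustration):
	 * with about 4GB of RAM, v_page_count is roughly 1048576 pages,
	 * giving v_free_min = 64 + 1048576 / 200 = 5306 pages (~20MB).
	 * From that, v_free_reserved and v_free_severe come to ~2660 and
	 * ~2653 pages, v_pageout_free_min to ~1333 pages, and
	 * v_interrupt_free_min to ~670 pages.
	 */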
2100 * vm_pageout is the high level pageout daemon. TWO kernel threads run
2101 * this daemon, the primary pageout daemon and the emergency pageout daemon.
2103 * The emergency pageout daemon takes over when the primary pageout daemon
2104 * deadlocks. The emergency pageout daemon ONLY pages out to swap, thus
2105 * avoiding the many low-memory deadlocks which can occur when paging out
2109 vm_pageout_thread(void)
2118 curthread->td_flags |= TDF_SYSTHREAD;
 * We only need to set up once.
2124 if (curthread == emergpager) {
2130 * Initialize vm_max_launder per pageout pass to be 1/16
2131 * of total physical memory, plus a little slop.
2133 if (vm_max_launder == 0)
2134 vm_max_launder = physmem / 256 + 16;
2137 * Initialize some paging parameters.
2139 vm_pageout_free_page_calc(vmstats.v_page_count);
2142 * v_free_target and v_cache_min control pageout hysteresis. Note
2143 * that these are more a measure of the VM cache queue hysteresis
 * than the VM free queue. Specifically, v_free_target is the
2145 * high water mark (free+cache pages).
2147 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
2148 * low water mark, while v_free_min is the stop. v_cache_min must
2149 * be big enough to handle memory needs while the pageout daemon
2150 * is signalled and run to free more pages.
2152 vmstats.v_free_target = 4 * vmstats.v_free_min +
2153 vmstats.v_free_reserved;
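	/*
	 * Continuing the hypothetical 4GB example above, v_free_target
	 * comes to roughly 4 * 5306 + 2660 = 23884 pages, i.e. ~93MB of
	 * free+cache pages as the high water mark.
	 */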
2156 * NOTE: With the new buffer cache b_act_count we want the default
2157 * inactive target to be a percentage of available memory.
2159 * The inactive target essentially determines the minimum
2160 * number of 'temporary' pages capable of caching one-time-use
2161 * files when the VM system is otherwise full of pages
2162 * belonging to multi-time-use files or active program data.
 * NOTE: The inactive target is aggressively pursued only if the
2165 * inactive queue becomes too small. If the inactive queue
2166 * is large enough to satisfy page movement to free+cache
2167 * then it is repopulated more slowly from the active queue.
2168 * This allows a general inactive_target default to be set.
2170 * There is an issue here for processes which sit mostly idle
2171 * 'overnight', such as sshd, tcsh, and X. Any movement from
2172 * the active queue will eventually cause such pages to
 * recycle, eventually causing a lot of paging in the morning.
 * To reduce the incidence of this, pages cycled out of the
2175 * buffer cache are moved directly to the inactive queue if
2176 * they were only used once or twice.
2178 * The vfs.vm_cycle_point sysctl can be used to adjust this.
2179 * Increasing the value (up to 64) increases the number of
2180 * buffer recyclements which go directly to the inactive queue.
2182 if (vmstats.v_free_count > 2048) {
2183 vmstats.v_cache_min = vmstats.v_free_target;
2184 vmstats.v_cache_max = 2 * vmstats.v_cache_min;
2186 vmstats.v_cache_min = 0;
2187 vmstats.v_cache_max = 0;
2189 vmstats.v_inactive_target = vmstats.v_free_count / 4;
	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vmstats.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = vmstats.v_free_target;

	/*
	 * Set the interval in seconds for the stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
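	/*
	 * With these defaults a partial page-stats scan runs roughly every
	 * 5 seconds and a full stats scan roughly every 20 seconds.
	 */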
	/*
	 * Set maximum free per pass
	 */
	if (vm_pageout_stats_free_max == 0)
		vm_pageout_stats_free_max = 5;

	swap_pager_swap_init();

	atomic_swap_int(&sequence_emerg_pager, 1);
	wakeup(&sequence_emerg_pager);

	/*
	 * Sequence emergency pager startup
	 */
	while (sequence_emerg_pager == 0)
		tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
	/*
	 * The pageout daemon is never done, so loop forever.
	 *
	 * WARNING! This code is being executed by two kernel threads
	 *	    potentially simultaneously.
	 */
	while (TRUE) {
		long avail_shortage;
		long inactive_shortage;
		long vnodes_skipped = 0;
		long recycle_count = 0;
		/*
		 * Wait for an action request.  If we timeout, check to
		 * see if paging is needed (in case the normal wakeup
		 * was missed).
		 */

		/*
		 * Emergency pagedaemon monitors the primary
		 * pagedaemon while vm_pages_needed != 0.
		 *
		 * The emergency pagedaemon only runs if VM paging
		 * is needed and the primary pagedaemon has not
		 * updated vm_pagedaemon_time for more than 2 seconds.
		 */
		if (vm_pages_needed)
			tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
		else
			tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
		if (vm_pages_needed == 0) {
		if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
		/*
		 * Primary pagedaemon
		 *
		 * NOTE: We unconditionally cleanup PQ_HOLD even
		 *	 when there is no work to do.
		 */
		vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);

		if (vm_pages_needed == 0) {
			error = tsleep(&vm_pages_needed,
				       0, "psleep",
				       vm_pageout_stats_interval * hz);
			if (error &&
			    vm_paging_needed(0) == 0 &&
			    vm_pages_needed == 0) {
				for (q = 0; q < PQ_L2_SIZE; ++q)
					vm_pageout_page_stats(q);
			}
			vm_pagedaemon_time = ticks;
			vm_pages_needed = 1;

			/*
			 * Wake the emergency pagedaemon up so it
			 * can monitor us.  It will automatically
			 * go back into a long sleep when
			 * vm_pages_needed returns to 0.
			 */
			wakeup(&vm_pagedaemon_time);
		}
		mycpu->gd_cnt.v_pdwakeups++;
		/*
		 * Scan for INACTIVE->CLEAN/PAGEOUT
		 *
		 * This routine tries to avoid thrashing the system with
		 * unnecessary activity.
		 *
		 * Calculate our target for the number of free+cache pages we
		 * want to get to.  This is higher than the number that causes
		 * allocations to stall (severe) in order to provide hysteresis,
		 * and if we don't make it all the way but get to the minimum
		 * we're happy.  Goose it a bit if there are multiple requests
		 * for memory.
		 *
		 * Don't reduce avail_shortage inside the loop or the
		 * PQAVERAGE() calculation will break.
		 *
		 * NOTE! deficit is differentiated from avail_shortage as
		 *	 REQUIRING at least (deficit) pages to be cleaned,
		 *	 even if the page queues are in good shape.  This
		 *	 is used primarily for handling per-process
		 *	 RLIMIT_RSS and may also see small values when
		 *	 processes block due to low memory.
		 */
		vm_pagedaemon_time = ticks;
		avail_shortage = vm_paging_target() + vm_pageout_deficit;
		vm_pageout_deficit = 0;

		if (avail_shortage > 0) {
			long counts[4] = { 0, 0, 0, 0 };

			if (vm_pageout_debug) {
				kprintf("scan_inactive pass %d isep=%d\t",
					pass / MAXSCAN_DIVIDER, isep);
			}

			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_inactive(
					    pass / MAXSCAN_DIVIDER,
					    q,
					    PQAVERAGE(avail_shortage),
					    &vnodes_skipped, counts);
				if (avail_shortage - delta <= 0)
					break;

				/*
				 * It is possible for avail_shortage to be
				 * very large.  If a large program exits or
				 * frees a ton of memory all at once, we do
				 * not have to continue deactivations.
				 *
				 * (We will still run the active->inactive
				 *  scan.)
				 */
				if (!vm_page_count_target() &&
				    !vm_page_count_min(
						vm_page_free_hysteresis)) {

			if (vm_pageout_debug) {
				kprintf("flushed %ld cleaned %ld "
					"lru2 %ld react %ld "
					counts[0], counts[1],
					counts[2], counts[3],

			avail_shortage -= delta;
		/*
		 * Figure out how many active pages we must deactivate.  If
		 * we were able to reach our target with just the inactive
		 * scan above we limit the number of active pages we
		 * deactivate to reduce unnecessary work.
		 */
		vm_pagedaemon_time = ticks;
		inactive_shortage = vmstats.v_inactive_target -
				    vmstats.v_inactive_count;

		/*
		 * If we were unable to free sufficient inactive pages to
		 * satisfy the free/cache queue requirements then simply
		 * reaching the inactive target may not be good enough.
		 * Try to deactivate pages in excess of the target based
		 * on the shortfall.
		 *
		 * However, to prevent thrashing the VM system do not
		 * deactivate more than an additional 1/10 of the inactive
		 * target's worth of active pages.
		 */
		if (avail_shortage > 0) {
			tmp = avail_shortage * 2;
			if (tmp > vmstats.v_inactive_target / 10)
				tmp = vmstats.v_inactive_target / 10;
			inactive_shortage += tmp;
		}
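		/*
		 * Illustrative (hypothetical) numbers: with avail_shortage
		 * at 1000 pages and v_inactive_target at 262144 pages, tmp
		 * starts at 2000 and stays below the 26214 page cap, so
		 * inactive_shortage grows by 2000.  A much larger shortfall
		 * would be clipped to the 1/10 cap instead.
		 */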
		/*
		 * Only trigger a pmap cleanup on inactive shortage.
		 */
		if (isep == 0 && inactive_shortage > 0) {

		/*
		 * Scan for ACTIVE->INACTIVE
		 *
		 * Only trigger on inactive shortage.  Triggering on
		 * avail_shortage can starve the active queue with
		 * unnecessary active->inactive transitions and destroy
		 * performance.
		 *
		 * If this is the emergency pager, always try to move
		 * a few pages from active to inactive because the inactive
		 * queue might have enough pages, but not enough anonymous
		 * pages.
		 */
		if (isep && inactive_shortage < vm_emerg_launder)
			inactive_shortage = vm_emerg_launder;

		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_active(
						pass / MAXSCAN_DIVIDER,
						PQAVERAGE(avail_shortage),
						PQAVERAGE(inactive_shortage),

				if (inactive_shortage - delta <= 0 &&
				    avail_shortage - delta <= 0) {
					break;
				}

				/*
				 * inactive_shortage can be a very large
				 * number.  This is intended to break out
				 * early if our inactive_target has been
				 * reached due to other system activity.
				 */
				if (vmstats.v_inactive_count >
				    vmstats.v_inactive_target) {
					inactive_shortage = 0;
					break;
				}
			}
			inactive_shortage -= delta;
			avail_shortage -= delta;
		}
		/*
		 * Scan for CACHE->FREE
		 *
		 * Finally free enough cache pages to meet our free page
		 * requirement and take more drastic measures if we are
		 * still in trouble.
		 */
		vm_pagedaemon_time = ticks;
		vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
				      vnodes_skipped, recycle_count);
		/*
		 * This is a bit sophisticated because we do not necessarily
		 * want to force paging until our targets are reached if we
		 * were able to successfully retire the shortage we calculated.
		 */
		if (avail_shortage > 0) {
			/*
			 * If we did not retire enough pages continue the
			 * pageout operation until we are able to.  It
			 * takes MAXSCAN_DIVIDER passes to cover the entire
			 * inactive queue.
			 */
			if (pass / MAXSCAN_DIVIDER < 10 &&
			    vm_pages_needed > 1) {
				/*
				 * Normal operation, additional processes
				 * have already kicked us.  Retry immediately
				 * unless swap space is completely full in
				 * which case delay a bit.
				 */
				if (swap_pager_full) {
					tsleep(&vm_pages_needed, 0, "pdelay",
				} /* else immediate retry */
			} else if (pass / MAXSCAN_DIVIDER < 10) {
				/*
				 * Do a short sleep for the first 10 passes,
				 * allow the sleep to be woken up by resetting
				 * vm_pages_needed to 1 (NOTE: we are still
				 * in a heavy paging situation).
				 */
				vm_pages_needed = 1;
				tsleep(&vm_pages_needed, 0, "pdelay", 2);
			} else if (swap_pager_full == 0) {
				/*
				 * We've taken too many passes, force a
				 * short delay.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
			} else {
				/*
				 * Running out of memory, catastrophic
				 * back-off to one-second intervals.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz);
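				/*
				 * Taken together these branches form a
				 * back-off ladder: immediate retry while
				 * other processes keep signalling us (with
				 * a delay only if swap is completely full),
				 * a 2 tick nap for the first 10 passes
				 * otherwise, an hz/10 sleep once too many
				 * passes have accumulated, and a one-second
				 * back-off when swap is exhausted as well.
				 */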
		} else if (vm_pages_needed) {
			/*
			 * We retired our calculated shortage but we may have
			 * to continue paging if threads drain memory too far
			 * below our target.
			 *
			 * Similar to vm_page_free_wakeup() in vm_page.c.
			 */
			if (!vm_paging_needed(0)) {
				/* still more than half-way to our target */
				vm_pages_needed = 0;
				wakeup(&vmstats.v_free_count);
			} else
			if (!vm_page_count_min(vm_page_free_hysteresis)) {
				/*
				 * Continue operations with wakeup
				 * (set variable to avoid overflow)
				 */
				vm_pages_needed = 2;
				wakeup(&vmstats.v_free_count);
			} else {
				/*
				 * No wakeup() needed, continue operations.
				 * (set variable to avoid overflow)
				 */
				vm_pages_needed = 2;
			}
		} else {
			/*
			 * Turn paging back on immediately if we are under
			 * the minimum.
			 */
static struct kproc_desc pg1_kp = {
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);
/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 */
void
pagedaemon_wakeup(void)
{
	if (vm_paging_needed(0) && curthread != pagethread) {
		if (vm_pages_needed <= 1) {
			vm_pages_needed = 1;		/* SMP race ok */
			wakeup(&vm_pages_needed);	/* tickle pageout */
		} else if (vm_page_count_min(0)) {
			++vm_pages_needed;		/* SMP race ok */
			/* a wakeup() would be wasted here */
		}
	}
}
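/*
 * Illustrative (hypothetical) usage: an allocation path that has just
 * pulled a page from the free or cache queue simply calls the hook and
 * lets it decide whether the daemon needs a kick, e.g.:
 *
 *	m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
 *	pagedaemon_wakeup();
 *
 * The call is cheap when vm_paging_needed() reports no pressure.
 */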
#if !defined(NO_SWAPPING)

static void
vm_req_vmdaemon(void)
{
	static int lastrun = 0;

	/* rate limit wakeups to ~1/sec; the second test handles tick wrap */
	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);

static int vm_daemon_callback(struct proc *p, void *data __unused);

		tsleep(&vm_daemon_needed, 0, "psleep", 0);
		req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

		swapout_procs(vm_pageout_req_swapout);

		/*
		 * scan the processes for exceeding their rlimits or if
		 * process is swapped out -- deactivate pages
		 */
		allproc_scan(vm_daemon_callback, NULL, 0);
static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
	vm_pindex_t limit, size;

	/*
	 * if this is a system process or if we have already
	 * looked at this process, skip it.
	 */
	lwkt_gettoken(&p->p_token);

	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * if the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
				p->p_rlimit[RLIMIT_RSS].rlim_max));
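	/*
	 * Illustrative (hypothetical) example: with 4KB pages an
	 * RLIMIT_RSS current limit of 64MB becomes
	 * OFF_TO_IDX(67108864) = 16384 pages here.
	 */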
	/*
	 * let processes that are swapped out really be
	 * swapped out.  Set the limit to nothing to get as
	 * many pages out to swap as possible.
	 */
	if (p->p_flags & P_SWAPPEDOUT)
		limit = 0;

	size = pmap_resident_tlnw_count(&vm->vm_pmap);
	if (limit >= 0 && size > 4096 &&
	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
	}

	lwkt_reltoken(&p->p_token);