1/*
2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * Copyright (c) 1991 Regents of the University of California.
35 * All rights reserved.
36 * Copyright (c) 1994 John S. Dyson
37 * All rights reserved.
38 * Copyright (c) 1994 David Greenman
39 * All rights reserved.
40 *
41 * This code is derived from software contributed to Berkeley by
42 * The Mach Operating System project at Carnegie-Mellon University.
43 *
44 * Redistribution and use in source and binary forms, with or without
45 * modification, are permitted provided that the following conditions
46 * are met:
47 * 1. Redistributions of source code must retain the above copyright
48 * notice, this list of conditions and the following disclaimer.
49 * 2. Redistributions in binary form must reproduce the above copyright
50 * notice, this list of conditions and the following disclaimer in the
51 * documentation and/or other materials provided with the distribution.
52 * 3. Neither the name of the University nor the names of its contributors
53 * may be used to endorse or promote products derived from this software
54 * without specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66 * SUCH DAMAGE.
67 *
68 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
69 *
70 *
71 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97/*
98 * The proverbial page-out daemon, rewritten many times over the decades.
99 */
100
101#include "opt_vm.h"
102#include <sys/param.h>
103#include <sys/systm.h>
104#include <sys/kernel.h>
105#include <sys/proc.h>
106#include <sys/kthread.h>
107#include <sys/resourcevar.h>
108#include <sys/signalvar.h>
109#include <sys/vnode.h>
110#include <sys/malloc.h>
111#include <sys/vmmeter.h>
112#include <sys/conf.h>
113#include <sys/sysctl.h>
114
115#include <vm/vm.h>
116#include <vm/vm_param.h>
117#include <sys/lock.h>
118#include <vm/vm_object.h>
119#include <vm/vm_page.h>
120#include <vm/vm_map.h>
121#include <vm/vm_pageout.h>
122#include <vm/vm_pager.h>
123#include <vm/swap_pager.h>
124#include <vm/vm_extern.h>
125
126#include <sys/spinlock2.h>
127#include <vm/vm_page2.h>
128
129/*
130 * Persistent markers held by pageout daemon (array)
131 */
132struct markers {
133 struct vm_page hold;
134 struct vm_page stat;
135 struct vm_page pact;
136};
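/*
 * These markers are persistent so each scan can resume where it left off
 * on the next pass: "hold" anchors the PQ_HOLD scan (vm_pageout_scan_hold)
 * and "pact" anchors the PQ_ACTIVE scan (vm_pageout_scan_active); "stat"
 * presumably anchors the periodic page statistics scan.  One entry per
 * sub-queue is assumed here.
 */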
137
138/*
139 * System initialization
140 */
141
142/* the kernel process "vm_pageout"*/
143static int vm_pageout_page(vm_page_t m, long *max_launderp,
144 long *vnodes_skippedp, struct vnode **vpfailedp,
145 int pass, int vmflush_flags, long *counts);
146static int vm_pageout_clean_helper (vm_page_t, int);
147static void vm_pageout_free_page_calc (vm_size_t count);
148static void vm_pageout_page_free(vm_page_t m);
149__read_frequently struct thread *emergpager;
150__read_frequently struct thread *pagethread;
151static int sequence_emerg_pager;
152
153#if !defined(NO_SWAPPING)
154/* the kernel process "vm_daemon"*/
155static void vm_daemon (void);
156static struct thread *vmthread;
157
158static struct kproc_desc vm_kp = {
159 "vmdaemon",
160 vm_daemon,
161 &vmthread
162};
163SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
164#endif
165
166__read_mostly int vm_pages_needed = 0; /* pageout daemon tsleep event */
167__read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */
168__read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
169__read_mostly int vm_page_free_hysteresis = 16;
170__read_mostly static time_t vm_pagedaemon_uptime;
171
172#if !defined(NO_SWAPPING)
173static int vm_daemon_needed;
174#endif
175__read_mostly static int vm_queue_idle_perc = 20;
176__read_mostly static int vm_max_launder = 0;
177__read_mostly static int vm_emerg_launder = 100;
178__read_mostly static int vm_pageout_stats_actcmp = 0;
179__read_mostly static int vm_pageout_stats_inamin = 16;
180__read_mostly static int vm_pageout_stats_inalim = 4096;
181__read_mostly static int vm_pageout_stats_scan = 0;
182__read_mostly static int vm_pageout_stats_ticks = 0;
183__read_mostly static int vm_pageout_algorithm = 0;
184__read_mostly static int defer_swap_pageouts = 0;
185__read_mostly static int disable_swap_pageouts = 0;
186__read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
187__read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
188__read_mostly static int vm_pageout_debug;
189__read_mostly static long vm_pageout_stats_rsecs = 300;
190
191#if defined(NO_SWAPPING)
192__read_mostly static int vm_swap_enabled=0;
193#else
194__read_mostly static int vm_swap_enabled=1;
195#endif
196
197/* 0-disable, 1-passive, 2-active swp, 3-active swp + single-queue dirty pages */
198__read_mostly int vm_pageout_memuse_mode=2;
199__read_mostly int vm_pageout_allow_active=1;
200
201SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
202 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
203
204SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
205 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
206
207SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
208 CTLFLAG_RW, &vm_page_free_hysteresis, 0,
209 "Free more pages than the minimum required");
210
211SYSCTL_INT(_vm, OID_AUTO, queue_idle_perc,
212 CTLFLAG_RW, &vm_queue_idle_perc, 0, "page stats stop point, percent");
213
214SYSCTL_INT(_vm, OID_AUTO, max_launder,
215 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
216SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
217 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");
218
219SYSCTL_INT(_vm, OID_AUTO, pageout_stats_actcmp,
220 CTLFLAG_RW, &vm_pageout_stats_actcmp, 0,
221 "Current dynamic act_count comparator");
222SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inamin,
223 CTLFLAG_RW, &vm_pageout_stats_inamin, 0,
224 "min out of lim tests must match");
225SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inalim,
226 CTLFLAG_RW, &vm_pageout_stats_inalim, 0,
227 "min out of lim tests must match");
228SYSCTL_INT(_vm, OID_AUTO, pageout_stats_ticks,
229 CTLFLAG_RW, &vm_pageout_stats_ticks, 0,
230 "Interval for partial stats scan");
231SYSCTL_INT(_vm, OID_AUTO, pageout_stats_scan,
232 CTLFLAG_RW, &vm_pageout_stats_scan, 0,
233 "hold/ACT scan count per interval");
234SYSCTL_LONG(_vm, OID_AUTO, pageout_stats_rsecs,
235 CTLFLAG_RW, &vm_pageout_stats_rsecs, 0,
236 "min out of lim tests must match");
237
238SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
239 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
240SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
241 CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
242SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
243 CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");
244
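/*
 * Usage sketch (illustrative values, not recommendations): the knobs above
 * are exported under the "vm" sysctl node, so from userland one might run
 *
 *	sysctl vm.pageout_memuse_mode=2
 *	sysctl vm.max_launder=4096
 *	sysctl vm.pageout_allow_active=1
 *
 * to select active RSS enforcement and bound dirty-page flushing.
 */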
245
246#if defined(NO_SWAPPING)
247SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
248 CTLFLAG_RD, &vm_swap_enabled, 0, "");
249#else
250SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
251 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
252#endif
253
254SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
255 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
256
257SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
258 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
259
260static int pageout_lock_miss;
261SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
262 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
263
264int vm_page_max_wired; /* XXX max # of wired pages system-wide */
265
266static MALLOC_DEFINE(M_PAGEOUT, "pageout", "Pageout structures");
267
268#if !defined(NO_SWAPPING)
269static void vm_req_vmdaemon (void);
270#endif
271
272#define MAXSCAN_DIVIDER 10
273
274#define VM_CACHE_SCAN_MIN 16
275#define VM_CACHE_SCAN_NOM (VM_CACHE_SCAN_MIN * 4)
276
277/*
278 * Calculate approximately how many pages on each queue to try to
279 * clean. An exact calculation creates an edge condition when the
280 * queues are unbalanced so add significant slop. The queue scans
281 * will stop early when targets are reached and will start where they
282 * left off on the next pass.
283 *
284 * We need to be generous here because there are all sorts of loading
285 * conditions that can cause edge cases if we try to average over all queues.
286 * In particular, storage subsystems have become so fast that paging
287 * activity can become quite frantic. Eventually we will probably need
288 * two paging threads, one for dirty pages and one for clean, to deal
289 * with the bandwidth requirements.
290 *
291 * So what we do is calculate a value that can be satisfied nominally by
292 * only having to scan half the queues.
293 */
294static __inline long
295PQAVERAGE(long n)
296{
297 long avg;
298
299 if (n >= 0) {
300 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
301 } else {
302 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
303 }
304 return avg;
305}
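/*
 * Worked example (the PQ_L2_SIZE value is illustrative only): with
 * PQ_L2_SIZE == 32 and a shortage of n == 100 pages,
 *
 *	PQAVERAGE(100) = (100 + 31) / 16 + 1 = 9
 *
 * so roughly half of the 32 sub-queues (16 * 9 = 144 >= 100) suffice to
 * cover the shortage, which is the slop described above.
 */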
306
307/*
308 * vm_pageout_clean_helper:
309 *
310 * Clean the page and remove it from the laundry. The page must be busied
311 * by the caller and will be disposed of (put away, flushed) by this routine.
312 */
313static int
314vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
315{
316 vm_object_t object;
317 vm_page_t mc[BLIST_MAX_ALLOC];
318 int error;
319 int ib, is, page_base;
320 vm_pindex_t pindex = m->pindex;
321
322 object = m->object;
323
324 /*
325 * Don't mess with the page if it's held or special. Theoretically
326 * we can pageout held pages but there is no real need to press our
327 * luck, so don't.
328 */
329 if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
330 vm_page_wakeup(m);
331 return 0;
332 }
333
334 /*
335 * Place page in cluster. Align cluster for optimal swap space
336 * allocation (whether it is swap or not). This is typically ~16-32
337 * pages, which also tends to align the cluster to multiples of the
338 * filesystem block size if backed by a filesystem.
339 */
340 page_base = pindex % BLIST_MAX_ALLOC;
341 mc[page_base] = m;
342 ib = page_base - 1;
343 is = page_base + 1;
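	/*
	 * Alignment example (the BLIST_MAX_ALLOC value is illustrative
	 * only): if BLIST_MAX_ALLOC were 32 and m->pindex were 75, then
	 * page_base = 75 % 32 = 11 and mc[] spans object pindices 64..95,
	 * i.e. the cluster is aligned to a 32-page boundary regardless of
	 * where m itself falls within it.
	 */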
344
345 /*
346 * Scan object for clusterable pages.
347 *
348 * We can cluster ONLY if: ->> the page is NOT
349 * clean, wired, busy, held, or mapped into a
350 * buffer, and one of the following:
351 * 1) The page is inactive, or a seldom used
352 * active page.
353 * -or-
354 * 2) we force the issue.
355 *
356 * During heavy mmap/modification loads the pageout
357 * daemon can really fragment the underlying file
358 * due to flushing pages out of order and not trying to
359 * align the clusters (which leaves sporadic out-of-order
360 * holes). To solve this problem we do the reverse scan
361 * first and attempt to align our cluster, then do a
362 * forward scan if room remains.
363 */
364 vm_object_hold(object);
365
366 while (ib >= 0) {
367 vm_page_t p;
368
369 p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
370 TRUE, &error);
371 if (error || p == NULL)
372 break;
373 if ((p->queue - p->pc) == PQ_CACHE ||
374 (p->flags & PG_UNQUEUED)) {
375 vm_page_wakeup(p);
376 break;
377 }
378 vm_page_test_dirty(p);
379 if (((p->dirty & p->valid) == 0 &&
380 (p->flags & PG_NEED_COMMIT) == 0) ||
381 p->wire_count != 0 || /* may be held by buf cache */
382 p->hold_count != 0) { /* may be undergoing I/O */
383 vm_page_wakeup(p);
384 break;
385 }
386 if (p->queue - p->pc != PQ_INACTIVE) {
387 if (p->queue - p->pc != PQ_ACTIVE ||
388 (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
389 vm_page_wakeup(p);
390 break;
391 }
392 }
393
394 /*
395 * Try to maintain page groupings in the cluster.
396 */
397 if (m->flags & PG_WINATCFLS)
398 vm_page_flag_set(p, PG_WINATCFLS);
399 else
400 vm_page_flag_clear(p, PG_WINATCFLS);
401 p->act_count = m->act_count;
402
403 mc[ib] = p;
404 --ib;
405 }
406 ++ib; /* fixup */
407
408 while (is < BLIST_MAX_ALLOC &&
409 pindex - page_base + is < object->size) {
410 vm_page_t p;
411
412 p = vm_page_lookup_busy_try(object, pindex - page_base + is,
413 TRUE, &error);
414 if (error || p == NULL)
415 break;
416 if (((p->queue - p->pc) == PQ_CACHE) ||
417 (p->flags & PG_UNQUEUED)) {
418 vm_page_wakeup(p);
419 break;
420 }
421 vm_page_test_dirty(p);
422 if (((p->dirty & p->valid) == 0 &&
423 (p->flags & PG_NEED_COMMIT) == 0) ||
424 p->wire_count != 0 || /* may be held by buf cache */
425 p->hold_count != 0) { /* may be undergoing I/O */
426 vm_page_wakeup(p);
427 break;
428 }
429 if (p->queue - p->pc != PQ_INACTIVE) {
430 if (p->queue - p->pc != PQ_ACTIVE ||
431 (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
432 vm_page_wakeup(p);
433 break;
434 }
435 }
436
437 /*
438 * Try to maintain page groupings in the cluster.
439 */
440 if (m->flags & PG_WINATCFLS)
441 vm_page_flag_set(p, PG_WINATCFLS);
442 else
443 vm_page_flag_clear(p, PG_WINATCFLS);
444 p->act_count = m->act_count;
445
446 mc[is] = p;
447 ++is;
448 }
449
450 vm_object_drop(object);
451
452 /*
453 * we allow reads during pageouts...
454 */
455 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
456}
457
458/*
459 * vm_pageout_flush() - launder the given pages
460 *
461 * The given pages are laundered. Note that we set up for the start of
462 * I/O (i.e. busy the page), mark it read-only, and bump the object
463 * reference count all in here rather than in the parent. If we want
464 * the parent to do more sophisticated things we may have to change
465 * the ordering.
466 *
467 * The pages in the array must be busied by the caller and will be
468 * unbusied by this function.
469 */
470int
471vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
472{
473 vm_object_t object;
474 int pageout_status[count];
475 int numpagedout = 0;
476 int i;
477
478 /*
479 * Initiate I/O. Bump the vm_page_t->busy counter.
480 */
481 for (i = 0; i < count; i++) {
482 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
483 ("vm_pageout_flush page %p index %d/%d: partially "
484 "invalid page", mc[i], i, count));
485 vm_page_io_start(mc[i]);
486 }
487
488 /*
489 * We must make the pages read-only. This will also force the
490 * modified bit in the related pmaps to be cleared. The pager
491 * cannot clear the bit for us since the I/O completion code
492 * typically runs from an interrupt. The act of making the page
493 * read-only handles the case for us.
494 *
495 * Then we can unbusy the pages; we still hold a reference by virtue
496 * of our soft-busy.
497 */
498 for (i = 0; i < count; i++) {
499 if (vmflush_flags & OBJPC_TRY_TO_CACHE)
500 vm_page_protect(mc[i], VM_PROT_NONE);
501 else
502 vm_page_protect(mc[i], VM_PROT_READ);
503 vm_page_wakeup(mc[i]);
504 }
505
506 object = mc[0]->object;
507 vm_object_pip_add(object, count);
508
509 vm_pager_put_pages(object, mc, count,
510 (vmflush_flags |
511 ((object == kernel_object) ? OBJPC_SYNC : 0)),
512 pageout_status);
513
514 for (i = 0; i < count; i++) {
515 vm_page_t mt = mc[i];
516
517 switch (pageout_status[i]) {
518 case VM_PAGER_OK:
519 numpagedout++;
520 break;
521 case VM_PAGER_PEND:
522 numpagedout++;
523 break;
524 case VM_PAGER_BAD:
525 /*
526 * Page outside of range of object. Right now we
527 * essentially lose the changes by pretending it
528 * worked.
529 */
530 vm_page_busy_wait(mt, FALSE, "pgbad");
531 pmap_clear_modify(mt);
532 vm_page_undirty(mt);
533 vm_page_wakeup(mt);
534 break;
535 case VM_PAGER_ERROR:
536 case VM_PAGER_FAIL:
537 /*
538 * A page typically cannot be paged out when we
539 * have run out of swap. We leave the page
540 * marked inactive and will try to page it out
541 * again later.
542 *
543 * Starvation of the active page list is used to
544 * determine when the system is massively memory
545 * starved.
546 */
547 break;
548 case VM_PAGER_AGAIN:
549 break;
550 }
551
552 /*
553 * If not PENDing this was a synchronous operation and we
554 * clean up after the I/O. If it is PENDing the mess is
555 * cleaned up asynchronously.
556 *
557 * Also nominally act on the caller's wishes if the caller
558 * wants to try to really clean (cache or free) the page.
559 *
560 * Also nominally deactivate the page if the system is
561 * memory-stressed.
562 */
563 if (pageout_status[i] != VM_PAGER_PEND) {
564 vm_page_busy_wait(mt, FALSE, "pgouw");
565 vm_page_io_finish(mt);
566 if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
567 vm_page_try_to_cache(mt);
568 } else if (vm_paging_severe()) {
569 vm_page_deactivate(mt);
570 vm_page_wakeup(mt);
571 } else {
572 vm_page_wakeup(mt);
573 }
574 vm_object_pip_wakeup(object);
575 }
576 }
577 return numpagedout;
578}
579
580#if !defined(NO_SWAPPING)
581
582/*
583 * Callback function, page busied for us. We must dispose of the busy
584 * condition. Any related pmap pages may be held but will not be locked.
585 */
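/*
 * Return-value convention, as inferred from the use here: pmap_pgscan()
 * calls this for each resident page in the scanned range with the page
 * busied; returning -1 aborts the scan early (the RSS is already below
 * the limit), while returning 0 continues it.
 */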
586static
587int
588vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
589 vm_page_t p)
590{
591 int actcount;
592 int cleanit = 0;
593
594 /*
595 * Basic tests - There should never be a marker, and we can stop
596 * once the RSS is below the required level.
597 */
598 KKASSERT((p->flags & PG_MARKER) == 0);
599 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
600 vm_page_wakeup(p);
601 return(-1);
602 }
603
604 mycpu->gd_cnt.v_pdpages++;
605
606 if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
607 vm_page_wakeup(p);
608 goto done;
609 }
610
611 ++info->actioncount;
612
613 /*
614 * Check if the page has been referenced recently. If it has,
615 * activate it and skip.
616 */
617 actcount = pmap_ts_referenced(p);
618 if (actcount) {
619 vm_page_flag_set(p, PG_REFERENCED);
620 } else if (p->flags & PG_REFERENCED) {
621 actcount = 1;
622 }
623
624 if (actcount) {
625 if (p->queue - p->pc != PQ_ACTIVE) {
626 vm_page_and_queue_spin_lock(p);
627 if (p->queue - p->pc != PQ_ACTIVE) {
628 vm_page_and_queue_spin_unlock(p);
629 vm_page_activate(p);
630 } else {
631 vm_page_and_queue_spin_unlock(p);
632 }
633 } else {
634 p->act_count += actcount;
635 if (p->act_count > ACT_MAX)
636 p->act_count = ACT_MAX;
637 }
638 vm_page_flag_clear(p, PG_REFERENCED);
639 vm_page_wakeup(p);
640 goto done;
641 }
642
643 /*
644 * Remove the page from this particular pmap. Once we do this, our
645 * pmap scans will not see it again (unless it gets faulted in), so
646 * we must actively dispose of or deal with the page.
647 */
648 pmap_remove_specific(info->pmap, p);
649
650 /*
651 * If the page is not mapped to another process (i.e. as would be
652 * typical if this were a shared page from a library) then deactivate
653 * the page and clean it in two passes only.
654 *
655 * If the page hasn't been referenced since the last check, remove it
656 * from the pmap. If it is no longer mapped, deactivate it
657 * immediately, accelerating the normal decline.
658 *
659 * Once the page has been removed from the pmap the RSS code no
660 * longer tracks it so we have to make sure that it is staged for
661 * potential flush action.
662 *
663 * XXX
664 */
665 if ((p->flags & PG_MAPPED) == 0 ||
666 (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
667 if (p->queue - p->pc == PQ_ACTIVE) {
668 vm_page_deactivate(p);
669 }
670 if (p->queue - p->pc == PQ_INACTIVE) {
671 cleanit = 1;
672 }
673 }
674
675 /*
676 * Ok, try to fully clean the page and any nearby pages such that at
677 * least the requested page is freed or moved to the cache queue.
678 *
679 * We usually do this synchronously to allow us to get the page into
680 * the CACHE queue quickly, which will prevent memory exhaustion if
681 * a process with a memoryuse limit is running away. However, the
682 * sysadmin may desire to set vm.swap_user_async which relaxes this
683 * and improves write performance.
684 */
685 if (cleanit) {
686 long max_launder = 0x7FFF;
687 long vnodes_skipped = 0;
688 long counts[4] = { 0, 0, 0, 0 };
689 int vmflush_flags;
690 struct vnode *vpfailed = NULL;
691
692 info->offset = va;
693
694 if (vm_pageout_memuse_mode >= 2) {
695 vmflush_flags = OBJPC_TRY_TO_CACHE |
696 OBJPC_ALLOW_ACTIVE;
697 if (swap_user_async == 0)
698 vmflush_flags |= OBJPC_SYNC;
699 vm_page_flag_set(p, PG_WINATCFLS);
700 info->cleancount +=
701 vm_pageout_page(p, &max_launder,
702 &vnodes_skipped,
703 &vpfailed, 1, vmflush_flags,
704 counts);
705 } else {
706 vm_page_wakeup(p);
707 ++info->cleancount;
708 }
709 } else {
710 vm_page_wakeup(p);
711 }
712
713 /*
714 * Must be at end to avoid SMP races.
715 */
716done:
717 lwkt_user_yield();
718 return 0;
719}
720
721/*
722 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
723 * which is relatively difficult to do. We try to keep track of where we
724 * left off last time to reduce scan overhead.
725 *
726 * Called when vm_pageout_memuse_mode is >= 1.
727 */
728void
729vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
730{
731 vm_offset_t pgout_offset;
732 struct pmap_pgscan_info info;
733 int retries = 3;
734
735 pgout_offset = map->pgout_offset;
736again:
737#if 0
738 kprintf("%016jx ", pgout_offset);
739#endif
740 if (pgout_offset < VM_MIN_USER_ADDRESS)
741 pgout_offset = VM_MIN_USER_ADDRESS;
742 if (pgout_offset >= VM_MAX_USER_ADDRESS)
743 pgout_offset = 0;
744 info.pmap = vm_map_pmap(map);
745 info.limit = limit;
746 info.beg_addr = pgout_offset;
747 info.end_addr = VM_MAX_USER_ADDRESS;
748 info.callback = vm_pageout_mdp_callback;
749 info.cleancount = 0;
750 info.actioncount = 0;
751 info.busycount = 0;
752
753 pmap_pgscan(&info);
754 pgout_offset = info.offset;
755#if 0
756 kprintf("%016jx %08lx %08lx\n", pgout_offset,
757 info.cleancount, info.actioncount);
758#endif
759
760 if (pgout_offset != VM_MAX_USER_ADDRESS &&
761 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
762 goto again;
763 } else if (retries &&
764 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
765 --retries;
766 goto again;
767 }
768 map->pgout_offset = pgout_offset;
769}
770#endif
771
772/*
773 * Called when the pageout scan wants to free a page. We no longer
774 * try to cycle the vm_object here with a reference & dealloc, which can
775 * cause a non-trivial object collapse in a critical path.
776 *
777 * It is unclear why we cycled the ref_count in the past, perhaps to try
778 * to optimize shadow chain collapses but I don't quite see why it would
779 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages
780 * synchronously and not have to be kick-started.
781 */
782static void
783vm_pageout_page_free(vm_page_t m)
784{
785 vm_page_protect(m, VM_PROT_NONE);
786 vm_page_free(m);
787}
788
789/*
790 * vm_pageout_scan does the dirty work for the pageout daemon.
791 */
792struct vm_pageout_scan_info {
793 struct proc *bigproc;
794 vm_offset_t bigsize;
795};
796
797static int vm_pageout_scan_callback(struct proc *p, void *data);
798
799/*
800 * Scan inactive queue for pages we can cache or free.
801 *
802 * WARNING! Can be called from two pagedaemon threads simultaneously.
803 */
804static int
805vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
806 long *vnodes_skipped, long *counts)
807{
808 vm_page_t m;
809 struct vm_page marker;
810 struct vnode *vpfailed; /* warning, allowed to be stale */
811 long maxscan;
812 long delta = 0;
813 long max_launder;
814 int isep;
815 int vmflush_flags;
816
817 isep = (curthread == emergpager);
818
819 /*
820 * This routine is called for each of PQ_L2_SIZE inactive queues.
821 * We want the vm_max_launder parameter to apply to the whole
822 * queue (i.e. per-whole-queue pass, not per-sub-queue).
823 *
824 * In each successive full-pass when the page target is not met we
825 * allow the per-queue max_launder to increase up to a maximum of
826 * vm_max_launder / 16.
827 */
828 max_launder = (long)vm_max_launder / PQ_L2_SIZE;
829 if (pass)
830 max_launder *= 2;
831 max_launder = (max_launder + MAXSCAN_DIVIDER - 1) / MAXSCAN_DIVIDER;
832
833 if (max_launder <= 1)
834 max_launder = 1;
835 if (max_launder >= vm_max_launder / 16)
836 max_launder = vm_max_launder / 16 + 1;
837
838 /*
839 * Start scanning the inactive queue for pages we can move to the
840 * cache or free. The scan will stop when the target is reached or
841 * we have scanned the entire inactive queue. Note that m->act_count
842 * is not used to form decisions for the inactive queue, only for the
843 * active queue.
844 *
845 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
846 * PAGES.
847 */
848
849 /*
850 * Initialize our marker
851 */
852 bzero(&marker, sizeof(marker));
853 marker.flags = PG_FICTITIOUS | PG_MARKER;
854 marker.busy_count = PBUSY_LOCKED;
855 marker.queue = PQ_INACTIVE + q;
856 marker.pc = q;
857 marker.wire_count = 1;
858
859 /*
860 * Inactive queue scan.
861 *
862 * We pick off approximately 1/10 of each queue. Each queue is
863 * effectively organized LRU so scanning the entire queue would
864 * improperly pick up pages that might still be in regular use.
865 *
866 * NOTE: The vm_page must be spinlocked before the queue to avoid
867 * deadlocks, so it is easiest to simply iterate the loop
868 * with the queue unlocked at the top.
869 */
870 vpfailed = NULL;
871
872 vm_page_queues_spin_lock(PQ_INACTIVE + q);
873 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
874 maxscan = (vm_page_queues[PQ_INACTIVE + q].lcnt + MAXSCAN_DIVIDER - 1) /
875 MAXSCAN_DIVIDER + 1;
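	/*
	 * Example of the scan limit (the queue length is illustrative):
	 * with MAXSCAN_DIVIDER == 10 and lcnt == 1000 on this sub-queue,
	 * maxscan = (1000 + 9) / 10 + 1 = 101, i.e. roughly a tenth of
	 * the sub-queue is examined per call, matching the comment above.
	 */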
876
877 /*
878 * Queue locked at top of loop to avoid stack marker issues.
879 */
880 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
881 maxscan-- > 0 && avail_shortage - delta > 0)
882 {
883 int count;
884
885 KKASSERT(m->queue == PQ_INACTIVE + q);
886 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
887 &marker, pageq);
888 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
889 &marker, pageq);
890 mycpu->gd_cnt.v_pdpages++;
891
892 /*
893 * Skip marker pages (atomic against other markers to avoid
894 * infinite hop-over scans).
895 */
896 if (m->flags & PG_MARKER)
897 continue;
898
899 /*
900 * Try to busy the page. Don't mess with pages which are
901 * already busy or reorder them in the queue.
902 */
903 if (vm_page_busy_try(m, TRUE))
904 continue;
905
906 /*
907 * Remaining operations run with the page busy and neither
908 * the page or the queue will be spin-locked.
909 */
910 KKASSERT(m->queue == PQ_INACTIVE + q);
911 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
912
913 /*
914 * The emergency pager runs when the primary pager gets
915 * stuck, which typically means the primary pager deadlocked
916 * on a vnode-backed page. Therefore, the emergency pager
917 * must skip any complex objects.
918 *
919 * We disallow VNODEs unless they are VCHR whose device ops
920 * do not flag D_NOEMERGPGR.
921 */
922 if (isep && m->object) {
923 struct vnode *vp;
924
925 switch(m->object->type) {
926 case OBJT_DEFAULT:
927 case OBJT_SWAP:
928 /*
929 * Allow anonymous memory and assume that
930 * swap devices are not complex, since it's
931 * kinda worthless if we can't swap out dirty
932 * anonymous pages.
933 */
934 break;
935 case OBJT_VNODE:
936 /*
937 * Allow VCHR device if the D_NOEMERGPGR
938 * flag is not set, deny other vnode types
939 * as being too complex.
940 */
941 vp = m->object->handle;
942 if (vp && vp->v_type == VCHR &&
943 vp->v_rdev && vp->v_rdev->si_ops &&
944 (vp->v_rdev->si_ops->head.flags &
945 D_NOEMERGPGR) == 0) {
946 break;
947 }
948 /* Deny - fall through */
949 default:
950 /*
951 * Deny
952 */
953 vm_page_wakeup(m);
954 vm_page_queues_spin_lock(PQ_INACTIVE + q);
955 lwkt_yield();
956 continue;
957 }
958 }
959
960 /*
961 * Try to pageout the page and perhaps other nearby pages.
962 * We want to get the pages into the cache eventually
963 * (first or second pass). Otherwise the pages can wind up
964 * just cycling in the inactive queue, getting flushed over
965 * and over again.
966 *
967 * Generally speaking we recycle dirty pages within PQ_INACTIVE
968 * twice (double LRU) before paging them out. If the
969 * memuse_mode is >= 3 we run them single-LRU like we do clean
970 * pages.
971 */
972 if (vm_pageout_memuse_mode >= 3)
973 vm_page_flag_set(m, PG_WINATCFLS);
974
975 vmflush_flags = 0;
976 if (vm_pageout_allow_active)
977 vmflush_flags |= OBJPC_ALLOW_ACTIVE;
978 if (m->flags & PG_WINATCFLS)
979 vmflush_flags |= OBJPC_TRY_TO_CACHE;
980 count = vm_pageout_page(m, &max_launder, vnodes_skipped,
981 &vpfailed, pass, vmflush_flags, counts);
982 delta += count;
983
984 /*
985 * Systems with a ton of memory can wind up with huge
986 * deactivation counts. Because the inactive scan is
987 * doing a lot of flushing, the combination can result
988 * in excessive paging even in situations where other
989 * unrelated threads free up sufficient VM.
990 *
991 * To deal with this we abort the nominal active->inactive
992 * scan before we hit the inactive target when free+cache
993 * levels have reached a reasonable target.
994 *
995 * When deciding to stop early we need to add some slop to
996 * the test and we need to return full completion to the caller
997 * to prevent the caller from thinking there is something
998 * wrong and issuing a low-memory+swap warning or pkill.
999 *
1000 * A deficit forces paging regardless of the state of the
1001 * VM page queues (used for RSS enforcement).
1002 */
1003 lwkt_yield();
1004 vm_page_queues_spin_lock(PQ_INACTIVE + q);
1005
1006 /* if (vm_paging_target() < -vm_max_launder) */
1007 if (!vm_paging_target2()) {
1008 /*
1009 * Stopping early, return full completion to caller.
1010 */
1011 if (delta < avail_shortage)
1012 delta = avail_shortage;
1013 break;
1014 }
1015 }
1016
1017 /* page queue still spin-locked */
1018 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
1019 vm_page_queues_spin_unlock(PQ_INACTIVE + q);
1020
1021 return (delta);
1022}
1023
1024/*
1025 * Pageout the specified page, return the total number of pages paged out
1026 * (this routine may cluster).
1027 *
1028 * The page must be busied and soft-busied by the caller and will be disposed
1029 * of by this function.
1030 */
1031static int
1032vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
1033 struct vnode **vpfailedp, int pass, int vmflush_flags,
1034 long *counts)
1035{
1036 vm_object_t object;
1037 int actcount;
1038 int count = 0;
1039
1040 /*
1041 * Wiring no longer removes a page from its queue. The last unwiring
1042 * will requeue the page. Obviously wired pages cannot be paged out
1043 * so unqueue it and return.
1044 */
1045 if (m->wire_count) {
1046 vm_page_unqueue_nowakeup(m);
1047 vm_page_wakeup(m);
1048 return 0;
1049 }
1050
1051 /*
1052 * A held page may be undergoing I/O, so skip it.
1053 */
1054 if (m->hold_count) {
1055 vm_page_and_queue_spin_lock(m);
1056 if (m->queue - m->pc == PQ_INACTIVE) {
1057 TAILQ_REMOVE(
1058 &vm_page_queues[m->queue].pl, m, pageq);
1059 TAILQ_INSERT_TAIL(
1060 &vm_page_queues[m->queue].pl, m, pageq);
1061 }
1062 vm_page_and_queue_spin_unlock(m);
1063 vm_page_wakeup(m);
1064 return 0;
1065 }
1066
1067 if (m->object == NULL || m->object->ref_count == 0) {
1068 /*
1069 * If the object is not being used, we ignore previous
1070 * references.
1071 */
1072 vm_page_flag_clear(m, PG_REFERENCED);
1073 pmap_clear_reference(m);
1074 /* fall through to end */
1075 } else if (((m->flags & PG_REFERENCED) == 0) &&
1076 (actcount = pmap_ts_referenced(m))) {
1077 /*
1078 * Otherwise, if the page has been referenced while
1079 * in the inactive queue, we bump the "activation
1080 * count" upwards, making it less likely that the
1081 * page will be added back to the inactive queue
1082 * prematurely again. Here we check the page tables
1083 * (or emulated bits, if any), given the upper level
1084 * VM system not knowing anything about existing
1085 * references.
1086 */
1087 ++counts[3];
1088 vm_page_activate(m);
1089 m->act_count += (actcount + ACT_ADVANCE);
1090 vm_page_wakeup(m);
1091 return 0;
1092 }
1093
1094 /*
1095 * (m) is still busied.
1096 *
1097 * If the upper level VM system knows about any page
1098 * references, we activate the page. We also set the
1099 * "activation count" higher than normal so that we will be less
1100 * likely to place pages back onto the inactive queue again.
1101 */
1102 if ((m->flags & PG_REFERENCED) != 0) {
1103 vm_page_flag_clear(m, PG_REFERENCED);
1104 actcount = pmap_ts_referenced(m);
1105 vm_page_activate(m);
1106 m->act_count += (actcount + ACT_ADVANCE + 1);
1107 vm_page_wakeup(m);
1108 ++counts[3];
1109 return 0;
1110 }
1111
1112 /*
1113 * If the upper level VM system doesn't know anything about
1114 * the page being dirty, we have to check for it again. As
1115 * far as the VM code knows, any partially dirty pages are
1116 * fully dirty.
1117 *
1118 * Pages marked PG_WRITEABLE may be mapped into the user
1119 * address space of a process running on another cpu. A
1120 * user process (without holding the MP lock) running on
1121 * another cpu may be able to touch the page while we are
1122 * trying to remove it. vm_page_cache() will handle this
1123 * case for us.
1124 */
1125 if (m->dirty == 0) {
1126 vm_page_test_dirty(m);
1127 } else {
1128 vm_page_dirty(m);
1129 }
1130
1131 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1132 /*
1133 * Invalid pages can be easily freed
1134 */
1135 vm_pageout_page_free(m);
1136 mycpu->gd_cnt.v_dfree++;
1137 ++count;
1138 ++counts[1];
1139 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1140 /*
1141 * Clean pages can be placed onto the cache queue.
1142 * This effectively frees them.
1143 */
1144 vm_page_cache(m);
1145 ++count;
1146 ++counts[1];
1147 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1148 /*
1149 * Dirty pages need to be paged out, but flushing
1150 * a page is extremely expensive versus freeing
1151 * a clean page. Rather than artificially limiting
1152 * the number of pages we can flush, we instead give
1153 * dirty pages extra priority on the inactive queue
1154 * by forcing them to be cycled through the queue
1155 * twice before being flushed, after which the
1156 * (now clean) page will cycle through once more
1157 * before being freed. This significantly extends
1158 * the thrash point for a heavily loaded machine.
1159 */
1160 ++counts[2];
1161 vm_page_flag_set(m, PG_WINATCFLS);
1162 vm_page_and_queue_spin_lock(m);
1163 if (m->queue - m->pc == PQ_INACTIVE) {
1164 TAILQ_REMOVE(
1165 &vm_page_queues[m->queue].pl, m, pageq);
1166 TAILQ_INSERT_TAIL(
1167 &vm_page_queues[m->queue].pl, m, pageq);
1168 }
1169 vm_page_and_queue_spin_unlock(m);
1170 vm_page_wakeup(m);
1171 } else if (*max_launderp > 0) {
1172 /*
1173 * We always want to try to flush some dirty pages if
1174 * we encounter them, to keep the system stable.
1175 * Normally this number is small, but under extreme
1176 * pressure where there are insufficient clean pages
1177 * on the inactive queue, we may have to go all out.
1178 */
1179 int swap_pageouts_ok;
1180 struct vnode *vp = NULL;
1181
1182 if ((m->flags & PG_WINATCFLS) == 0)
1183 vm_page_flag_set(m, PG_WINATCFLS);
1184 swap_pageouts_ok = 0;
1185 object = m->object;
1186 if (object &&
1187 (object->type != OBJT_SWAP) &&
1188 (object->type != OBJT_DEFAULT)) {
1189 swap_pageouts_ok = 1;
1190 } else {
1191 swap_pageouts_ok = !(defer_swap_pageouts ||
1192 disable_swap_pageouts);
1193 swap_pageouts_ok |= (!disable_swap_pageouts &&
1194 defer_swap_pageouts &&
1195 vm_paging_min());
1196 }
1197
1198 /*
1199 * We don't bother paging objects that are "dead".
1200 * Those objects are in a "rundown" state.
1201 */
1202 if (!swap_pageouts_ok ||
1203 (object == NULL) ||
1204 (object->flags & OBJ_DEAD)) {
1205 vm_page_and_queue_spin_lock(m);
1206 if (m->queue - m->pc == PQ_INACTIVE) {
1207 TAILQ_REMOVE(
1208 &vm_page_queues[m->queue].pl,
1209 m, pageq);
1210 TAILQ_INSERT_TAIL(
1211 &vm_page_queues[m->queue].pl,
1212 m, pageq);
1213 }
1214 vm_page_and_queue_spin_unlock(m);
1215 vm_page_wakeup(m);
1216 return 0;
1217 }
1218
1219 /*
1220 * (m) is still busied.
1221 *
1222 * The object is already known NOT to be dead. It
1223 * is possible for the vget() to block the whole
1224 * pageout daemon, but the new low-memory handling
1225 * code should prevent it.
1226 *
1227 * The previous code skipped locked vnodes and, worse,
1228 * reordered pages in the queue. This results in
1229 * completely non-deterministic operation because,
1230 * quite often, a vm_fault has initiated an I/O and
1231 * is holding a locked vnode at just the point where
1232 * the pageout daemon is woken up.
1233 *
1234 * We can't wait forever for the vnode lock, we might
1235 * deadlock due to a vn_read() getting stuck in
1236 * vm_wait while holding this vnode. We skip the
1237 * vnode if we can't get it in a reasonable amount
1238 * of time.
1239 *
1240 * vpfailed is used to (try to) avoid the case where
1241 * a large number of pages are associated with a
1242 * locked vnode, which could cause the pageout daemon
1243 * to stall for an excessive amount of time.
1244 */
1245 if (object->type == OBJT_VNODE) {
1246 int flags;
1247
1248 vp = object->handle;
1249 flags = LK_EXCLUSIVE;
1250 if (vp == *vpfailedp)
1251 flags |= LK_NOWAIT;
1252 else
1253 flags |= LK_TIMELOCK;
1254 vm_page_hold(m);
1255 vm_page_wakeup(m);
1256
1257 /*
1258 * We have unbusied (m) temporarily so we can
1259 * acquire the vp lock without deadlocking.
1260 * (m) is held to prevent destruction.
1261 */
1262 if (vget(vp, flags) != 0) {
1263 *vpfailedp = vp;
1264 ++pageout_lock_miss;
1265 if (object->flags & OBJ_MIGHTBEDIRTY)
1266 ++*vnodes_skippedp;
1267 vm_page_unhold(m);
1268 return 0;
1269 }
1270
1271 /*
1272 * The page might have been moved to another
1273 * queue during potential blocking in vget()
1274 * above. The page might have been freed and
1275 * reused for another vnode. The object might
1276 * have been reused for another vnode.
1277 */
1278 if (m->queue - m->pc != PQ_INACTIVE ||
1279 m->object != object ||
1280 object->handle != vp) {
1281 if (object->flags & OBJ_MIGHTBEDIRTY)
1282 ++*vnodes_skippedp;
1283 vput(vp);
1284 vm_page_unhold(m);
1285 return 0;
1286 }
1287
1288 /*
1289 * The page may have been busied while we were
1290 * blocked in vget(). We don't move the
1291 * page back onto the end of the queue;
1292 * statistics are more correct if we don't.
1293 */
1294 if (vm_page_busy_try(m, TRUE)) {
1295 vput(vp);
1296 vm_page_unhold(m);
1297 return 0;
1298 }
1299 vm_page_unhold(m);
1300
1301 /*
1302 * If it was wired while we didn't own it.
1303 */
1304 if (m->wire_count) {
1305 vm_page_unqueue_nowakeup(m);
1306 vput(vp);
1307 vm_page_wakeup(m);
1308 return 0;
1309 }
1310
1311 /*
1312 * (m) is busied again
1313 *
1314 * We own the busy bit and remove our hold
1315 * bit. If the page is still held it
1316 * might be undergoing I/O, so skip it.
1317 */
1318 if (m->hold_count) {
1319rebusy_failed:
1320 vm_page_and_queue_spin_lock(m);
1321 if (m->queue - m->pc == PQ_INACTIVE) {
1322 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1323 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1324 }
1325 vm_page_and_queue_spin_unlock(m);
1326 if (object->flags & OBJ_MIGHTBEDIRTY)
1327 ++*vnodes_skippedp;
1328 vm_page_wakeup(m);
1329 vput(vp);
1330 return 0;
1331 }
1332
1333 /*
1334 * Recheck queue, object, and vp now that we have
1335 * rebusied the page.
1336 */
1337 if (m->queue - m->pc != PQ_INACTIVE ||
1338 m->object != object ||
1339 object->handle != vp) {
1340 kprintf("vm_pageout_page: "
1341 "rebusy %p failed(A)\n",
1342 m);
1343 goto rebusy_failed;
1344 }
1345
1346 /*
1347 * Check page validity
1348 */
1349 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1350 kprintf("vm_pageout_page: "
1351 "rebusy %p failed(B)\n",
1352 m);
1353 goto rebusy_failed;
1354 }
1355 if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1356 kprintf("vm_pageout_page: "
1357 "rebusy %p failed(C)\n",
1358 m);
1359 goto rebusy_failed;
1360 }
1361
1362 /* (m) is left busied as we fall through */
1363 }
1364
1365 /*
1366 * page is busy and not held here.
1367 *
1368 * If a page is dirty, then it is either being washed
1369 * (but not yet cleaned) or it is still in the
1370 * laundry. If it is still in the laundry, then we
1371 * start the cleaning operation.
1372 *
1373 * decrement inactive_shortage on success to account
1374 * for the (future) cleaned page. Otherwise we
1375 * could wind up laundering or cleaning too many
1376 * pages.
1377 *
1378 * NOTE: Cleaning the page here does not cause
1379 * force_deficit to be adjusted, because the
1380 * page is not being freed or moved to the
1381 * cache.
1382 */
1383 count = vm_pageout_clean_helper(m, vmflush_flags);
1384 counts[0] += count;
1385 *max_launderp -= count;
1386
1387 /*
1388 * Clean ate busy, page no longer accessible
1389 */
1390 if (vp != NULL)
1391 vput(vp);
1392 } else {
1393 vm_page_wakeup(m);
1394 }
1395 return count;
1396}
1397
1398/*
1399 * Scan active queue
1400 *
1401 * WARNING! Can be called from two pagedaemon threads simultaneously.
1402 */
1403static int
1404vm_pageout_scan_active(int pass, int q,
1405 long avail_shortage, long inactive_shortage,
1406 struct vm_page *marker,
1407 long *recycle_countp)
1408{
1409 vm_page_t m;
1410 int actcount;
1411 long delta = 0;
1412 long maxscan;
1413 int isep;
1414
1415 isep = (curthread == emergpager);
1416
1417 /*
1418 * We want to move pages from the active queue to the inactive
1419 * queue to get the inactive queue to the inactive target. If
1420 * we still have a page shortage from above we try to directly free
1421 * clean pages instead of moving them.
1422 *
1423 * If we do still have a shortage we keep track of the number of
1424 * pages we free or cache (recycle_count) as a measure of thrashing
1425 * between the active and inactive queues.
1426 *
1427 * If we were able to completely satisfy the free+cache targets
1428 * from the inactive pool we limit the number of pages we move
1429 * from the active pool to the inactive pool to 2x the pages we
1430 * had removed from the inactive pool (with a minimum of 1/5 the
1431 * inactive target). If we were not able to completely satisfy
1432 * the free+cache targets we go for the whole target aggressively.
1433 *
1434 * NOTE: Both variables can end up negative.
1435 * NOTE: We are still in a critical section.
1436 *
1437 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
1438 * PAGES.
1439 */
1440
1441 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1442 maxscan = (vm_page_queues[PQ_ACTIVE + q].lcnt + MAXSCAN_DIVIDER - 1) /
1443 MAXSCAN_DIVIDER + 1;
1444
1445 /*
1446 * Queue locked at top of loop to avoid stack marker issues.
1447 */
1448 while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
1449 maxscan-- > 0 && (avail_shortage - delta > 0 ||
1450 inactive_shortage > 0))
1451 {
1452 KKASSERT(m->queue == PQ_ACTIVE + q);
1453 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1454 marker, pageq);
1455 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1456 marker, pageq);
1457
1458 /*
1459 * Skip marker pages (atomic against other markers to avoid
1460 * infinite hop-over scans).
1461 */
1462 if (m->flags & PG_MARKER)
1463 continue;
1464
1465 /*
1466 * Try to busy the page. Don't mess with pages which are
1467 * already busy or reorder them in the queue.
1468 */
1469 if (vm_page_busy_try(m, TRUE))
1470 continue;
1471
1472 /*
1473 * Remaining operations run with the page busy and neither
1474 * the page or the queue will be spin-locked.
1475 */
1476 KKASSERT(m->queue == PQ_ACTIVE + q);
1477 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1478
1479#if 0
1480 /*
1481 * Don't deactivate pages that are held, even if we can
1482 * busy them. (XXX why not?)
1483 */
1484 if (m->hold_count) {
1485 vm_page_and_queue_spin_lock(m);
1486 if (m->queue - m->pc == PQ_ACTIVE) {
1487 TAILQ_REMOVE(
1488 &vm_page_queues[PQ_ACTIVE + q].pl,
1489 m, pageq);
1490 TAILQ_INSERT_TAIL(
1491 &vm_page_queues[PQ_ACTIVE + q].pl,
1492 m, pageq);
1493 }
1494 vm_page_and_queue_spin_unlock(m);
1495 vm_page_wakeup(m);
1496 goto next;
1497 }
1498#endif
1499 /*
1500 * We can just remove wired pages from the queue
1501 */
1502 if (m->wire_count) {
1503 vm_page_unqueue_nowakeup(m);
1504 vm_page_wakeup(m);
1505 goto next;
1506 }
1507
1508 /*
1509 * The emergency pager ignores vnode-backed pages as these
1510 * are the pages that probably bricked the main pager.
1511 */
1512 if (isep && m->object && m->object->type == OBJT_VNODE) {
1513#if 0
1514 vm_page_and_queue_spin_lock(m);
1515 if (m->queue - m->pc == PQ_ACTIVE) {
1516 TAILQ_REMOVE(
1517 &vm_page_queues[PQ_ACTIVE + q].pl,
1518 m, pageq);
1519 TAILQ_INSERT_TAIL(
1520 &vm_page_queues[PQ_ACTIVE + q].pl,
1521 m, pageq);
1522 }
1523 vm_page_and_queue_spin_unlock(m);
1524#endif
1525 vm_page_wakeup(m);
1526 goto next;
1527 }
1528
1529 /*
1530 * The count for pagedaemon pages is done after checking the
1531 * page for eligibility...
1532 */
1533 mycpu->gd_cnt.v_pdpages++;
1534
1535 /*
1536 * Check to see "how much" the page has been used and clear
1537 * the tracking access bits. If the object has no references
1538 * don't bother paying the expense.
1539 */
1540 actcount = 0;
1541 if (m->object && m->object->ref_count != 0) {
1542 if (m->flags & PG_REFERENCED)
1543 ++actcount;
1544 actcount += pmap_ts_referenced(m);
1545 if (actcount) {
1546 m->act_count += ACT_ADVANCE + actcount;
1547 if (m->act_count > ACT_MAX)
1548 m->act_count = ACT_MAX;
1549 }
1550 }
1551 vm_page_flag_clear(m, PG_REFERENCED);
1552
1553 /*
1554 * actcount is only valid if the object ref_count is non-zero.
1555 * If the page does not have an object, actcount will be zero.
1556 */
1557 if (actcount && m->object->ref_count != 0) {
1558#if 0
1559 vm_page_and_queue_spin_lock(m);
1560 if (m->queue - m->pc == PQ_ACTIVE) {
1561 TAILQ_REMOVE(
1562 &vm_page_queues[PQ_ACTIVE + q].pl,
1563 m, pageq);
1564 TAILQ_INSERT_TAIL(
1565 &vm_page_queues[PQ_ACTIVE + q].pl,
1566 m, pageq);
1567 }
1568 vm_page_and_queue_spin_unlock(m);
1569#endif
1570 vm_page_wakeup(m);
1571 } else {
1572 switch(m->object->type) {
1573 case OBJT_DEFAULT:
1574 case OBJT_SWAP:
1575 m->act_count -= min(m->act_count,
1576 vm_anonmem_decline);
1577 break;
1578 default:
1579 m->act_count -= min(m->act_count,
1580 vm_filemem_decline);
1581 break;
1582 }
1583 if (vm_pageout_algorithm ||
1584 (m->object == NULL) ||
1585 (m->object && (m->object->ref_count == 0)) ||
1586 m->act_count < pass + 1
1587 ) {
1588 /*
1589 * Deactivate the page. If we had a
1590 * shortage from our inactive scan try to
1591 * free (cache) the page instead.
1592 *
1593 * Don't just blindly cache the page if
1594 * we do not have a shortage from the
1595 * inactive scan, that could lead to
1596 * gigabytes being moved.
1597 */
1598 --inactive_shortage;
1599 if (avail_shortage - delta > 0 ||
1600 (m->object && (m->object->ref_count == 0)))
1601 {
1602 if (avail_shortage - delta > 0)
1603 ++*recycle_countp;
1604 vm_page_protect(m, VM_PROT_NONE);
1605 if (m->dirty == 0 &&
1606 (m->flags & PG_NEED_COMMIT) == 0 &&
1607 avail_shortage - delta > 0) {
1608 vm_page_cache(m);
1609 } else {
1610 vm_page_deactivate(m);
1611 vm_page_wakeup(m);
1612 }
1613 } else {
1614 vm_page_deactivate(m);
1615 vm_page_wakeup(m);
1616 }
1617 ++delta;
1618 } else {
1619 /*
1620 * Do nothing
1621 */
1622#if 0
1623 vm_page_and_queue_spin_lock(m);
1624 if (m->queue - m->pc == PQ_ACTIVE) {
1625 TAILQ_REMOVE(
1626 &vm_page_queues[PQ_ACTIVE + q].pl,
1627 m, pageq);
1628 TAILQ_INSERT_TAIL(
1629 &vm_page_queues[PQ_ACTIVE + q].pl,
1630 m, pageq);
1631 }
1632 vm_page_and_queue_spin_unlock(m);
1633#endif
1634 vm_page_wakeup(m);
1635 }
1636 }
1637next:
1638 lwkt_yield();
1639 vm_page_queues_spin_lock(PQ_ACTIVE + q);
1640 }
1641
1642 /*
1643 * Clean out our local marker.
1644 *
1645 * Page queue still spin-locked.
1646 */
1647 if (m == NULL) {
1648 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1649 marker, pageq);
1650 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
1651 marker, pageq);
1652 }
1653 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1654
1655 return (delta);
1656}
1657
1658/*
1659 * The number of actually free pages can drop down to v_free_reserved,
1660 * we try to build the free count back above v_free_min, to v_free_target.
1661 *
1662 * Cache pages are already counted as being free-ish.
1663 *
1664 * NOTE: we are still in a critical section.
1665 *
1666 * Pages moved from PQ_CACHE to totally free are not counted in the
1667 * pages_freed counter.
1668 *
1669 * WARNING! Can be called from two pagedaemon threads simultaneously.
1670 */
1671static void
1672vm_pageout_scan_cache(long avail_shortage, int pass,
1673 long vnodes_skipped, long recycle_count)
1674{
1675 static int lastkillticks;
1676 struct vm_pageout_scan_info info;
1677 vm_page_t m;
1678 int isep;
1679
1680 isep = (curthread == emergpager);
1681
1682 /*
1683 * Test conditions also include a safety check against v_free_min in
1684 * case the sysop messes up the sysctls.
1685 *
1686 * Also include a test to avoid degenerate scans.
1687 */
1688 while ((vmstats.v_free_count < vmstats.v_free_target ||
1689 vmstats.v_free_count < vmstats.v_free_min) &&
1690 vmstats.v_cache_count > VM_CACHE_SCAN_MIN)
1691 {
1692 /*
1693 * This steals some code from vm/vm_page.c
1694 *
1695 * Create two rovers and adjust the code to reduce
1696 * chances of them winding up at the same index (which
1697 * can cause a lot of contention).
1698 */
1699 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };
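		/*
		 * Note (intent inferred from the names): PQ_PRIME2 is a
		 * prime stride, so stepping a rover by it modulo the
		 * power-of-two number of PQ_CACHE sub-queues eventually
		 * visits every sub-queue; advancing the two rovers in
		 * opposite directions further reduces the chance that the
		 * primary and emergency pagers collide on the same index.
		 */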
1700
1701 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
1702 goto next_rover;
1703
1704 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
1705 if (m == NULL)
1706 break;
1707 /*
1708 * page is returned removed from its queue and spinlocked.
1709 *
1710 * If the busy attempt fails we can still deactivate the page.
1711 */
1712 if (vm_page_busy_try(m, TRUE)) {
1713 vm_page_deactivate_locked(m);
1714 vm_page_spin_unlock(m);
1715 continue;
1716 }
1717 vm_page_spin_unlock(m);
1718 pagedaemon_wakeup();
1719 lwkt_yield();
1720
1721 /*
1722 * Report a possible edge case. This shouldn't happen but
1723 * actually I think it can race against e.g.
1724 * vm_page_lookup()/busy sequences. If the page isn't
1725 * in a cache-like state we will deactivate and skip it.
1726 */
1727 if ((m->flags & PG_MAPPED) || (m->valid & m->dirty)) {
1728 kprintf("WARNING! page race during find/busy: %p "
1729 "queue == %d dirty=%02x\n",
1730 m, m->queue - m->pc, m->dirty);
1731 }
1732
1733 /*
1734 * Remaining operations run with the page busy and neither
1735 * the page or the queue will be spin-locked.
1736 */
1737 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_MAPPED)) ||
1738 m->hold_count ||
1739 m->wire_count ||
1740 (m->valid & m->dirty))
1741 {
1742 vm_page_deactivate(m);
1743 vm_page_wakeup(m);
1744 continue;
1745 }
1746
1747 /*
1748 * Because the page is in the cache, it shouldn't be mapped.
1749 */
1750 pmap_mapped_sync(m);
1751 KKASSERT((m->flags & PG_MAPPED) == 0);
1752 KKASSERT(m->dirty == 0);
1753 vm_pageout_page_free(m);
1754 mycpu->gd_cnt.v_dfree++;
1755next_rover:
1756 if (isep)
1757 cache_rover[1] -= PQ_PRIME2;
1758 else
1759 cache_rover[0] += PQ_PRIME2;
1760 }
1761
1762 /*
1763 * If we didn't get enough free pages and we have skipped a vnode
1764 * in a writeable object, wake up the sync daemon. Also kick swapout
1765 * if we did not get enough free pages.
1766 */
1767 if (vm_paging_target1()) {
1768 if (vnodes_skipped && vm_paging_min())
1769 speedup_syncer(NULL);
1770#if !defined(NO_SWAPPING)
1771 if (vm_swap_enabled && vm_paging_target1())
1772 vm_req_vmdaemon();
1773#endif
1774 }
1775
1776 /*
1777 * Handle catastrophic conditions. Under good conditions we should
1778 * be at the target, well beyond our minimum. If we could not even
1779 * reach our minimum the system is under heavy stress. But just being
1780 * under heavy stress does not trigger process killing.
1781 *
1782 * We consider ourselves to have run out of memory if the swap pager
1783 * is full and avail_shortage is still positive. The secondary check
1784 * ensures that we do not kill processes if the instantaneous
1785 * availability is good, even if the pageout daemon pass says it
1786 * couldn't get to the target.
1787 *
1788 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
1789 * SITUATIONS.
1790 */
1791 if (swap_pager_almost_full &&
1792 pass > 0 &&
1793 isep == 0 &&
1794 (vm_paging_min_dnc(recycle_count) || avail_shortage > 0)) {
1795 kprintf("Warning: system low on memory+swap "
1796 "shortage %ld for %d ticks!\n",
1797 avail_shortage, ticks - swap_fail_ticks);
1798 if (bootverbose) {
1799 kprintf("Metrics: spaf=%d spf=%d pass=%d "
1800 "availshrt=%ld tgt=%d/%d inacshrt=%ld "
1801 "last=%u\n",
1802 swap_pager_almost_full,
1803 swap_pager_full,
1804 pass,
1805 avail_shortage,
1806 vm_paging_target1(),
1807 vm_paging_target2(),
1808 vm_paging_target2_count(),
1809 (unsigned int)(ticks - lastkillticks));
1810 }
1811 }
1812 if (swap_pager_full &&
1813 pass > 1 &&
1814 isep == 0 &&
1815 avail_shortage > 0 &&
1816 vm_paging_target1() &&
1817 (unsigned int)(ticks - lastkillticks) >= hz)
1818 {
1819 /*
1820 * Kill something, maximum rate once per second to give
1821 * the process time to free up sufficient memory.
1822 */
1823 lastkillticks = ticks;
1824 info.bigproc = NULL;
1825 info.bigsize = 0;
1826 allproc_scan(vm_pageout_scan_callback, &info, 0);
1827 if (info.bigproc != NULL) {
1828 kprintf("Try to kill process %d %s\n",
1829 info.bigproc->p_pid, info.bigproc->p_comm);
1830 info.bigproc->p_nice = PRIO_MIN;
1831 info.bigproc->p_usched->resetpriority(
1832 FIRST_LWP_IN_PROC(info.bigproc));
1833 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1834 killproc(info.bigproc, "out of swap space");
1835 wakeup(&vmstats.v_free_count);
1836 PRELE(info.bigproc);
1837 }
1838 }
1839}
1840
1841static int
1842vm_pageout_scan_callback(struct proc *p, void *data)
1843{
1844 struct vm_pageout_scan_info *info = data;
1845 vm_offset_t size;
1846
1847 /*
1848 * Never kill system processes or init. If we have configured swap
1849 * then try to avoid killing low-numbered pids.
1850 */
1851 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1852 ((p->p_pid < 48) && (vm_swap_size != 0))) {
1853 return (0);
1854 }
1855
1856 lwkt_gettoken(&p->p_token);
1857
1858 /*
1859 * if the process is in a non-running type state,
1860 * don't touch it.
1861 */
1862 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1863 lwkt_reltoken(&p->p_token);
1864 return (0);
1865 }
1866
1867 /*
1868 * Get the approximate process size. Note that anonymous pages
1869 * with backing swap will be counted twice, but there should not
1870 * be too many such pages due to the stress the VM system is
1871 * under at this point.
1872 */
1873 size = vmspace_anonymous_count(p->p_vmspace) +
1874 vmspace_swap_count(p->p_vmspace);
1875
1876 /*
1877	 * If this process is bigger than the biggest one seen so far,
1878	 * remember it.
1879 */
1880 if (info->bigsize < size) {
1881 if (info->bigproc)
1882 PRELE(info->bigproc);
1883 PHOLD(p);
1884 info->bigproc = p;
1885 info->bigsize = size;
1886 }
1887 lwkt_reltoken(&p->p_token);
1888 lwkt_yield();
1889
1890 return(0);
1891}
1892
1893/*
1894 * This old guy slowly walks PQ_HOLD looking for pages which need to be
1895 * moved back to PQ_FREE. It is possible for pages to accumulate here
1896 * when vm_page_free() races against vm_page_unhold(), resulting in a
1897 * page being left on a PQ_HOLD queue with hold_count == 0.
1898 *
1899 * It is easier to handle this edge condition here, in non-critical code,
1900 * rather than enforce a spin-lock for every 1->0 transition in
1901 * vm_page_unhold().
1902 *
1903 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
1904 */
1905static void
1906vm_pageout_scan_hold(int q, struct vm_page *marker)
1907{
1908 vm_page_t m;
1909 long pcount;
1910
1911 pcount = vm_page_queues[PQ_HOLD + q].lcnt;
1912 if (pcount > vm_pageout_stats_scan)
1913 pcount = vm_pageout_stats_scan;
1914
1915 vm_page_queues_spin_lock(PQ_HOLD + q);
1916 while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
1917 pcount-- > 0)
1918 {
1919 KKASSERT(m->queue == PQ_HOLD + q);
1920 TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl, marker, pageq);
1921 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_HOLD + q].pl, m,
1922 marker, pageq);
1923
1924 if (m->flags & PG_MARKER)
1925 continue;
1926
1927 /*
1928		 * Stop at a still-held page, otherwise move the leaked page to PQ_FREE
1929 */
1930 if (m->hold_count)
1931 break;
1932 kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
1933 vm_page_hold(m);
1934 vm_page_queues_spin_unlock(PQ_HOLD + q);
1935 vm_page_unhold(m); /* reprocess */
1936 vm_page_queues_spin_lock(PQ_HOLD + q);
1937 }
1938
1939 /*
1940 * If queue exhausted move the marker back to the head.
1941 */
1942 if (m == NULL) {
1943 TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl,
1944 marker, pageq);
1945 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl,
1946 marker, pageq);
1947 }
1948
1949 vm_page_queues_spin_unlock(PQ_HOLD + q);
1950}
1951
1952/*
1953 * This code maintains m->act_count for active pages.  The scan occurs only
1954 * as long as the pageout daemon is not running or the inactive target has
1955 * not been reached.
1956 *
1957 * These restrictions prevent an idle machine (for example, a workstation
1958 * whose user has gone to bed) from decaying every page's m->act_count to
1959 * zero or nearly zero, which would make the field useless.
1960 */
1961static void
1962vm_pageout_page_stats(int q, struct vm_page *marker, long *counterp)
1963{
1964 struct vpgqueues *pq = &vm_page_queues[PQ_ACTIVE + q];
1965 vm_page_t m;
1966 long pcount; /* Number of pages to check */
1967
1968 /*
1969 * No point scanning the active queue if it is smaller than
1970 * 1/2 usable memory. This most typically occurs at system
1971 * startup or if a huge amount of memory has just been freed.
1972 */
1973 if (vmstats.v_active_count < vmstats.v_free_count +
1974 vmstats.v_cache_count +
1975 vmstats.v_inactive_count)
1976 {
1977 return;
1978 }
1979
1980 /*
1981 * Generally do not scan if the pageout daemon is not running
1982 * or the inactive target has been reached. However, we override
1983 * this and scan anyway for N seconds after the pageout daemon last
1984 * ran.
1985 *
1986 * This last bit is designed to give the system a little time to
1987 * stage more pages for potential deactivation. In this situation,
1988 * if the inactive target has been met, we just update m->act_count
1989	 * and do not otherwise mess with the page.  But we don't want it
1990	 * to run forever because that would cause m->act_count to become
1991	 * unusable if the machine were to become idle.
1992 */
1993 if (vm_pages_needed == 0 && !vm_paging_inactive()) {
1994 if (time_uptime - vm_pagedaemon_uptime > vm_pageout_stats_rsecs)
1995 return;
1996 }
1997
1998 if (vm_pageout_debug) {
1999 static time_t save_time;
2000 if (save_time != time_uptime) {
2001 save_time = time_uptime;
2002 kprintf("DEACTIVATE Q=%4d N=%ld\n",
2003 q, vm_paging_inactive_count());
2004 }
2005 }
2006
2007 /*
2008	 * Limited scan to reduce CPU glitches, just in case
2009	 * pmap_ts_referenced() burns a lot of CPU.
2010 */
2011 pcount = pq->lcnt;
2012 if (pcount > vm_pageout_stats_scan)
2013 pcount = vm_pageout_stats_scan;
2014
2015 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2016
2017 /*
2018 * Queue locked at top of loop to avoid stack marker issues.
2019 */
2020 while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
2021 pcount-- > 0)
2022 {
2023 int actcount;
2024
2025 KKASSERT(m->queue == PQ_ACTIVE + q);
2026 TAILQ_REMOVE(&pq->pl, marker, pageq);
2027 TAILQ_INSERT_AFTER(&pq->pl, m, marker, pageq);
2028
2029 /*
2030 * Skip marker pages (atomic against other markers to avoid
2031 * infinite hop-over scans).
2032 */
2033 if (m->flags & PG_MARKER)
2034 continue;
2035
2036 ++counterp[0];
2037
2038 /*
2039 * Ignore pages we can't busy
2040 */
2041 if (vm_page_busy_try(m, TRUE)) {
2042 continue;
2043 }
2044
2045 /*
2046 * Remaining operations run with the page busy and neither
2047		 * the page nor the queue will be spin-locked.
2048 */
2049 KKASSERT(m->queue == PQ_ACTIVE + q);
2050 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2051
2052 /*
2053 * We can just remove wired pages from the queue
2054 */
2055 if (m->wire_count) {
2056 vm_page_unqueue_nowakeup(m);
2057 vm_page_wakeup(m);
2058 goto next;
2059 }
2060
2062 /*
2063 * We now have a safely busied page, the page and queue
2064 * spinlocks have been released.
2065 *
2066 * Ignore held and wired pages
2067 */
2068 if (m->hold_count || m->wire_count) {
2069 vm_page_wakeup(m);
2070 goto next;
2071 }
2072
2073 /*
2074 * Calculate activity
2075 */
2076 actcount = 0;
2077 if (m->flags & PG_REFERENCED) {
2078 vm_page_flag_clear(m, PG_REFERENCED);
2079 actcount += 1;
2080 }
2081 actcount += pmap_ts_referenced(m);
2082
2083 /*
2084 * Update act_count and move page to end of queue.
2085 */
2086 if (actcount) {
2087 m->act_count += ACT_ADVANCE + actcount;
2088 if (m->act_count > ACT_MAX)
2089 m->act_count = ACT_MAX;
2090#if 0
2091 vm_page_and_queue_spin_lock(m);
2092 if (m->queue - m->pc == PQ_ACTIVE) {
2093 TAILQ_REMOVE(&pq->pl, m, pageq);
2094 TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
2095 }
2096 vm_page_and_queue_spin_unlock(m);
2097#endif
2098 vm_page_wakeup(m);
2099 goto next;
2100 }
2101
2102 if (m->act_count == 0) {
2103 /*
2104 * If the deactivation target has not been reached
2105 * we try to deactivate the page.
2106 *
2107 * If the deactivation target has been reached it
2108 * is a complete waste of time (both now and later)
2109 * to try to deactivate more pages.
2110 */
2111 if (vm_paging_inactive()) {
2112 vm_page_protect(m, VM_PROT_NONE);
2113 vm_page_deactivate(m);
2114 }
2115 ++counterp[1];
2116 } else {
2117 m->act_count -= min(m->act_count, ACT_DECLINE);
2118#if 0
2119 vm_page_and_queue_spin_lock(m);
2120 if (m->queue - m->pc == PQ_ACTIVE) {
2121 TAILQ_REMOVE(&pq->pl, m, pageq);
2122 TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
2123 }
2124 vm_page_and_queue_spin_unlock(m);
2125#endif
2126
2127 if (m->act_count < vm_pageout_stats_actcmp) {
2128 if (vm_paging_inactive()) {
2129 vm_page_protect(m, VM_PROT_NONE);
2130 vm_page_deactivate(m);
2131 }
2132 ++counterp[1];
2133 }
2134 }
2135 vm_page_wakeup(m);
2136next:
2137 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2138 }
2139
2140 /*
2141 * If the queue has been exhausted move the marker back to the head.
2142 */
2143 if (m == NULL) {
2144 TAILQ_REMOVE(&pq->pl, marker, pageq);
2145 TAILQ_INSERT_HEAD(&pq->pl, marker, pageq);
2146 }
2147
2148 /*
2149	 * The caller-supplied marker stays on the queue for the next
2150	 * incremental scan.
2151	 * Page queue still spin-locked; release it now.
2152 */
2153 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2154
2155 /*
2156	 * After roughly every (inalim) pages scanned, determine whether we
2157	 * are making appropriate progress.  If we are, reduce the act_count
2158	 * comparison point; if we are not, increase it.
2159	 *
2160	 * This allows us to handle heavier loads and also keeps the
2161	 * heuristic balanced, particularly at startup.
2162 */
2163 if (counterp[0] > vm_pageout_stats_inalim) {
2164 if (counterp[1] < vm_pageout_stats_inamin) {
2165 if (vm_pageout_stats_actcmp < ACT_MAX * 3 / 4)
2166 ++vm_pageout_stats_actcmp;
2167 } else {
2168 if (vm_pageout_stats_actcmp > 0)
2169 --vm_pageout_stats_actcmp;
2170 }
2171 counterp[0] = 0;
2172 counterp[1] = 0;
2173 }
2174}
2175
2176static void
2177vm_pageout_free_page_calc(vm_size_t count)
2178{
2179 /*
2180 * v_free_min normal allocations
2181 * v_free_reserved system allocations
2182 * v_pageout_free_min allocations by pageout daemon
2183	 * v_interrupt_free_min	low level allocations (e.g. swap structures)
2184 *
2185 * v_free_min is used to generate several other baselines, and they
2186 * can get pretty silly on systems with a lot of memory.
2187 */
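	/*
	 * Illustrative example (editor's sketch, not from the original
	 * source): on a hypothetical machine with 1048576 pages of
	 * physical memory (4GB with 4K pages), the formulas below yield:
	 *
	 *	v_free_min		64 + 1048576/200 = 5306 pages (~20MB)
	 *	v_free_reserved		5306*4/8 + 7     = 2660 pages
	 *	v_free_severe		5306*4/8         = 2653 pages
	 *	v_pageout_free_min	5306*2/8 + 7     = 1333 pages
	 *	v_interrupt_free_min	5306*1/8 + 7     = 670 pages
	 */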
2188 vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
2189 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
2190 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
2191 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
2192 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
2193}
2194
2195
2196/*
2197 * vm_pageout is the high level pageout daemon. TWO kernel threads run
2198 * this daemon, the primary pageout daemon and the emergency pageout daemon.
2199 *
2200 * The emergency pageout daemon takes over when the primary pageout daemon
2201 * deadlocks. The emergency pageout daemon ONLY pages out to swap, thus
2202 * avoiding the many low-memory deadlocks which can occur when paging out
2203 * to VFS's.
2204 */
2205static void
2206vm_pageout_thread(void)
2207{
2208 int pass;
2209 int q;
2210 int q1iterator = 0;
2211 int q2iterator = 0;
2212 int q3iterator = 0;
2213 int isep;
2214 enum { PAGING_IDLE, PAGING_TARGET1, PAGING_TARGET2 } state;
2215 struct markers *markers;
2216 long scounter[2] = { 0, 0 };
2217 time_t warn_time;
2218
2219 curthread->td_flags |= TDF_SYSTHREAD;
2220 state = PAGING_IDLE;
2221
2222 /*
2223	 * Allocate contiguous markers for hold, stats (active), and
2224 * paging active queue scan. These scans occur incrementally.
2225 */
2226 markers = kmalloc(sizeof(*markers) * PQ_L2_SIZE,
2227 M_PAGEOUT, M_WAITOK | M_ZERO);
2228
2229 for (q = 0; q < PQ_L2_SIZE; ++q) {
2230 struct markers *mark = &markers[q];
2231
2232 mark->hold.flags = PG_FICTITIOUS | PG_MARKER;
2233 mark->hold.busy_count = PBUSY_LOCKED;
2234 mark->hold.queue = PQ_HOLD + q;
2235 mark->hold.pc = PQ_HOLD + q;
2236 mark->hold.wire_count = 1;
2237 vm_page_queues_spin_lock(PQ_HOLD + q);
2238 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl,
2239 &mark->hold, pageq);
2240 vm_page_queues_spin_unlock(PQ_HOLD + q);
2241
2242 mark->stat.flags = PG_FICTITIOUS | PG_MARKER;
2243 mark->stat.busy_count = PBUSY_LOCKED;
2244 mark->stat.queue = PQ_ACTIVE + q;
2245 mark->stat.pc = PQ_ACTIVE + q;
2246 mark->stat.wire_count = 1;
2247 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2248 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
2249 &mark->stat, pageq);
2250 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2251
2252 mark->pact.flags = PG_FICTITIOUS | PG_MARKER;
2253 mark->pact.busy_count = PBUSY_LOCKED;
2254 mark->pact.queue = PQ_ACTIVE + q;
2255 mark->pact.pc = PQ_ACTIVE + q;
2256 mark->pact.wire_count = 1;
2257 vm_page_queues_spin_lock(PQ_ACTIVE + q);
2258 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
2259 &mark->pact, pageq);
2260 vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2261 }
2262
2263 /*
2264 * We only need to setup once.
2265 */
2266 isep = 0;
2267 if (curthread == emergpager) {
2268 isep = 1;
2269 goto skip_setup;
2270 }
2271
2272 /*
2273	 * Initialize vm_max_launder per pageout pass to be 1/256
2274	 * of total physical memory (in pages), plus a little slop.
2275 */
2276 if (vm_max_launder == 0)
2277 vm_max_launder = physmem / 256 + 16;
2278
2279 /*
2280 * Initialize some paging parameters.
2281 */
2282 vm_pageout_free_page_calc(vmstats.v_page_count);
2283
2284 /*
2285 * Basic pageout daemon paging operation settings
2286 */
2287 vmstats.v_free_target = vmstats.v_free_min * 2;
2288
2289 vmstats.v_paging_wait = vmstats.v_free_min * 2;
2290 vmstats.v_paging_start = vmstats.v_free_min * 3;
2291 vmstats.v_paging_target1 = vmstats.v_free_min * 4;
2292 vmstats.v_paging_target2 = vmstats.v_free_min * 5;
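	/*
	 * Editor's note (sketch, not from the original source): the
	 * settings above form an increasing ladder expressed in
	 * multiples of v_free_min:
	 *
	 *	v_free_target    = 2 * v_free_min
	 *	v_paging_wait    = 2 * v_free_min
	 *	v_paging_start   = 3 * v_free_min
	 *	v_paging_target1 = 4 * v_free_min
	 *	v_paging_target2 = 5 * v_free_min
	 *
	 * With the hypothetical 5306-page v_free_min from the example in
	 * vm_pageout_free_page_calc(), target2 would be ~26530 pages
	 * (~100MB).
	 */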
2293
2294 /*
2295 * NOTE: With the new buffer cache b_act_count we want the default
2296 * inactive target to be a percentage of available memory.
2297 *
2298 * The inactive target essentially determines the minimum
2299 * number of 'temporary' pages capable of caching one-time-use
2300 * files when the VM system is otherwise full of pages
2301 * belonging to multi-time-use files or active program data.
2302 *
2303	 * NOTE: The inactive target is aggressively pursued only if the
2304 * inactive queue becomes too small. If the inactive queue
2305 * is large enough to satisfy page movement to free+cache
2306 * then it is repopulated more slowly from the active queue.
2307 * This allows a general inactive_target default to be set.
2308 *
2309 * There is an issue here for processes which sit mostly idle
2310 * 'overnight', such as sshd, tcsh, and X. Any movement from
2311	 *	 the active queue will eventually cause such pages to be
2312	 *	 recycled, causing a lot of paging in the morning.
2313	 *	 To reduce the incidence of this, pages cycled out of the
2314 * buffer cache are moved directly to the inactive queue if
2315 * they were only used once or twice.
2316 *
2317 * The vfs.vm_cycle_point sysctl can be used to adjust this.
2318 * Increasing the value (up to 64) increases the number of
2319 * buffer recyclements which go directly to the inactive queue.
2320 *
2321	 * NOTE: There is no 'cache target'.  The combined (free + cache)
2322	 *	 target is handled by the v_paging_* targets above.
2323 */
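	/*
	 * Illustrative arithmetic (editor's sketch, not from the original
	 * source): with roughly 1M pages free when the daemon starts,
	 * v_free_count / 16 gives an inactive target of about 65536
	 * pages (~256MB of 4K pages).
	 */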
2324 vmstats.v_inactive_target = vmstats.v_free_count / 16;
2325 //vmstats.v_inactive_target = vmstats.v_free_min * 4;
2326
2327 /* XXX does not really belong here */
2328 if (vm_page_max_wired == 0)
2329 vm_page_max_wired = vmstats.v_free_count / 3;
2330
2331 /*
2332 * page stats operation.
2333 *
2334 * scan - needs to be large enough for decent turn-around but
2335 * not so large that it eats a ton of CPU. Pages per run.
2336 *
2337 * ticks - interval per run in ticks.
2338 *
2339 * run - number of seconds after the pagedaemon has run that
2340 * we continue to collect page stats, after which we stop.
2341 *
2342 * Calculated for 50% coverage.
2343 *
2344 */
2345 if (vm_pageout_stats_scan == 0) {
2346 vm_pageout_stats_scan = vmstats.v_free_count / PQ_L2_SIZE / 16;
2347 if (vm_pageout_stats_scan < 16)
2348 vm_pageout_stats_scan = 16;
2349 }
2350
2351 if (vm_pageout_stats_ticks == 0)
2352 vm_pageout_stats_ticks = hz / 10;
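	/*
	 * Editor's example (sketch, not from the original source): with
	 * the hypothetical ~1M free pages above, vm_pageout_stats_scan
	 * defaults to 1048576 / PQ_L2_SIZE / 16 pages per queue per run
	 * (minimum 16), and vm_pageout_stats_ticks defaults to hz/10,
	 * i.e. on the order of ten incremental stats runs per second
	 * while the pagedaemon is otherwise idle.
	 */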
2353
2354 vm_pagedaemon_uptime = time_uptime;
2355
2356 swap_pager_swap_init();
2357
2358 atomic_swap_int(&sequence_emerg_pager, 1);
2359 wakeup(&sequence_emerg_pager);
2360
2361skip_setup:
2362 /*
2363 * Sequence emergency pager startup
2364 */
2365 if (isep) {
2366 while (sequence_emerg_pager == 0)
2367 tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
2368 }
2369
2370 pass = 0;
2371 warn_time = time_uptime;
2372
2373 /*
2374 * The pageout daemon is never done, so loop forever.
2375 *
2376 * WARNING! This code is being executed by two kernel threads
2377 * potentially simultaneously.
2378 */
2379 while (TRUE) {
2380 int error;
2381 long avail_shortage;
2382 long inactive_shortage;
2383 long vnodes_skipped = 0;
2384 long recycle_count = 0;
2385 long tmp;
2386
2387 /*
2388 * Don't let pass overflow
2389 */
2390 if (pass > 0x7FFF0000)
2391 pass = 0x70000000;
2392
2393 /*
2394		 * Wait for an action request.  If we time out, check to
2395		 * see whether paging is needed (in case the normal wakeup
2396		 * code raced us).
2397 */
2398 if (isep) {
2399 /*
2400 * Emergency pagedaemon monitors the primary
2401 * pagedaemon while vm_pages_needed != 0.
2402 *
2403 * The emergency pagedaemon only runs if VM paging
2404 * is needed and the primary pagedaemon has not
2405 * updated vm_pagedaemon_uptime for more than 2
2406 * seconds.
2407 */
2408 if (vm_pages_needed)
2409 tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz);
2410 else
2411 tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz*10);
2412 if (vm_pages_needed == 0) {
2413 pass = 0;
2414 continue;
2415 }
2416 if ((int)(time_uptime - vm_pagedaemon_uptime) < 2) {
2417 pass = 0;
2418 continue;
2419 }
2420 } else {
2421 /*
2422 * Primary pagedaemon
2423 *
2424 * Do an unconditional partial scan to deal with
2425 * PQ_HOLD races and to maintain active stats on
2426 * pages that are in PQ_ACTIVE.
2427 */
2428 vm_pageout_scan_hold(q3iterator & PQ_L2_MASK,
2429 &markers[q3iterator & PQ_L2_MASK].hold);
2430 vm_pageout_page_stats(q3iterator & PQ_L2_MASK,
2431 &markers[q3iterator & PQ_L2_MASK].stat,
2432 scounter);
2433 ++q3iterator;
2434
2435 /*
2436 * Primary idle sleep loop, check condition after
2437 * sleep.
2438 *
2439 * NOTE: State will not be IDLE if vm_pages_needed
2440 * is non-zero.
2441 */
2442 if (vm_pages_needed == 0) {
2443 error = tsleep(&vm_pages_needed,
2444 0, "psleep",
2445 vm_pageout_stats_ticks);
2446 if (error &&
2447 vm_paging_start(0) == 0 &&
2448 vm_pages_needed == 0)
2449 {
2450 continue;
2451 }
2452 vm_pagedaemon_uptime = time_uptime;
2453 vm_pages_needed = 1;
2454 state = PAGING_TARGET1;
2455
2456 /*
2457 * Wake the emergency pagedaemon up so it
2458 * can monitor us. It will automatically
2459 * go back into a long sleep when
2460 * vm_pages_needed returns to 0.
2461 */
2462 wakeup(&vm_pagedaemon_uptime);
2463 }
2464 }
2465
2466 mycpu->gd_cnt.v_pdwakeups++;
2467
2468 /*
2469 * Scan for INACTIVE->CLEAN/PAGEOUT
2470 *
2471 * This routine tries to avoid thrashing the system with
2472 * unnecessary activity.
2473 *
2474 * Calculate our target for the number of free+cache pages we
2475		 * want to get to.  This is higher than the number that causes
2476 * allocations to stall (severe) in order to provide hysteresis,
2477 * and if we don't make it all the way but get to the minimum
2478 * we're happy. Goose it a bit if there are multiple requests
2479 * for memory.
2480 *
2481 * Don't reduce avail_shortage inside the loop or the
2482 * PQAVERAGE() calculation will break.
2483 *
2484 * NOTE! deficit is differentiated from avail_shortage as
2485 * REQUIRING at least (deficit) pages to be cleaned,
2486 * even if the page queues are in good shape. This
2487 * is used primarily for handling per-process
2488 * RLIMIT_RSS and may also see small values when
2489 * processes block due to low memory.
2490 */
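		/*
		 * Editor's note (assumption, not from the original
		 * source): PQAVERAGE() is used below to spread the page
		 * shortage across the PQ_L2_SIZE inactive queues, roughly
		 * shortage / PQ_L2_SIZE pages per queue, so each
		 * per-queue scan only has to retire its share.
		 */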
2491 vmstats_rollup();
2492 if (isep == 0)
2493 vm_pagedaemon_uptime = time_uptime;
2494
2495 if (state == PAGING_TARGET1) {
2496 avail_shortage = vm_paging_target1_count() +
2497 vm_pageout_deficit;
2498 } else {
2499 avail_shortage = vm_paging_target2_count() +
2500 vm_pageout_deficit;
2501 }
2502 vm_pageout_deficit = 0;
2503
2504 if (avail_shortage > 0) {
2505 long delta = 0;
2506 long counts[4] = { 0, 0, 0, 0 };
2507 long use = avail_shortage;
2508 int qq;
2509
2510 if (vm_pageout_debug) {
2511 static time_t save_time3;
2512 if (save_time3 != time_uptime) {
2513 save_time3 = time_uptime;
2514 kprintf("scan_inactive "
2515 "pass %d isep=%d\n",
2516 pass, isep);
2517 }
2518 }
2519
2520 /*
2521 * Once target1 is achieved we move on to target2,
2522 * but pageout more lazily in smaller batches.
2523 */
2524 if (state == PAGING_TARGET2 &&
2525 use > vmstats.v_inactive_target / 10)
2526 {
2527 use = vmstats.v_inactive_target / 10 + 1;
2528 }
2529
2530 qq = q1iterator;
2531 for (q = 0; q < PQ_L2_SIZE; ++q) {
2532 delta += vm_pageout_scan_inactive(
2533 pass / MAXSCAN_DIVIDER,
2534 qq & PQ_L2_MASK,
2535 PQAVERAGE(use),
2536 &vnodes_skipped, counts);
2537 if (isep)
2538 --qq;
2539 else
2540 ++qq;
2541 if (avail_shortage - delta <= 0)
2542 break;
2543
2544 /*
2545 * It is possible for avail_shortage to be
2546 * very large. If a large program exits or
2547 * frees a ton of memory all at once, we do
2548 * not have to continue deactivations.
2549 *
2550 * (We will still run the active->inactive
2551 * target, however).
2552 */
2553 if (!vm_paging_target2() &&
2554 !vm_paging_min_dnc(vm_page_free_hysteresis)) {
2555 avail_shortage = 0;
2556 break;
2557 }
2558 }
2559 if (vm_pageout_debug) {
2560 static time_t save_time2;
2561 if (save_time2 != time_uptime) {
2562 save_time2 = time_uptime;
2563 kprintf("flsh %ld cln %ld "
2564 "lru2 %ld react %ld "
2565 "delta %ld\n",
2566 counts[0], counts[1],
2567 counts[2], counts[3],
2568 delta);
2569 }
2570 }
2571 avail_shortage -= delta;
2572 q1iterator = qq;
2573 }
2574
2575 /*
2576 * Figure out how many active pages we must deactivate. If
2577 * we were able to reach our target with just the inactive
2578 * scan above we limit the number of active pages we
2579 * deactivate to reduce unnecessary work.
2580 *
2581 * When calculating inactive_shortage notice that we are
2582 * departing from what vm_paging_inactive_count() does.
2583 * During paging, the free + cache queues are assumed to
2584 * be under stress, so only a pure inactive target is
2585 * calculated without taking into account v_free_min,
2586 * v_free_count, or v_cache_count.
2587 */
2588 vmstats_rollup();
2589 if (isep == 0)
2590 vm_pagedaemon_uptime = time_uptime;
2591 inactive_shortage = vmstats.v_inactive_target -
2592 vmstats.v_inactive_count;
2593
2594 /*
2595 * If we were unable to free sufficient inactive pages to
2596 * satisfy the free/cache queue requirements then simply
2597 * reaching the inactive target may not be good enough.
2598 * Try to deactivate pages in excess of the target based
2599 * on the shortfall.
2600 *
2601 * However to prevent thrashing the VM system do not
2602 * deactivate more than an additional 1/10 the inactive
2603 * target's worth of active pages.
2604 */
2605 if (avail_shortage > 0) {
2606 tmp = avail_shortage * 2;
2607 if (tmp > vmstats.v_inactive_target / 10)
2608 tmp = vmstats.v_inactive_target / 10;
2609 inactive_shortage += tmp;
2610 }
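		/*
		 * Editor's example (sketch, not from the original
		 * source): with a 65536-page inactive target and a
		 * remaining avail_shortage of 10000 pages, tmp = 20000
		 * is capped to 6553, so at most ~6.5K extra active pages
		 * are deactivated this pass.
		 */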
2611
2612 /*
2613 * Only trigger a pmap cleanup on inactive shortage.
2614 */
2615 if (isep == 0 && inactive_shortage > 0) {
2616 pmap_collect();
2617 }
2618
2619 /*
2620 * Scan for ACTIVE->INACTIVE
2621 *
2622 * Only trigger on inactive shortage. Triggering on
2623 * avail_shortage can starve the active queue with
2624 * unnecessary active->inactive transitions and destroy
2625 * performance.
2626 *
2627 * If this is the emergency pager, always try to move
2628 * a few pages from active to inactive because the inactive
2629 * queue might have enough pages, but not enough anonymous
2630 * pages.
2631 */
2632 if (isep && inactive_shortage < vm_emerg_launder)
2633 inactive_shortage = vm_emerg_launder;
2634
2635 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2636 long delta = 0;
2637 int qq;
2638
2639 qq = q2iterator;
2640 for (q = 0; q < PQ_L2_SIZE; ++q) {
2641 delta += vm_pageout_scan_active(
2642 pass / MAXSCAN_DIVIDER,
2643 qq & PQ_L2_MASK,
2644 PQAVERAGE(avail_shortage),
2645 PQAVERAGE(inactive_shortage),
2646 &markers[qq & PQ_L2_MASK].pact,
2647 &recycle_count);
2648 if (isep)
2649 --qq;
2650 else
2651 ++qq;
2652 if (inactive_shortage - delta <= 0 &&
2653 avail_shortage - delta <= 0) {
2654 break;
2655 }
2656
2657 /*
2658 * inactive_shortage can be a very large
2659 * number. This is intended to break out
2660 * early if our inactive_target has been
2661 * reached due to other system activity.
2662 */
2663 if (vmstats.v_inactive_count >
2664 vmstats.v_inactive_target)
2665 {
2666 inactive_shortage = 0;
2667 break;
2668 }
2669 }
2670 inactive_shortage -= delta;
2671 avail_shortage -= delta;
2672 q2iterator = qq;
2673 }
2674
2675 /*
2676 * Scan for CACHE->FREE
2677 *
2678 * Finally free enough cache pages to meet our free page
2679 * requirement and take more drastic measures if we are
2680 * still in trouble.
2681 */
2682 vmstats_rollup();
2683 if (isep == 0)
2684 vm_pagedaemon_uptime = time_uptime;
2685 vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
2686 vnodes_skipped, recycle_count);
2687
2688 /*
2689		 * This is a bit sophisticated because if we were able to retire
2690		 * the shortage we calculated, we do not necessarily want to
2691		 * keep forcing paging until our targets are fully reached.
2692 */
2693 if (avail_shortage > 0) {
2694 /*
2695 * If we did not retire enough pages continue the
2696 * pageout operation until we are able to. It
2697 * takes MAXSCAN_DIVIDER passes to cover the entire
2698 * inactive list.
2699 *
2700 * We used to throw delays in here if paging went on
2701 * continuously but that really just makes things
2702 * worse. Just keep going.
2703 */
2704 if (pass == 0)
2705 warn_time = time_uptime;
2706 ++pass;
2707 if (isep == 0 && time_uptime - warn_time >= 60) {
2708 kprintf("pagedaemon: WARNING! Continuous "
2709 "paging for %ld minutes\n",
2710 (time_uptime - warn_time ) / 60);
2711 warn_time = time_uptime;
2712 }
2713
2714 if (vm_pages_needed) {
2715 /*
2716 * Normal operation, additional processes
2717 * have already kicked us. Retry immediately
2718 * unless swap space is completely full in
2719 * which case delay a bit.
2720 */
2721 if (swap_pager_full) {
2722 tsleep(&vm_pages_needed, 0, "pdelay",
2723 hz / 5);
2724 } /* else immediate loop */
2725 } /* else immediate loop */
2726 } else {
2727 /*
2728 * Reset pass
2729 */
2730 pass = 0;
2731
2732 if (vm_paging_start(0) ||
2733 vm_paging_min_dnc(vm_page_free_hysteresis))
2734 {
2735 /*
2736 * Pages sufficiently exhausted to start
2737 * page-daemon in TARGET1 mode
2738 */
2739 state = PAGING_TARGET1;
2740 vm_pages_needed = 2;
2741
2742 /*
2743 * We can wakeup waiters if we are above
2744 * the wait point.
2745 */
2746 if (!vm_paging_wait())
2747 wakeup(&vmstats.v_free_count);
2748 } else if (vm_pages_needed) {
2749 /*
2750 * Continue paging until TARGET2 reached,
2751 * but waiters can be woken up.
2752 *
2753 * The PAGING_TARGET2 state tells the
2754 * pagedaemon to work a little less hard.
2755 */
2756 if (vm_paging_target1()) {
2757 state = PAGING_TARGET1;
2758 vm_pages_needed = 2;
2759 } else if (vm_paging_target2()) {
2760 state = PAGING_TARGET2;
2761 vm_pages_needed = 2;
2762 } else {
2763 vm_pages_needed = 0;
2764 }
2765 wakeup(&vmstats.v_free_count);
2766 } /* else nothing to do here */
2767 }
2768 }
2769}
2770
2771static struct kproc_desc pg1_kp = {
2772 "pagedaemon",
2773 vm_pageout_thread,
2774 &pagethread
2775};
2776SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);
2777
2778static struct kproc_desc pg2_kp = {
2779 "emergpager",
2780 vm_pageout_thread,
2781 &emergpager
2782};
2783SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);
2784
2785
2786/*
2787 * Called after allocating a page out of the cache or free queue
2788 * to possibly wake the pagedaemon up to replenish our supply.
2789 *
2790 * We try to generate some hysteresis by waking the pagedaemon up
2791 * when our free+cache pages go below the free_min+cache_min level.
2792 * The pagedaemon tries to get the count back up to at least the
2793 * minimum, and through to the target level if possible.
2794 *
2795 * If the pagedaemon is already active bump vm_pages_needed as a hint
2796 * that there are even more requests pending.
2797 *
2798 * SMP races ok?
2799 * No requirements.
2800 */
2801void
2802pagedaemon_wakeup(void)
2803{
2804 if (vm_paging_start(0) && curthread != pagethread) {
2805 if (vm_pages_needed <= 1) {
2806 vm_pages_needed = 1; /* SMP race ok */
2807 wakeup(&vm_pages_needed); /* tickle pageout */
2808 } else if (vm_paging_min()) {
2809 ++vm_pages_needed; /* SMP race ok */
2810 /* a wakeup() would be wasted here */
2811 }
2812 }
2813}
2814
2815#if !defined(NO_SWAPPING)
2816
2817/*
2818 * SMP races ok?
2819 * No requirements.
2820 */
2821static void
2822vm_req_vmdaemon(void)
2823{
2824 static int lastrun = 0;
2825
2826 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2827 wakeup(&vm_daemon_needed);
2828 lastrun = ticks;
2829 }
2830}
2831
2832static int vm_daemon_callback(struct proc *p, void *data __unused);
2833
2834/*
2835 * No requirements.
2836 *
2837 * Scan processes for exceeding their rlimits, deactivate pages
2838 * when RSS is exceeded.
2839 */
2840static void
2841vm_daemon(void)
2842{
2843 while (TRUE) {
2844 tsleep(&vm_daemon_needed, 0, "psleep", 0);
2845 allproc_scan(vm_daemon_callback, NULL, 0);
2846 }
2847}
2848
2849static int
2850vm_daemon_callback(struct proc *p, void *data __unused)
2851{
2852 struct vmspace *vm;
2853 vm_pindex_t limit, size;
2854
2855 /*
2856	 * If this is a system process or a process which is exiting,
2857	 * skip it.
2858 */
2859 lwkt_gettoken(&p->p_token);
2860
2861 if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2862 lwkt_reltoken(&p->p_token);
2863 return (0);
2864 }
2865
2866 /*
2867 * if the process is in a non-running type state,
2868 * don't touch it.
2869 */
2870 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2871 lwkt_reltoken(&p->p_token);
2872 return (0);
2873 }
2874
2875 /*
2876 * get a limit
2877 */
2878 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2879 p->p_rlimit[RLIMIT_RSS].rlim_max));
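	/*
	 * Editor's example (sketch, not from the original source): with
	 * RLIMIT_RSS set to 64MB and 4K pages, limit comes out to 16384
	 * pages.  The check below only deactivates pages once the count
	 * reported by pmap_resident_tlnw_count() exceeds the limit by
	 * more than 4096 pages and vm_pageout_memuse_mode is >= 1.
	 */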
2880
2881 vm = p->p_vmspace;
2882 vmspace_hold(vm);
2883 size = pmap_resident_tlnw_count(&vm->vm_pmap);
2884 if (limit >= 0 && size > 4096 &&
2885 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
2886 vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2887 }
2888 vmspace_drop(vm);
2889
2890 lwkt_reltoken(&p->p_token);
2891
2892 return (0);
2893}
2894
2895#endif