kernel - Clean up spinlock code, add more lwkt_yield()s
[dragonfly.git] / sys / vm / vm_pageout.c
1/*
2 * (MPSAFE)
3 *
4 * Copyright (c) 1991 Regents of the University of California.
5 * All rights reserved.
6 * Copyright (c) 1994 John S. Dyson
7 * All rights reserved.
8 * Copyright (c) 1994 David Greenman
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * The Mach Operating System project at Carnegie-Mellon University.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
39 *
40 *
41 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42 * All rights reserved.
43 *
44 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45 *
46 * Permission to use, copy, modify and distribute this software and
47 * its documentation is hereby granted, provided that both the copyright
48 * notice and this permission notice appear in all copies of the
49 * software, derivative works or modified versions, and any portions
50 * thereof, and that both notices appear in supporting documentation.
51 *
52 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55 *
56 * Carnegie Mellon requests users of this software to return to
57 *
58 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
59 * School of Computer Science
60 * Carnegie Mellon University
61 * Pittsburgh PA 15213-3890
62 *
63 * any improvements or extensions that they make and grant Carnegie the
64 * rights to redistribute these changes.
65 *
66 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
67 */
68
69/*
70 * The proverbial page-out daemon.
71 */
72
73#include "opt_vm.h"
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/kernel.h>
77#include <sys/proc.h>
78#include <sys/kthread.h>
79#include <sys/resourcevar.h>
80#include <sys/signalvar.h>
81#include <sys/vnode.h>
82#include <sys/vmmeter.h>
83#include <sys/sysctl.h>
84
85#include <vm/vm.h>
86#include <vm/vm_param.h>
87#include <sys/lock.h>
88#include <vm/vm_object.h>
89#include <vm/vm_page.h>
90#include <vm/vm_map.h>
91#include <vm/vm_pageout.h>
92#include <vm/vm_pager.h>
93#include <vm/swap_pager.h>
94#include <vm/vm_extern.h>
95
96#include <sys/thread2.h>
97#include <sys/spinlock2.h>
98#include <vm/vm_page2.h>
99
100/*
101 * System initialization
102 */
103
104/* the kernel process "vm_pageout" */
105static int vm_pageout_clean (vm_page_t);
106static int vm_pageout_scan (int pass);
107static int vm_pageout_free_page_calc (vm_size_t count);
108struct thread *pagethread;
109
110#if !defined(NO_SWAPPING)
111/* the kernel process "vm_daemon" */
112static void vm_daemon (void);
113static struct thread *vmthread;
114
115static struct kproc_desc vm_kp = {
116 "vmdaemon",
117 vm_daemon,
118 &vmthread
119};
120SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
121#endif
122
123
124int vm_pages_needed=0; /* Event on which pageout daemon sleeps */
125int vm_pageout_deficit=0; /* Estimated number of pages deficit */
126int vm_pageout_pages_needed=0; /* flag saying that the pageout daemon needs pages */
127
128#if !defined(NO_SWAPPING)
129static int vm_pageout_req_swapout; /* XXX */
130static int vm_daemon_needed;
131#endif
132static int vm_max_launder = 32;
133static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
134static int vm_pageout_full_stats_interval = 0;
135static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
136static int defer_swap_pageouts=0;
137static int disable_swap_pageouts=0;
138
139#if defined(NO_SWAPPING)
140static int vm_swap_enabled=0;
141static int vm_swap_idle_enabled=0;
142#else
143static int vm_swap_enabled=1;
144static int vm_swap_idle_enabled=0;
145#endif
146
147SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
148 CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
149
150SYSCTL_INT(_vm, OID_AUTO, max_launder,
151 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
152
153SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
154 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
155
156SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
157 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
158
159SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
160 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
161
162SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
163 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
164
165#if defined(NO_SWAPPING)
166SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
167 CTLFLAG_RD, &vm_swap_enabled, 0, "");
168SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
169 CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
170#else
171SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
172 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
173SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
174 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
175#endif
176
177SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
178 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
179
180SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
181 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
182
183static int pageout_lock_miss;
184SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
185 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
186
187int vm_load;
188SYSCTL_INT(_vm, OID_AUTO, vm_load,
189 CTLFLAG_RD, &vm_load, 0, "load on the VM system");
190int vm_load_enable = 1;
191SYSCTL_INT(_vm, OID_AUTO, vm_load_enable,
192 CTLFLAG_RW, &vm_load_enable, 0, "enable vm_load rate limiting");
193#ifdef INVARIANTS
194int vm_load_debug;
195SYSCTL_INT(_vm, OID_AUTO, vm_load_debug,
196 CTLFLAG_RW, &vm_load_debug, 0, "debug vm_load");
197#endif
198
199#define VM_PAGEOUT_PAGE_COUNT 16
200int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
201
202int vm_page_max_wired; /* XXX max # of wired pages system-wide */
203
204#if !defined(NO_SWAPPING)
205typedef void freeer_fcn_t (vm_map_t, vm_object_t, vm_pindex_t, int);
206static void vm_pageout_map_deactivate_pages (vm_map_t, vm_pindex_t);
207static freeer_fcn_t vm_pageout_object_deactivate_pages;
208static void vm_req_vmdaemon (void);
209#endif
210static void vm_pageout_page_stats(void);
211
212/*
213 * Update vm_load to slow down faulting processes.
214 *
215 * SMP races ok.
216 * No requirements.
217 */
218void
219vm_fault_ratecheck(void)
220{
221 if (vm_pages_needed) {
222 if (vm_load < 1000)
223 ++vm_load;
224 } else {
225 if (vm_load > 0)
226 --vm_load;
227 }
228}
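
/*
 * Illustrative sketch only (not compiled): one way a fault path might
 * consume vm_load for rate limiting.  The helper name and the scaling
 * factor are hypothetical; the real consumer lives in the fault code,
 * not in this file.
 */
#if 0
static void
vm_fault_throttle_sketch(void)
{
	int burst;

	if (vm_load_enable == 0 || vm_load == 0)
		return;
	/* scale a 0..1000 load value into 0..hz/10 ticks of delay */
	burst = vm_load * (hz / 10) / 1000;
	if (burst > 0)
		tsleep(&vm_load, 0, "vmload", burst);
}
#endif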
229
230/*
231 * vm_pageout_clean:
232 *
233 * Clean the page and remove it from the laundry. The page must not be
234 * busy on-call.
235 *
236 * We set the busy bit to cause potential page faults on this page to
237 * block. Note the careful timing, however: the busy bit isn't set until
238 * late, so we cannot do anything that will mess with the page.
239 */
240static int
241vm_pageout_clean(vm_page_t m)
242{
243 vm_object_t object;
244 vm_page_t mc[2*vm_pageout_page_count];
245 int pageout_count;
246 int error;
247 int ib, is, page_base;
248 vm_pindex_t pindex = m->pindex;
249
250 object = m->object;
251
252 /*
253 * It doesn't cost us anything to page out OBJT_DEFAULT or OBJT_SWAP
254 * with the new swapper, but we could have serious problems paging
255 * out other object types if there is insufficient memory.
256 *
257 * Unfortunately, checking free memory here is far too late, so the
258 * check has been moved up a procedural level.
259 */
260
261 /*
262 * Don't mess with the page if it's busy, held, or special
263 *
264 * XXX do we really need to check hold_count here? hold_count
265 * isn't supposed to mess with vm_page ops except to prevent the
266 * page from being reused.
267 */
268 if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
269 vm_page_wakeup(m);
270 return 0;
271 }
272
273 mc[vm_pageout_page_count] = m;
274 pageout_count = 1;
275 page_base = vm_pageout_page_count;
276 ib = 1;
277 is = 1;
278
279 /*
280 * Scan object for clusterable pages.
281 *
282 * We can cluster ONLY if the page is NOT
283 * clean, wired, busy, held, or mapped into a
284 * buffer, and one of the following:
285 * 1) The page is inactive, or a seldom used
286 * active page.
287 * -or-
288 * 2) we force the issue.
289 *
290 * During heavy mmap/modification loads the pageout
291 * daemon can really fragment the underlying file
292 * due to flushing pages out of order and not trying to
293 * align the clusters (which leaves sporadic out-of-order
294 * holes). To solve this problem we do the reverse scan
295 * first and attempt to align our cluster, then do a
296 * forward scan if room remains.
297 */
298
299 vm_object_hold(object);
300more:
301 while (ib && pageout_count < vm_pageout_page_count) {
302 vm_page_t p;
303
304 if (ib > pindex) {
305 ib = 0;
306 break;
307 }
308
309 p = vm_page_lookup_busy_try(object, pindex - ib, TRUE, &error);
310 if (error || p == NULL) {
311 ib = 0;
312 break;
313 }
314 if ((p->queue - p->pc) == PQ_CACHE ||
315 (p->flags & PG_UNMANAGED)) {
316 vm_page_wakeup(p);
317 ib = 0;
318 break;
319 }
320 vm_page_test_dirty(p);
321 if ((p->dirty & p->valid) == 0 ||
322 p->queue != PQ_INACTIVE ||
323 p->wire_count != 0 || /* may be held by buf cache */
324 p->hold_count != 0) { /* may be undergoing I/O */
325 vm_page_wakeup(p);
326 ib = 0;
327 break;
328 }
329 mc[--page_base] = p;
330 ++pageout_count;
331 ++ib;
332 /*
333 * alignment boundary, stop here and switch directions. Do
334 * not clear ib.
335 */
336 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
337 break;
338 }
339
340 while (pageout_count < vm_pageout_page_count &&
341 pindex + is < object->size) {
342 vm_page_t p;
343
344 p = vm_page_lookup_busy_try(object, pindex + is, TRUE, &error);
345 if (error || p == NULL)
346 break;
347 if (((p->queue - p->pc) == PQ_CACHE) ||
348 (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
349 vm_page_wakeup(p);
350 break;
351 }
352 vm_page_test_dirty(p);
353 if ((p->dirty & p->valid) == 0 ||
354 p->queue != PQ_INACTIVE ||
355 p->wire_count != 0 || /* may be held by buf cache */
356 p->hold_count != 0) { /* may be undergoing I/O */
357 vm_page_wakeup(p);
358 break;
359 }
360 mc[page_base + pageout_count] = p;
361 ++pageout_count;
362 ++is;
363 }
364
365 /*
366 * If we exhausted our forward scan, continue with the reverse scan
367 * when possible, even past a page boundary. This catches boundary
368 * conditions.
369 */
370 if (ib && pageout_count < vm_pageout_page_count)
371 goto more;
372
373 vm_object_drop(object);
374
375 /*
376 * we allow reads during pageouts...
377 */
378 return vm_pageout_flush(&mc[page_base], pageout_count, 0);
379}
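
/*
 * Illustrative sketch only (not compiled): the index arithmetic behind
 * the reverse-then-forward cluster scan above, with all page state
 * checks stripped out.  The helper name is hypothetical.
 */
#if 0
static int
cluster_order_sketch(long pindex, long span, long *out, int nmax)
{
	long ib = 1;	/* reverse (backward) distance */
	long is = 1;	/* forward distance */
	int n = 0;

	/*
	 * Reverse scan first so the cluster tends to end up aligned on
	 * a span boundary, stopping at that boundary or at pindex 0.
	 */
	while (n < nmax && pindex - ib >= 0) {
		out[n++] = pindex - ib;
		++ib;
		if ((pindex - (ib - 1)) % span == 0)
			break;
	}
	/*
	 * Then scan forward from the target page, filling whatever room
	 * remains in the cluster.
	 */
	while (n < nmax) {
		out[n++] = pindex + is;
		++is;
	}
	return (n);
}
#endif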
380
381/*
382 * vm_pageout_flush() - launder the given pages
383 *
384 * The given pages are laundered. Note that we set up for the start of
385 * I/O (i.e. busy the page), mark it read-only, and bump the object
386 * reference count all in here rather than in the parent. If we want
387 * the parent to do more sophisticated things we may have to change
388 * the ordering.
389 *
390 * The pages in the array must be busied by the caller and will be
391 * unbusied by this function.
392 */
393int
394vm_pageout_flush(vm_page_t *mc, int count, int flags)
395{
396 vm_object_t object;
397 int pageout_status[count];
398 int numpagedout = 0;
399 int i;
400
401 /*
402 * Initiate I/O. Bump the vm_page_t->busy counter.
403 */
404 for (i = 0; i < count; i++) {
405 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
406 ("vm_pageout_flush page %p index %d/%d: partially "
407 "invalid page", mc[i], i, count));
408 vm_page_io_start(mc[i]);
409 }
410
411 /*
412 * We must make the pages read-only. This will also force the
413 * modified bit in the related pmaps to be cleared. The pager
414 * cannot clear the bit for us since the I/O completion code
415 * typically runs from an interrupt. The act of making the page
416 * read-only handles the case for us.
417 *
418 * Then we can unbusy the pages, we still hold a reference by virtue
419 * of our soft-busy.
420 */
421 for (i = 0; i < count; i++) {
422 vm_page_protect(mc[i], VM_PROT_READ);
423 vm_page_wakeup(mc[i]);
424 }
425
426 object = mc[0]->object;
427 vm_object_pip_add(object, count);
428
429 vm_pager_put_pages(object, mc, count,
430 (flags | ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
431 pageout_status);
432
433 for (i = 0; i < count; i++) {
434 vm_page_t mt = mc[i];
435
436 switch (pageout_status[i]) {
437 case VM_PAGER_OK:
438 numpagedout++;
439 break;
440 case VM_PAGER_PEND:
441 numpagedout++;
442 break;
443 case VM_PAGER_BAD:
444 /*
445 * Page outside of range of object. Right now we
446 * essentially lose the changes by pretending it
447 * worked.
448 */
449 vm_page_busy_wait(mt, FALSE, "pgbad");
450 pmap_clear_modify(mt);
451 vm_page_undirty(mt);
452 vm_page_wakeup(mt);
453 break;
454 case VM_PAGER_ERROR:
455 case VM_PAGER_FAIL:
456 /*
457 * A page typically cannot be paged out when we
458 * have run out of swap. We leave the page
459 * marked inactive and will try to page it out
460 * again later.
461 *
462 * Starvation of the active page list is used to
463 * determine when the system is massively memory
464 * starved.
465 */
466 break;
467 case VM_PAGER_AGAIN:
468 break;
469 }
470
471 /*
472 * If the operation is still going, leave the page busy to
473 * block all other accesses. Also, leave the paging in
474 * progress indicator set so that we don't attempt an object
475 * collapse.
476 *
477 * For any pages which have completed synchronously,
478 * deactivate the page if we are under a severe deficit.
479 * Do not try to enter them into the cache, though, they
480 * might still be read-heavy.
481 */
482 if (pageout_status[i] != VM_PAGER_PEND) {
483 vm_page_busy_wait(mt, FALSE, "pgouw");
484 if (vm_page_count_severe())
485 vm_page_deactivate(mt);
486#if 0
487 if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
488 vm_page_protect(mt, VM_PROT_READ);
489#endif
490 vm_page_io_finish(mt);
491 vm_page_wakeup(mt);
492 vm_object_pip_wakeup(object);
493 }
494 }
495 return numpagedout;
496}
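
/*
 * Illustrative sketch only (not compiled): the calling convention
 * vm_pageout_flush() expects.  Every page handed in must already be
 * busied and fully valid; the routine unbusies the pages itself.  The
 * helper name is hypothetical and locking/error handling are omitted.
 */
#if 0
static int
flush_one_page_sketch(vm_page_t m)
{
	vm_page_t mc[1];

	vm_page_busy_wait(m, FALSE, "flshex");	/* caller supplies busy */
	mc[0] = m;
	return (vm_pageout_flush(mc, 1, 0));
}
#endif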
497
498#if !defined(NO_SWAPPING)
499/*
500 * Deactivate enough pages to satisfy the inactive target
501 * requirements, or, if vm_page_proc_limit is set,
502 * deactivate all of the pages in the object and its
503 * backing_objects.
504 *
505 * The map must be locked.
506 * The caller must hold the vm_object.
507 */
508static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *);
509
510static void
511vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
512 vm_pindex_t desired, int map_remove_only)
513{
514 struct rb_vm_page_scan_info info;
515 vm_object_t lobject;
516 vm_object_t tobject;
517 int remove_mode;
518
519 lobject = object;
520
521 while (lobject) {
522 if (pmap_resident_count(vm_map_pmap(map)) <= desired)
523 break;
524 if (lobject->type == OBJT_DEVICE || lobject->type == OBJT_PHYS)
525 break;
526 if (lobject->paging_in_progress)
527 break;
528
529 remove_mode = map_remove_only;
530 if (lobject->shadow_count > 1)
531 remove_mode = 1;
532
533 /*
534 * scan the object's entire memory queue. We hold the
535 * object's token so the scan should not race anything.
536 */
537 info.limit = remove_mode;
538 info.map = map;
539 info.desired = desired;
540 vm_page_rb_tree_RB_SCAN(&lobject->rb_memq, NULL,
541 vm_pageout_object_deactivate_pages_callback,
542 &info
543 );
544 while ((tobject = lobject->backing_object) != NULL) {
545 KKASSERT(tobject != object);
546 vm_object_hold(tobject);
547 if (tobject == lobject->backing_object)
548 break;
549 vm_object_drop(tobject);
550 }
551 if (lobject != object)
552 vm_object_drop(lobject);
553 lobject = tobject;
554 }
555 if (lobject != object)
556 vm_object_drop(lobject);
557}
558
559/*
560 * The caller must hold the vm_object.
561 */
562static int
563vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
564{
565 struct rb_vm_page_scan_info *info = data;
566 int actcount;
567
568 if (pmap_resident_count(vm_map_pmap(info->map)) <= info->desired) {
569 return(-1);
570 }
571 mycpu->gd_cnt.v_pdpages++;
572
573 if (vm_page_busy_try(p, TRUE))
574 return(0);
575 if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
576 vm_page_wakeup(p);
577 return(0);
578 }
579 if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) {
580 vm_page_wakeup(p);
581 return(0);
582 }
583
584 actcount = pmap_ts_referenced(p);
585 if (actcount) {
586 vm_page_flag_set(p, PG_REFERENCED);
587 } else if (p->flags & PG_REFERENCED) {
588 actcount = 1;
589 }
590
591 vm_page_and_queue_spin_lock(p);
592 if (p->queue != PQ_ACTIVE && (p->flags & PG_REFERENCED)) {
593 vm_page_and_queue_spin_unlock(p);
594 vm_page_activate(p);
595 p->act_count += actcount;
596 vm_page_flag_clear(p, PG_REFERENCED);
597 } else if (p->queue == PQ_ACTIVE) {
598 if ((p->flags & PG_REFERENCED) == 0) {
599 p->act_count -= min(p->act_count, ACT_DECLINE);
600 if (!info->limit &&
601 (vm_pageout_algorithm || (p->act_count == 0))) {
602 vm_page_and_queue_spin_unlock(p);
603 vm_page_protect(p, VM_PROT_NONE);
604 vm_page_deactivate(p);
605 } else {
606 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
607 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
608 vm_page_and_queue_spin_unlock(p);
609 }
610 } else {
611 vm_page_and_queue_spin_unlock(p);
612 vm_page_activate(p);
613 vm_page_flag_clear(p, PG_REFERENCED);
614
615 vm_page_and_queue_spin_lock(p);
616 if (p->queue == PQ_ACTIVE) {
617 if (p->act_count < (ACT_MAX - ACT_ADVANCE))
618 p->act_count += ACT_ADVANCE;
619 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
620 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
621 }
622 vm_page_and_queue_spin_unlock(p);
623 }
624 } else if (p->queue == PQ_INACTIVE) {
625 vm_page_and_queue_spin_unlock(p);
626 vm_page_protect(p, VM_PROT_NONE);
627 } else {
628 vm_page_and_queue_spin_unlock(p);
629 }
630 vm_page_wakeup(p);
631 return(0);
632}
633
634/*
635 * Deactivate some number of pages in a map. We try to do it fairly, but
636 * that is really hard to do.
637 */
638static void
639vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t desired)
640{
641 vm_map_entry_t tmpe;
642 vm_object_t obj, bigobj;
643 int nothingwired;
644
645 if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT)) {
646 return;
647 }
648
649 bigobj = NULL;
650 nothingwired = TRUE;
651
652 /*
653 * first, search out the biggest object, and try to free pages from
654 * that.
655 */
656 tmpe = map->header.next;
657 while (tmpe != &map->header) {
658 switch(tmpe->maptype) {
659 case VM_MAPTYPE_NORMAL:
660 case VM_MAPTYPE_VPAGETABLE:
661 obj = tmpe->object.vm_object;
662 if ((obj != NULL) && (obj->shadow_count <= 1) &&
663 ((bigobj == NULL) ||
664 (bigobj->resident_page_count < obj->resident_page_count))) {
665 bigobj = obj;
666 }
667 break;
668 default:
669 break;
670 }
671 if (tmpe->wired_count > 0)
672 nothingwired = FALSE;
673 tmpe = tmpe->next;
674 }
675
676 if (bigobj)
677 vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);
678
679 /*
680 * Next, hunt around for other pages to deactivate. We actually
681 * do this search sort of wrong -- .text first is not the best idea.
682 */
683 tmpe = map->header.next;
684 while (tmpe != &map->header) {
685 if (pmap_resident_count(vm_map_pmap(map)) <= desired)
686 break;
687 switch(tmpe->maptype) {
688 case VM_MAPTYPE_NORMAL:
689 case VM_MAPTYPE_VPAGETABLE:
690 obj = tmpe->object.vm_object;
691 if (obj)
692 vm_pageout_object_deactivate_pages(map, obj, desired, 0);
693 break;
694 default:
695 break;
696 }
697 tmpe = tmpe->next;
698 };
699
700 /*
701 * Remove all mappings if a process is swapped out; this will free page
702 * table pages.
703 */
704 if (desired == 0 && nothingwired)
705 pmap_remove(vm_map_pmap(map),
706 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
707 vm_map_unlock(map);
708}
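
/*
 * Illustrative sketch only (not compiled): the "biggest object first"
 * heuristic used above, reduced to its comparison.  The helper name is
 * hypothetical.
 */
#if 0
static vm_object_t
pick_bigger_sketch(vm_object_t bigobj, vm_object_t obj)
{
	if (obj == NULL || obj->shadow_count > 1)
		return (bigobj);
	if (bigobj == NULL ||
	    bigobj->resident_page_count < obj->resident_page_count)
		return (obj);
	return (bigobj);
}
#endif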
709#endif
710
711/*
712 * Called when the pageout scan wants to free a page. We no longer
713 * try to cycle the vm_object here with a reference & dealloc, which can
714 * cause a non-trivial object collapse in a critical path.
715 *
716 * It is unclear why we cycled the ref_count in the past, perhaps to try
717 * to optimize shadow chain collapses, but I don't quite see why it would
718 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages
719 * synchronously and not have to be kick-started.
720 */
721static void
722vm_pageout_page_free(vm_page_t m)
723{
724 vm_page_protect(m, VM_PROT_NONE);
725 vm_page_free(m);
726}
727
728/*
729 * vm_pageout_scan does the dirty work for the pageout daemon.
730 */
731struct vm_pageout_scan_info {
732 struct proc *bigproc;
733 vm_offset_t bigsize;
734};
735
736static int vm_pageout_scan_callback(struct proc *p, void *data);
737
738static int
739vm_pageout_scan(int pass)
740{
741 struct vm_pageout_scan_info info;
742 vm_page_t m;
743 struct vm_page marker;
744 struct vnode *vpfailed; /* warning, allowed to be stale */
745 int maxscan, pcount;
746 int recycle_count;
747 int inactive_shortage, active_shortage;
748 int inactive_original_shortage;
749 vm_object_t object;
750 int actcount;
751 int vnodes_skipped = 0;
752 int maxlaunder;
753
754 /*
755 * Do whatever cleanup that the pmap code can.
756 */
757 pmap_collect();
758
759 /*
760 * Calculate our target for the number of free+cache pages we
761 * want to get to. This is higher than the number that causes
762 * allocations to stall (severe) in order to provide hysteresis,
763 * and if we don't make it all the way but get to the minimum
764 * we're happy.
765 */
766 inactive_shortage = vm_paging_target() + vm_pageout_deficit;
767 inactive_original_shortage = inactive_shortage;
768 vm_pageout_deficit = 0;
769
770 /*
771 * Start scanning the inactive queue for pages we can move to the
772 * cache or free. The scan will stop when the target is reached or
773 * we have scanned the entire inactive queue. Note that m->act_count
774 * is not used to form decisions for the inactive queue, only for the
775 * active queue.
776 *
777 * maxlaunder limits the number of dirty pages we flush per scan.
778 * For most systems a smaller value (16 or 32) is more robust under
779 * extreme memory and disk pressure because any unnecessary writes
780 * to disk can result in extreme performance degradation. However,
781 * systems with excessive dirty pages (especially when MAP_NOSYNC is
782 * used) will die horribly with limited laundering. If the pageout
783 * daemon cannot clean enough pages in the first pass, we let it go
784 * all out in succeeding passes.
785 */
786 if ((maxlaunder = vm_max_launder) <= 1)
787 maxlaunder = 1;
788 if (pass)
789 maxlaunder = 10000;
790
791 /*
792 * Initialize our marker
793 */
794 bzero(&marker, sizeof(marker));
795 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
796 marker.queue = PQ_INACTIVE;
797 marker.wire_count = 1;
798
799 /*
800 * Inactive queue scan.
801 *
802 * NOTE: The vm_page must be spinlocked before the queue to avoid
803 * deadlocks, so it is easiest to simply iterate the loop
804 * with the queue unlocked at the top.
805 */
806 vpfailed = NULL;
807
808 vm_page_queues_spin_lock(PQ_INACTIVE);
809 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
810 maxscan = vmstats.v_inactive_count;
811 vm_page_queues_spin_unlock(PQ_INACTIVE);
812
813 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
814 maxscan-- > 0 && inactive_shortage > 0)
815 {
816 vm_page_and_queue_spin_lock(m);
817 if (m != TAILQ_NEXT(&marker, pageq)) {
818 vm_page_and_queue_spin_unlock(m);
819 ++maxscan;
820 continue;
821 }
822 KKASSERT(m->queue == PQ_INACTIVE);
823 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
824 &marker, pageq);
825 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m,
826 &marker, pageq);
827 mycpu->gd_cnt.v_pdpages++;
828
829 /*
830 * Skip marker pages
831 */
832 if (m->flags & PG_MARKER) {
833 vm_page_and_queue_spin_unlock(m);
834 continue;
835 }
836
837 /*
838 * Try to busy the page. Don't mess with pages which are
839 * already busy or reorder them in the queue.
840 */
841 if (vm_page_busy_try(m, TRUE)) {
842 vm_page_and_queue_spin_unlock(m);
843 continue;
844 }
845 vm_page_and_queue_spin_unlock(m);
846 KKASSERT(m->queue == PQ_INACTIVE);
847
848 lwkt_yield();
849
850 /*
851 * The page has been successfully busied and is now no
852 * longer spinlocked. The queue is no longer spinlocked
853 * either.
854 */
855
856 /*
857 * A held page may be undergoing I/O, so skip it.
858 */
859 if (m->hold_count) {
860 vm_page_and_queue_spin_lock(m);
861 if (m->queue == PQ_INACTIVE) {
862 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
863 m, pageq);
864 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl,
865 m, pageq);
866 }
867 vm_page_and_queue_spin_unlock(m);
868 ++vm_swapcache_inactive_heuristic;
869 vm_page_wakeup(m);
870 continue;
871 }
872
873 if (m->object->ref_count == 0) {
874 /*
875 * If the object is not being used, we ignore previous
876 * references.
877 */
878 vm_page_flag_clear(m, PG_REFERENCED);
879 pmap_clear_reference(m);
880 /* fall through to end */
881 } else if (((m->flags & PG_REFERENCED) == 0) &&
882 (actcount = pmap_ts_referenced(m))) {
883 /*
884 * Otherwise, if the page has been referenced while
885 * in the inactive queue, we bump the "activation
886 * count" upwards, making it less likely that the
887 * page will be added back to the inactive queue
888 * prematurely. Here we check the page tables
889 * (or emulated bits, if any), since the upper level
890 * VM system knows nothing about existing
891 * references.
892 */
893 vm_page_activate(m);
894 m->act_count += (actcount + ACT_ADVANCE);
895 vm_page_wakeup(m);
896 continue;
897 }
898
899 /*
900 * (m) is still busied.
901 *
902 * If the upper level VM system knows about any page
903 * references, we activate the page. We also set the
904 * "activation count" higher than normal so that we will less
905 * likely place pages back onto the inactive queue again.
906 */
907 if ((m->flags & PG_REFERENCED) != 0) {
908 vm_page_flag_clear(m, PG_REFERENCED);
909 actcount = pmap_ts_referenced(m);
910 vm_page_activate(m);
911 m->act_count += (actcount + ACT_ADVANCE + 1);
912 vm_page_wakeup(m);
913 continue;
914 }
915
916 /*
917 * If the upper level VM system doesn't know anything about
918 * the page being dirty, we have to check for it again. As
919 * far as the VM code knows, any partially dirty pages are
920 * fully dirty.
921 *
922 * Pages marked PG_WRITEABLE may be mapped into the user
923 * address space of a process running on another cpu. A
924 * user process (without holding the MP lock) running on
925 * another cpu may be able to touch the page while we are
926 * trying to remove it. vm_page_cache() will handle this
927 * case for us.
928 */
929 if (m->dirty == 0) {
930 vm_page_test_dirty(m);
931 } else {
932 vm_page_dirty(m);
933 }
934
935 if (m->valid == 0) {
936 /*
937 * Invalid pages can be easily freed
938 */
939 vm_pageout_page_free(m);
940 mycpu->gd_cnt.v_dfree++;
941 --inactive_shortage;
942 } else if (m->dirty == 0) {
943 /*
944 * Clean pages can be placed onto the cache queue.
945 * This effectively frees them.
946 */
947 vm_page_cache(m);
948 --inactive_shortage;
949 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
950 /*
951 * Dirty pages need to be paged out, but flushing
952 * a page is extremely expensive versus freeing
953 * a clean page. Rather than artificially limiting
954 * the number of pages we can flush, we instead give
955 * dirty pages extra priority on the inactive queue
956 * by forcing them to be cycled through the queue
957 * twice before being flushed, after which the
958 * (now clean) page will cycle through once more
959 * before being freed. This significantly extends
960 * the thrash point for a heavily loaded machine.
961 */
962 vm_page_flag_set(m, PG_WINATCFLS);
963 vm_page_and_queue_spin_lock(m);
964 if (m->queue == PQ_INACTIVE) {
965 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
966 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
967 }
968 vm_page_and_queue_spin_unlock(m);
969 ++vm_swapcache_inactive_heuristic;
970 vm_page_wakeup(m);
971 } else if (maxlaunder > 0) {
972 /*
973 * We always want to try to flush some dirty pages if
974 * we encounter them, to keep the system stable.
975 * Normally this number is small, but under extreme
976 * pressure where there are insufficient clean pages
977 * on the inactive queue, we may have to go all out.
978 */
979 int swap_pageouts_ok;
980 struct vnode *vp = NULL;
981
982 object = m->object;
983
984 if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
985 swap_pageouts_ok = 1;
986 } else {
987 swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
988 swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
989 vm_page_count_min(0));
990
991 }
992
993 /*
994 * We don't bother paging objects that are "dead".
995 * Those objects are in a "rundown" state.
996 */
997 if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
998 vm_page_and_queue_spin_lock(m);
999 if (m->queue == PQ_INACTIVE) {
1000 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1001 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1002 }
1003 vm_page_and_queue_spin_unlock(m);
1004 ++vm_swapcache_inactive_heuristic;
1005 vm_page_wakeup(m);
1006 continue;
1007 }
1008
1009 /*
1010 * (m) is still busied.
1011 *
1012 * The object is already known NOT to be dead. It
1013 * is possible for the vget() to block the whole
1014 * pageout daemon, but the new low-memory handling
1015 * code should prevent it.
1016 *
1017 * The previous code skipped locked vnodes and, worse,
1018 * reordered pages in the queue. This results in
1019 * completely non-deterministic operation because,
1020 * quite often, a vm_fault has initiated an I/O and
1021 * is holding a locked vnode at just the point where
1022 * the pageout daemon is woken up.
1023 *
1024 * We can't wait forever for the vnode lock, we might
1025 * deadlock due to a vn_read() getting stuck in
1026 * vm_wait while holding this vnode. We skip the
1027 * vnode if we can't get it in a reasonable amount
1028 * of time.
1029 *
1030 * vpfailed is used to (try to) avoid the case where
1031 * a large number of pages are associated with a
1032 * locked vnode, which could cause the pageout daemon
1033 * to stall for an excessive amount of time.
1034 */
1035 if (object->type == OBJT_VNODE) {
1036 int flags;
1037
1038 vp = object->handle;
1039 flags = LK_EXCLUSIVE | LK_NOOBJ;
1040 if (vp == vpfailed)
1041 flags |= LK_NOWAIT;
1042 else
1043 flags |= LK_TIMELOCK;
1044 vm_page_hold(m);
1045 vm_page_wakeup(m);
1046
1047 /*
1048 * We have unbusied (m) temporarily so we can
1049 * acquire the vp lock without deadlocking.
1050 * (m) is held to prevent destruction.
1051 */
1052 if (vget(vp, flags) != 0) {
1053 vpfailed = vp;
1054 ++pageout_lock_miss;
1055 if (object->flags & OBJ_MIGHTBEDIRTY)
1056 vnodes_skipped++;
1057 vm_page_unhold(m);
1058 continue;
1059 }
1060
1061 /*
1062 * The page might have been moved to another
1063 * queue during potential blocking in vget()
1064 * above. The page might have been freed and
1065 * reused for another vnode. The object might
1066 * have been reused for another vnode.
1067 */
1068 if (m->queue != PQ_INACTIVE ||
1069 m->object != object ||
1070 object->handle != vp) {
1071 if (object->flags & OBJ_MIGHTBEDIRTY)
1072 vnodes_skipped++;
1073 vput(vp);
1074 vm_page_unhold(m);
1075 continue;
1076 }
1077
1078 /*
1079 * The page may have been busied while we were
1080 * blocked in vget() above. We don't move the
1081 * page back onto the end of the queue; the
1082 * statistics are more correct if we don't.
1083 */
1084 if (vm_page_busy_try(m, TRUE)) {
1085 vput(vp);
1086 vm_page_unhold(m);
1087 continue;
1088 }
1089 vm_page_unhold(m);
1090
1091 /*
1092 * (m) is busied again
1093 *
1094 * We own the busy bit and remove our hold
1095 * bit. If the page is still held it
1096 * might be undergoing I/O, so skip it.
1097 */
1098 if (m->hold_count) {
1099 vm_page_and_queue_spin_lock(m);
1100 if (m->queue == PQ_INACTIVE) {
1101 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1102 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1103 }
1104 vm_page_and_queue_spin_unlock(m);
1105 ++vm_swapcache_inactive_heuristic;
1106 if (object->flags & OBJ_MIGHTBEDIRTY)
1107 vnodes_skipped++;
1108 vm_page_wakeup(m);
1109 vput(vp);
1110 continue;
1111 }
1112 /* (m) is left busied as we fall through */
1113 }
1114
1115 /*
1116 * page is busy and not held here.
1117 *
1118 * If a page is dirty, then it is either being washed
1119 * (but not yet cleaned) or it is still in the
1120 * laundry. If it is still in the laundry, then we
1121 * start the cleaning operation.
1122 *
1123 * decrement inactive_shortage on success to account
1124 * for the (future) cleaned page. Otherwise we
1125 * could wind up laundering or cleaning too many
1126 * pages.
1127 */
1128 if (vm_pageout_clean(m) != 0) {
1129 --inactive_shortage;
1130 --maxlaunder;
1131 }
1132 /* clean ate busy, page no longer accessible */
1133 if (vp != NULL)
1134 vput(vp);
1135 } else {
1136 vm_page_wakeup(m);
1137 }
1138 }
1139 vm_page_queues_spin_lock(PQ_INACTIVE);
1140 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
1141 vm_page_queues_spin_unlock(PQ_INACTIVE);
1142
1143 /*
1144 * We want to move pages from the active queue to the inactive
1145 * queue to get the inactive queue to the inactive target. If
1146 * we still have a page shortage from above we try to directly free
1147 * clean pages instead of moving them.
1148 *
1149 * If we do still have a shortage we keep track of the number of
1150 * pages we free or cache (recycle_count) as a measure of thrashing
1151 * between the active and inactive queues.
1152 *
1153 * If we were able to completely satisfy the free+cache targets
1154 * from the inactive pool we limit the number of pages we move
1155 * from the active pool to the inactive pool to 2x the pages we
1156 * had removed from the inactive pool (with a minimum of 1/5 the
1157 * inactive target). If we were not able to completely satisfy
1158 * the free+cache targets we go for the whole target aggressively.
1159 *
1160 * NOTE: Both variables can end up negative.
1161 * NOTE: We are still in a critical section.
1162 */
1163 active_shortage = vmstats.v_inactive_target - vmstats.v_inactive_count;
1164 if (inactive_original_shortage < vmstats.v_inactive_target / 10)
1165 inactive_original_shortage = vmstats.v_inactive_target / 10;
1166 if (inactive_shortage <= 0 &&
1167 active_shortage > inactive_original_shortage * 2) {
1168 active_shortage = inactive_original_shortage * 2;
1169 }
1170
1171 recycle_count = 0;
1172 marker.queue = PQ_ACTIVE;
1173
1174 vm_page_queues_spin_lock(PQ_ACTIVE);
1175 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
1176 vm_page_queues_spin_unlock(PQ_ACTIVE);
1177 pcount = vmstats.v_active_count;
1178
1179 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1180 pcount-- > 0 && (inactive_shortage > 0 || active_shortage > 0))
1181 {
1182 vm_page_and_queue_spin_lock(m);
1183 if (m != TAILQ_NEXT(&marker, pageq)) {
1184 vm_page_and_queue_spin_unlock(m);
1185 ++pcount;
1186 continue;
1187 }
1188 KKASSERT(m->queue == PQ_ACTIVE);
1189 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
1190 &marker, pageq);
1191 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE].pl, m,
1192 &marker, pageq);
1193
1194 /*
1195 * Skip marker pages
1196 */
1197 if (m->flags & PG_MARKER) {
1198 vm_page_and_queue_spin_unlock(m);
1199 continue;
1200 }
1201
1202 /*
1203 * Try to busy the page. Don't mess with pages which are
1204 * already busy or reorder them in the queue.
1205 */
1206 if (vm_page_busy_try(m, TRUE)) {
1207 vm_page_and_queue_spin_unlock(m);
1208 continue;
1209 }
1210
1211 /*
1212 * Don't deactivate pages that are held, even if we can
1213 * busy them. (XXX why not?)
1214 */
1215 if (m->hold_count != 0) {
1216 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
1217 m, pageq);
1218 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
1219 m, pageq);
1220 vm_page_and_queue_spin_unlock(m);
1221 vm_page_wakeup(m);
1222 continue;
1223 }
1224 vm_page_and_queue_spin_unlock(m);
1225 lwkt_yield();
1226
1227 /*
1228 * The page has been successfully busied and the page and
1229 * queue are no longer locked.
1230 */
1231
1232 /*
1233 * The count for pagedaemon pages is done after checking the
1234 * page for eligibility...
1235 */
1236 mycpu->gd_cnt.v_pdpages++;
1237
1238 /*
1239 * Check to see "how much" the page has been used and clear
1240 * the tracking access bits. If the object has no references
1241 * don't bother paying the expense.
1242 */
1243 actcount = 0;
1244 if (m->object->ref_count != 0) {
1245 if (m->flags & PG_REFERENCED)
1246 ++actcount;
1247 actcount += pmap_ts_referenced(m);
1248 if (actcount) {
1249 m->act_count += ACT_ADVANCE + actcount;
1250 if (m->act_count > ACT_MAX)
1251 m->act_count = ACT_MAX;
1252 }
1253 }
1254 vm_page_flag_clear(m, PG_REFERENCED);
1255
1256 /*
1257 * actcount is only valid if the object ref_count is non-zero.
1258 */
1259 if (actcount && m->object->ref_count != 0) {
1260 vm_page_and_queue_spin_lock(m);
1261 if (m->queue == PQ_ACTIVE) {
1262 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
1263 m, pageq);
1264 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
1265 m, pageq);
1266 }
1267 vm_page_and_queue_spin_unlock(m);
1268 vm_page_wakeup(m);
1269 } else {
1270 m->act_count -= min(m->act_count, ACT_DECLINE);
1271 if (vm_pageout_algorithm ||
1272 m->object->ref_count == 0 ||
1273 m->act_count < pass + 1
1274 ) {
1275 /*
1276 * Deactivate the page. If we had a
1277 * shortage from our inactive scan try to
1278 * free (cache) the page instead.
1279 *
1280 * Don't just blindly cache the page if
1281 * we do not have a shortage from the
1282 * inactive scan, that could lead to
1283 * gigabytes being moved.
1284 */
1285 --active_shortage;
1286 if (inactive_shortage > 0 ||
1287 m->object->ref_count == 0) {
1288 if (inactive_shortage > 0)
1289 ++recycle_count;
1290 vm_page_protect(m, VM_PROT_NONE);
1291 if (m->dirty == 0 &&
1292 inactive_shortage > 0) {
1293 --inactive_shortage;
1294 vm_page_cache(m);
1295 } else {
1296 vm_page_deactivate(m);
1297 vm_page_wakeup(m);
1298 }
1299 } else {
1300 vm_page_deactivate(m);
1301 vm_page_wakeup(m);
1302 }
1303 } else {
1304 vm_page_and_queue_spin_lock(m);
1305 if (m->queue == PQ_ACTIVE) {
1306 TAILQ_REMOVE(
1307 &vm_page_queues[PQ_ACTIVE].pl,
1308 m, pageq);
1309 TAILQ_INSERT_TAIL(
1310 &vm_page_queues[PQ_ACTIVE].pl,
1311 m, pageq);
1312 }
1313 vm_page_and_queue_spin_unlock(m);
1314 vm_page_wakeup(m);
1315 }
1316 }
1317 }
1318
1319 /*
1320 * Clean out our local marker.
1321 */
1322 vm_page_queues_spin_lock(PQ_ACTIVE);
1323 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
1324 vm_page_queues_spin_unlock(PQ_ACTIVE);
1325
1326 /*
1327 * The number of actually free pages can drop down to v_free_reserved;
1328 * we try to build the free count back up above v_free_min. Note that
1329 * vm_paging_needed() also returns TRUE if v_free_count is not at
1330 * least v_free_min so that is the minimum we must build the free
1331 * count to.
1332 *
1333 * We use a slightly higher target to improve hysteresis,
1334 * ((v_free_target + v_free_min) / 2). Since v_free_target
1335 * is usually the same as v_cache_min this maintains about
1336 * half the pages in the free queue as are in the cache queue,
1337 * providing pretty good pipelining for pageout operation.
1338 *
1339 * The system operator can manipulate vm.v_cache_min and
1340 * vm.v_free_target to tune the pageout daemon. Be sure
1341 * to keep vm.v_free_min < vm.v_free_target.
1342 *
1343 * Note that the original paging target is to get at least
1344 * (free_min + cache_min) into (free + cache). The slightly
1345 * higher target will shift additional pages from cache to free
1346 * without affecting the original paging target in order to
1347 * maintain better hysteresis and not have the free count always
1348 * be dead-on v_free_min.
1349 *
1350 * NOTE: we are still in a critical section.
1351 *
1352 * Pages moved from PQ_CACHE to totally free are not counted in the
1353 * pages_freed counter.
1354 */
1355 while (vmstats.v_free_count <
1356 (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1357 /*
1358 * This steals some code from vm/vm_page.c
1359 */
1360 static int cache_rover = 0;
1361
1362 m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK, FALSE);
1363 if (m == NULL)
1364 break;
1365 /* page is returned removed from its queue and spinlocked */
1366 if (vm_page_busy_try(m, TRUE)) {
1367 vm_page_deactivate_locked(m);
1368 vm_page_spin_unlock(m);
1369#ifdef INVARIANTS
1370 kprintf("Warning: busy page %p found in cache\n", m);
1371#endif
1372 continue;
1373 }
1374 vm_page_spin_unlock(m);
1375 pagedaemon_wakeup();
1376 lwkt_yield();
1377
1378 /*
1379 * Page has been successfully busied and it and its queue
1380 * is no longer spinlocked.
1381 */
1382 if ((m->flags & PG_UNMANAGED) ||
1383 m->hold_count ||
1384 m->wire_count) {
1385 vm_page_deactivate(m);
1386 vm_page_wakeup(m);
1387 continue;
1388 }
1389 KKASSERT((m->flags & PG_MAPPED) == 0);
1390 KKASSERT(m->dirty == 0);
1391 cache_rover += PQ_PRIME2;
1392 vm_pageout_page_free(m);
1393 mycpu->gd_cnt.v_dfree++;
1394 }
1395
1396#if !defined(NO_SWAPPING)
1397 /*
1398 * Idle process swapout -- run once per second.
1399 */
1400 if (vm_swap_idle_enabled) {
1401 static long lsec;
1402 if (time_second != lsec) {
1403 vm_pageout_req_swapout |= VM_SWAP_IDLE;
1404 vm_req_vmdaemon();
1405 lsec = time_second;
1406 }
1407 }
1408#endif
1409
1410 /*
1411 * If we didn't get enough free pages, and we have skipped a vnode
1412 * in a writeable object, wake up the sync daemon. Also kick off swapout
1413 * if we did not get enough free pages.
1414 */
1415 if (vm_paging_target() > 0) {
1416 if (vnodes_skipped && vm_page_count_min(0))
1417 speedup_syncer();
1418#if !defined(NO_SWAPPING)
1419 if (vm_swap_enabled && vm_page_count_target()) {
1420 vm_req_vmdaemon();
1421 vm_pageout_req_swapout |= VM_SWAP_NORMAL;
1422 }
1423#endif
1424 }
1425
1426 /*
1427 * Handle catastrophic conditions. Under good conditions we should
1428 * be at the target, well beyond our minimum. If we could not even
1429 * reach our minimum the system is under heavy stress.
1430 *
1431 * Determine whether we have run out of memory. This occurs when
1432 * swap_pager_full is TRUE and the only pages left in the page
1433 * queues are dirty. We will still likely have page shortages.
1434 *
1435 * - swap_pager_full is set if insufficient swap was
1436 * available to satisfy a requested pageout.
1437 *
1438 * - the inactive queue is bloated (4 x size of active queue),
1439 * meaning the system is unable to get rid of dirty pages.
1440 *
1441 * - vm_page_count_min() without counting pages recycled from the
1442 * active queue (recycle_count) means we could not recover
1443 * enough pages to meet bare minimum needs. This test only
1444 * works if the inactive queue is bloated.
1445 *
1446 * - due to a positive inactive_shortage we shifted the remaining
1447 * dirty pages from the active queue to the inactive queue
1448 * trying to find clean ones to free.
1449 */
1450 if (swap_pager_full && vm_page_count_min(recycle_count))
1451 kprintf("Warning: system low on memory+swap!\n");
1452 if (swap_pager_full && vm_page_count_min(recycle_count) &&
1453 vmstats.v_inactive_count > vmstats.v_active_count * 4 &&
1454 inactive_shortage > 0) {
1455 /*
1456 * Kill something.
1457 */
1458 info.bigproc = NULL;
1459 info.bigsize = 0;
1460 allproc_scan(vm_pageout_scan_callback, &info);
1461 if (info.bigproc != NULL) {
1462 killproc(info.bigproc, "out of swap space");
1463 info.bigproc->p_nice = PRIO_MIN;
1464 info.bigproc->p_usched->resetpriority(
1465 FIRST_LWP_IN_PROC(info.bigproc));
1466 wakeup(&vmstats.v_free_count);
1467 PRELE(info.bigproc);
1468 }
1469 }
1470 return(inactive_shortage);
1471}
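
/*
 * Illustrative sketch only (not compiled): the laundering policy the
 * inactive scan above applies to a dirty page, factored out as a
 * decision function.  The helper name is hypothetical; the real logic
 * is inline in the scan loop.
 */
#if 0
static int
dirty_page_policy_sketch(int pass, int winatcfls_set, int maxlaunder)
{
	if (winatcfls_set == 0 && pass == 0)
		return (0);	/* requeue: cycle through the queue again */
	if (maxlaunder > 0)
		return (1);	/* launder (flush) the page now */
	return (2);		/* out of launder budget, leave it alone */
}
#endif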
1472
1473/*
1474 * The caller must hold proc_token.
1475 */
1476static int
1477vm_pageout_scan_callback(struct proc *p, void *data)
1478{
1479 struct vm_pageout_scan_info *info = data;
1480 vm_offset_t size;
1481
1482 /*
1483 * Never kill system processes or init. If we have configured swap
1484 * then try to avoid killing low-numbered pids.
1485 */
1486 if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
1487 ((p->p_pid < 48) && (vm_swap_size != 0))) {
1488 return (0);
1489 }
1490
1491 /*
1492 * if the process is in a non-running type state,
1493 * don't touch it.
1494 */
1495 if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
1496 return (0);
1497
1498 /*
1499 * Get the approximate process size. Note that anonymous pages
1500 * with backing swap will be counted twice, but there should not
1501 * be too many such pages due to the stress the VM system is
1502 * under at this point.
1503 */
1504 size = vmspace_anonymous_count(p->p_vmspace) +
1505 vmspace_swap_count(p->p_vmspace);
1506
1507 /*
1508 * If this process is bigger than the biggest one seen
1509 * so far, remember it.
1510 */
1511 if (info->bigsize < size) {
1512 if (info->bigproc)
1513 PRELE(info->bigproc);
1514 PHOLD(p);
1515 info->bigproc = p;
1516 info->bigsize = size;
1517 }
1518 lwkt_yield();
1519 return(0);
1520}
1521
1522/*
1523 * This routine tries to maintain the pseudo-LRU active queue so
1524 * that some statistics accumulation still occurs during long
1525 * periods of time when there is no paging. This code helps the
1526 * situation where paging is just starting to occur.
1527 */
1528static void
1529vm_pageout_page_stats(void)
1530{
1531 static int fullintervalcount = 0;
1532 struct vm_page marker;
1533 vm_page_t m;
1534 int pcount, tpcount; /* Number of pages to check */
1535 int page_shortage;
1536
1537 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1538 vmstats.v_free_min) -
1539 (vmstats.v_free_count + vmstats.v_inactive_count +
1540 vmstats.v_cache_count);
1541
1542 if (page_shortage <= 0)
1543 return;
1544
1545 pcount = vmstats.v_active_count;
1546 fullintervalcount += vm_pageout_stats_interval;
1547 if (fullintervalcount < vm_pageout_full_stats_interval) {
1548 tpcount = (vm_pageout_stats_max * vmstats.v_active_count) /
1549 vmstats.v_page_count;
1550 if (pcount > tpcount)
1551 pcount = tpcount;
1552 } else {
1553 fullintervalcount = 0;
1554 }
1555
1556 bzero(&marker, sizeof(marker));
1557 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1558 marker.queue = PQ_ACTIVE;
1559 marker.wire_count = 1;
1560
1561 vm_page_queues_spin_lock(PQ_ACTIVE);
1562 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
1563 vm_page_queues_spin_unlock(PQ_ACTIVE);
1564
1565 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1566 pcount-- > 0)
1567 {
1568 int actcount;
1569
1570 vm_page_and_queue_spin_lock(m);
1571 if (m != TAILQ_NEXT(&marker, pageq)) {
1572 vm_page_and_queue_spin_unlock(m);
1573 ++pcount;
1574 continue;
1575 }
1576 KKASSERT(m->queue == PQ_ACTIVE);
1577 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
1578 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE].pl, m,
1579 &marker, pageq);
1580
1581 /*
1582 * Ignore markers
1583 */
1584 if (m->flags & PG_MARKER) {
1585 vm_page_and_queue_spin_unlock(m);
1586 continue;
1587 }
1588
1589 /*
1590 * Ignore pages we can't busy
1591 */
1592 if (vm_page_busy_try(m, TRUE)) {
1593 vm_page_and_queue_spin_unlock(m);
1594 continue;
1595 }
1596 vm_page_and_queue_spin_unlock(m);
1597 KKASSERT(m->queue == PQ_ACTIVE);
1598
1599 /*
1600 * We now have a safely busied page, the page and queue
1601 * spinlocks have been released.
1602 *
1603 * Ignore held pages
1604 */
1605 if (m->hold_count) {
1606 vm_page_wakeup(m);
1607 continue;
1608 }
1609
1610 /*
1611 * Calculate activity
1612 */
1613 actcount = 0;
1614 if (m->flags & PG_REFERENCED) {
1615 vm_page_flag_clear(m, PG_REFERENCED);
1616 actcount += 1;
1617 }
1618 actcount += pmap_ts_referenced(m);
1619
1620 /*
1621 * Update act_count and move page to end of queue.
1622 */
1623 if (actcount) {
1624 m->act_count += ACT_ADVANCE + actcount;
1625 if (m->act_count > ACT_MAX)
1626 m->act_count = ACT_MAX;
1627 vm_page_and_queue_spin_lock(m);
1628 if (m->queue == PQ_ACTIVE) {
1629 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
1630 m, pageq);
1631 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
1632 m, pageq);
1633 }
1634 vm_page_and_queue_spin_unlock(m);
1635 vm_page_wakeup(m);
1636 continue;
1637 }
1638
1639 if (m->act_count == 0) {
1640 /*
1641 * We turn off page access so that we have
1642 * more accurate RSS stats. We don't do this
1643 * in the normal page deactivation when the
1644 * system is under VM load, because the
1645 * cost of the large number of page protect
1646 * operations would be higher than the value
1647 * of doing the operation.
1648 *
1649 * We use the marker to save our place so
1650 * we can release the spin lock. Both (m)
1651 * and (next) will be invalid.
1652 */
1653 vm_page_protect(m, VM_PROT_NONE);
1654 vm_page_deactivate(m);
1655 } else {
1656 m->act_count -= min(m->act_count, ACT_DECLINE);
1657 vm_page_and_queue_spin_lock(m);
1658 if (m->queue == PQ_ACTIVE) {
1659 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl,
1660 m, pageq);
1661 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
1662 m, pageq);
1663 }
1664 vm_page_and_queue_spin_unlock(m);
1665 }
1666 vm_page_wakeup(m);
1667 }
1668
1669 /*
1670 * Remove our local marker
1671 */
1672 vm_page_queues_spin_lock(PQ_ACTIVE);
1673 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, &marker, pageq);
1674 vm_page_queues_spin_unlock(PQ_ACTIVE);
1675
1676}
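
/*
 * Illustrative sketch only (not compiled): how act_count ages in the
 * stats scan above.  ACT_ADVANCE, ACT_DECLINE and ACT_MAX come from
 * vm_page.h; the helper name is hypothetical.
 */
#if 0
static int
act_count_age_sketch(int act_count, int actcount)
{
	if (actcount) {
		act_count += ACT_ADVANCE + actcount;
		if (act_count > ACT_MAX)
			act_count = ACT_MAX;
	} else {
		act_count -= min(act_count, ACT_DECLINE);
	}
	return (act_count);	/* 0 means the page gets deactivated */
}
#endif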
1677
1678static int
1679vm_pageout_free_page_calc(vm_size_t count)
1680{
1681 if (count < vmstats.v_page_count)
1682 return 0;
1683 /*
1684 * free_reserved needs to include enough for the largest swap pager
1685 * structures plus enough for any pv_entry structs when paging.
1686 *
1687 * v_free_min normal allocations
1688 * v_free_reserved system allocations
1689 * v_pageout_free_min allocations by pageout daemon
1690 * v_interrupt_free_min low level allocations (e.g swap structures)
1691 */
1692 if (vmstats.v_page_count > 1024)
1693 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
1694 else
1695 vmstats.v_free_min = 64;
1696 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
1697 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
1698 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
1699 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
1700
1701 return 1;
1702}
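
/*
 * Worked example (illustrative only): on a machine with roughly 1GB of
 * RAM and 4K pages, v_page_count is on the order of 262144 pages, so
 * the calculation above yields approximately (integer division):
 *
 *	v_free_min           = 64 + (262144 - 1024) / 200 = 1369
 *	v_free_reserved      = 1369 * 4 / 8 + 7           = 691
 *	v_free_severe        = 1369 * 4 / 8               = 684
 *	v_pageout_free_min   = 1369 * 2 / 8 + 7           = 349
 *	v_interrupt_free_min = 1369 * 1 / 8 + 7           = 178
 *
 * The real v_page_count will be somewhat lower since it only counts
 * pages managed by the VM system.
 */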
1703
1704
1705/*
1706 * vm_pageout is the high level pageout daemon.
1707 *
1708 * No requirements.
1709 */
1710static void
1711vm_pageout_thread(void)
1712{
1713 int pass;
1714 int inactive_shortage;
1715
1716 /*
1717 * Initialize some paging parameters.
1718 */
1719 curthread->td_flags |= TDF_SYSTHREAD;
1720
1721 if (vmstats.v_page_count < 2000)
1722 vm_pageout_page_count = 8;
1723
1724 vm_pageout_free_page_calc(vmstats.v_page_count);
1725
1726 /*
1727 * v_free_target and v_cache_min control pageout hysteresis. Note
1728 * that these are more a measure of the VM cache queue hysteresis
1729 * than the VM free queue. Specifically, v_free_target is the
1730 * high water mark (free+cache pages).
1731 *
1732 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
1733 * low water mark, while v_free_min is the stop. v_cache_min must
1734 * be big enough to handle memory needs while the pageout daemon
1735 * is signalled and run to free more pages.
1736 */
1737 if (vmstats.v_free_count > 6144)
1738 vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved;
1739 else
1740 vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved;
1741
1742 /*
1743 * NOTE: With the new buffer cache b_act_count we want the default
1744 * inactive target to be a percentage of available memory.
1745 *
1746 * The inactive target essentially determines the minimum
1747 * number of 'temporary' pages capable of caching one-time-use
1748 * files when the VM system is otherwise full of pages
1749 * belonging to multi-time-use files or active program data.
1750 *
1751 * NOTE: The inactive target is aggressively pursued only if the
1752 * inactive queue becomes too small. If the inactive queue
1753 * is large enough to satisfy page movement to free+cache
1754 * then it is repopulated more slowly from the active queue.
1755 * This allows a general inactive_target default to be set.
1756 *
1757 * There is an issue here for processes which sit mostly idle
1758 * 'overnight', such as sshd, tcsh, and X. Any movement from
1759 * the active queue will eventually cause such pages to be
1760 * recycled, causing a lot of paging in the morning.
1761 * To reduce the incidence of this, pages cycled out of the
1762 * buffer cache are moved directly to the inactive queue if
1763 * they were only used once or twice.
1764 *
1765 * The vfs.vm_cycle_point sysctl can be used to adjust this.
1766 * Increasing the value (up to 64) increases the number of
1767 * buffer recyclements which go directly to the inactive queue.
1768 */
1769 if (vmstats.v_free_count > 2048) {
1770 vmstats.v_cache_min = vmstats.v_free_target;
1771 vmstats.v_cache_max = 2 * vmstats.v_cache_min;
1772 } else {
1773 vmstats.v_cache_min = 0;
1774 vmstats.v_cache_max = 0;
1775 }
1776 vmstats.v_inactive_target = vmstats.v_free_count / 4;
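	/*
	 * Worked example (illustrative only, continuing the hypothetical
	 * 1GB / 4K-page numbers used near vm_pageout_free_page_calc()):
	 * with v_free_min = 1369 and v_free_reserved = 691, a machine
	 * whose free count exceeds 6144 pages ends up with roughly
	 *
	 *	v_free_target = 4 * 1369 + 691 = 6167
	 *	v_cache_min   = 6167, v_cache_max = 12334
	 *
	 * and the free-rebuild loop in vm_pageout_scan() then works the
	 * free count up to about (1369 + 6167) / 2 = 3768 pages.
	 */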
1777
1778 /* XXX does not really belong here */
1779 if (vm_page_max_wired == 0)
1780 vm_page_max_wired = vmstats.v_free_count / 3;
1781
1782 if (vm_pageout_stats_max == 0)
1783 vm_pageout_stats_max = vmstats.v_free_target;
1784
1785 /*
1786 * Set interval in seconds for stats scan.
1787 */
1788 if (vm_pageout_stats_interval == 0)
1789 vm_pageout_stats_interval = 5;
1790 if (vm_pageout_full_stats_interval == 0)
1791 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
1792
1793
1794 /*
1795 * Set maximum free per pass
1796 */
1797 if (vm_pageout_stats_free_max == 0)
1798 vm_pageout_stats_free_max = 5;
1799
1800 swap_pager_swap_init();
1801 pass = 0;
1802
1803 /*
1804 * The pageout daemon is never done, so loop forever.
1805 */
1806 while (TRUE) {
1807 int error;
1808
1809 /*
1810 * Wait for an action request. If we time out, check to
1811 * see if paging is needed (in case the normal wakeup
1812 * code raced us).
1813 */
1814 if (vm_pages_needed == 0) {
1815 error = tsleep(&vm_pages_needed,
1816 0, "psleep",
1817 vm_pageout_stats_interval * hz);
1818 if (error &&
1819 vm_paging_needed() == 0 &&
1820 vm_pages_needed == 0) {
1821 vm_pageout_page_stats();
1822 continue;
1823 }
1824 vm_pages_needed = 1;
1825 }
1826
1827 mycpu->gd_cnt.v_pdwakeups++;
1828
1829 /*
1830 * Scan for pageout. Try to avoid thrashing the system
1831 * with activity.
1832 */
1833 inactive_shortage = vm_pageout_scan(pass);
1834 if (inactive_shortage > 0) {
1835 ++pass;
1836 if (swap_pager_full) {
1837 /*
1838 * Running out of memory, catastrophic back-off
1839 * to one-second intervals.
1840 */
1841 tsleep(&vm_pages_needed, 0, "pdelay", hz);
1842 } else if (pass < 10 && vm_pages_needed > 1) {
1843 /*
1844 * Normal operation, additional processes
1845 * have already kicked us. Retry immediately.
1846 */
1847 } else if (pass < 10) {
1848 /*
1849 * Normal operation, fewer processes. Delay
1850 * a bit but allow wakeups.
1851 */
1852 vm_pages_needed = 0;
1853 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
1854 vm_pages_needed = 1;
1855 } else {
1856 /*
1857 * We've taken too many passes, forced delay.
1858 */
1859 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
1860 }
1861 } else {
1862 /*
1863 * Interlocked wakeup of waiters (non-optional)
1864 */
1865 pass = 0;
1866 if (vm_pages_needed && !vm_page_count_min(0)) {
1867 wakeup(&vmstats.v_free_count);
1868 vm_pages_needed = 0;
1869 }
1870 }
1871 }
1872}
1873
1874static struct kproc_desc page_kp = {
1875 "pagedaemon",
1876 vm_pageout_thread,
1877 &pagethread
1878};
1879SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
1880
1881
1882/*
1883 * Called after allocating a page out of the cache or free queue
1884 * to possibly wake the pagedaemon up to replenish our supply.
1885 *
1886 * We try to generate some hysteresis by waking the pagedaemon up
1887 * when our free+cache pages go below the free_min+cache_min level.
1888 * The pagedaemon tries to get the count back up to at least the
1889 * minimum, and through to the target level if possible.
1890 *
1891 * If the pagedaemon is already active bump vm_pages_needed as a hint
1892 * that there are even more requests pending.
1893 *
1894 * SMP races ok?
1895 * No requirements.
1896 */
1897void
1898pagedaemon_wakeup(void)
1899{
1900 if (vm_paging_needed() && curthread != pagethread) {
1901 if (vm_pages_needed == 0) {
1902 vm_pages_needed = 1; /* SMP race ok */
1903 wakeup(&vm_pages_needed);
1904 } else if (vm_page_count_min(0)) {
1905 ++vm_pages_needed; /* SMP race ok */
1906 }
1907 }
1908}
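
/*
 * Illustrative sketch only (not compiled): the hysteresis described in
 * the comment above, expressed as a simple predicate over the vmstats
 * counters.  The helper name is hypothetical; the real test is
 * vm_paging_needed().
 */
#if 0
static int
paging_needed_sketch(void)
{
	return (vmstats.v_free_count + vmstats.v_cache_count <
		vmstats.v_free_min + vmstats.v_cache_min);
}
#endif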
1909
1910#if !defined(NO_SWAPPING)
1911
1912/*
1913 * SMP races ok?
1914 * No requirements.
1915 */
1916static void
1917vm_req_vmdaemon(void)
1918{
1919 static int lastrun = 0;
1920
1921 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
1922 wakeup(&vm_daemon_needed);
1923 lastrun = ticks;
1924 }
1925}
1926
1927static int vm_daemon_callback(struct proc *p, void *data __unused);
1928
1929/*
1930 * No requirements.
1931 */
1932static void
1933vm_daemon(void)
1934{
1935 /*
1936 * XXX vm_daemon_needed specific token?
1937 */
1938 while (TRUE) {
1939 tsleep(&vm_daemon_needed, 0, "psleep", 0);
1940 if (vm_pageout_req_swapout) {
1941 swapout_procs(vm_pageout_req_swapout);
1942 vm_pageout_req_swapout = 0;
1943 }
1944 /*
1945 * scan the processes for exceeding their rlimits or if
1946 * the process is swapped out -- deactivate pages
1947 */
1948 allproc_scan(vm_daemon_callback, NULL);
1949 }
1950}
1951
1952/*
1953 * Caller must hold proc_token.
1954 */
1955static int
1956vm_daemon_callback(struct proc *p, void *data __unused)
1957{
1958 vm_pindex_t limit, size;
1959
1960 /*
1961 * if this is a system process or the process is exiting,
1962 * skip it.
1963 */
1964 if (p->p_flag & (P_SYSTEM | P_WEXIT))
1965 return (0);
1966
1967 /*
1968 * if the process is in a non-running type state,
1969 * don't touch it.
1970 */
1971 if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
1972 return (0);
1973
1974 /*
1975 * get a limit
1976 */
1977 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
1978 p->p_rlimit[RLIMIT_RSS].rlim_max));
1979
1980 /*
1981 * let processes that are swapped out really be
1982 * swapped out. Set the limit to nothing to get as
1983 * many pages out to swap as possible.
1984 */
1985 if (p->p_flag & P_SWAPPEDOUT)
1986 limit = 0;
1987
1988 lwkt_gettoken(&p->p_vmspace->vm_map.token);
1989 size = vmspace_resident_count(p->p_vmspace);
1990 if (limit >= 0 && size >= limit) {
1991 vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, limit);
1992 }
1993 lwkt_reltoken(&p->p_vmspace->vm_map.token);
1994 return (0);
1995}
1996
1997#endif