| 1 | /* |
| 2 | * (MPSAFE) |
| 3 | * |
| 4 | * Copyright (c) 1991 Regents of the University of California. |
| 5 | * All rights reserved. |
| 6 | * Copyright (c) 1994 John S. Dyson |
| 7 | * All rights reserved. |
| 8 | * Copyright (c) 1994 David Greenman |
| 9 | * All rights reserved. |
| 10 | * |
| 11 | * This code is derived from software contributed to Berkeley by |
| 12 | * The Mach Operating System project at Carnegie-Mellon University. |
| 13 | * |
| 14 | * Redistribution and use in source and binary forms, with or without |
| 15 | * modification, are permitted provided that the following conditions |
| 16 | * are met: |
| 17 | * 1. Redistributions of source code must retain the above copyright |
| 18 | * notice, this list of conditions and the following disclaimer. |
| 19 | * 2. Redistributions in binary form must reproduce the above copyright |
| 20 | * notice, this list of conditions and the following disclaimer in the |
| 21 | * documentation and/or other materials provided with the distribution. |
| 22 | * 4. Neither the name of the University nor the names of its contributors |
| 23 | * may be used to endorse or promote products derived from this software |
| 24 | * without specific prior written permission. |
| 25 | * |
| 26 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 27 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 28 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 29 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 30 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 31 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 32 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 33 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 34 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 35 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 36 | * SUCH DAMAGE. |
| 37 | * |
| 38 | * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 |
| 39 | * |
| 40 | * |
| 41 | * Copyright (c) 1987, 1990 Carnegie-Mellon University. |
| 42 | * All rights reserved. |
| 43 | * |
| 44 | * Authors: Avadis Tevanian, Jr., Michael Wayne Young |
| 45 | * |
| 46 | * Permission to use, copy, modify and distribute this software and |
| 47 | * its documentation is hereby granted, provided that both the copyright |
| 48 | * notice and this permission notice appear in all copies of the |
| 49 | * software, derivative works or modified versions, and any portions |
| 50 | * thereof, and that both notices appear in supporting documentation. |
| 51 | * |
| 52 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 53 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
| 54 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 55 | * |
| 56 | * Carnegie Mellon requests users of this software to return to |
| 57 | * |
| 58 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 59 | * School of Computer Science |
| 60 | * Carnegie Mellon University |
| 61 | * Pittsburgh PA 15213-3890 |
| 62 | * |
| 63 | * any improvements or extensions that they make and grant Carnegie the |
| 64 | * rights to redistribute these changes. |
| 65 | * |
| 66 | * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ |
| 67 | */ |
| 68 | |
| 69 | /* |
| 70 | * The proverbial page-out daemon. |
| 71 | */ |
| 72 | |
| 73 | #include "opt_vm.h" |
| 74 | #include <sys/param.h> |
| 75 | #include <sys/systm.h> |
| 76 | #include <sys/kernel.h> |
| 77 | #include <sys/proc.h> |
| 78 | #include <sys/kthread.h> |
| 79 | #include <sys/resourcevar.h> |
| 80 | #include <sys/signalvar.h> |
| 81 | #include <sys/vnode.h> |
| 82 | #include <sys/vmmeter.h> |
| 83 | #include <sys/sysctl.h> |
| 84 | |
| 85 | #include <vm/vm.h> |
| 86 | #include <vm/vm_param.h> |
| 87 | #include <sys/lock.h> |
| 88 | #include <vm/vm_object.h> |
| 89 | #include <vm/vm_page.h> |
| 90 | #include <vm/vm_map.h> |
| 91 | #include <vm/vm_pageout.h> |
| 92 | #include <vm/vm_pager.h> |
| 93 | #include <vm/swap_pager.h> |
| 94 | #include <vm/vm_extern.h> |
| 95 | |
| 96 | #include <sys/thread2.h> |
| 97 | #include <sys/spinlock2.h> |
| 98 | #include <vm/vm_page2.h> |
| 99 | |
| 100 | /* |
| 101 | * System initialization |
| 102 | */ |
| 103 | |
| 104 | /* the kernel process "vm_pageout"*/ |
| 105 | static int vm_pageout_clean (vm_page_t); |
| 106 | static int vm_pageout_free_page_calc (vm_size_t count); |
| 107 | struct thread *pagethread; |
| 108 | |
| 109 | #if !defined(NO_SWAPPING) |
| 110 | /* the kernel process "vm_daemon"*/ |
| 111 | static void vm_daemon (void); |
| 112 | static struct thread *vmthread; |
| 113 | |
| 114 | static struct kproc_desc vm_kp = { |
| 115 | "vmdaemon", |
| 116 | vm_daemon, |
| 117 | &vmthread |
| 118 | }; |
| 119 | SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) |
| 120 | #endif |
| 121 | |
| 122 | int vm_pages_needed=0; /* Event on which pageout daemon sleeps */ |
| 123 | int vm_pageout_deficit=0; /* Estimated number of pages deficit */ |
| 124 | int vm_pageout_pages_needed=0; /* flag saying that the pageout daemon needs pages */ |
| 125 | |
| 126 | #if !defined(NO_SWAPPING) |
| 127 | static int vm_pageout_req_swapout; /* XXX */ |
| 128 | static int vm_daemon_needed; |
| 129 | #endif |
| 130 | static int vm_max_launder = 32; |
| 131 | static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; |
| 132 | static int vm_pageout_full_stats_interval = 0; |
| 133 | static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; |
| 134 | static int defer_swap_pageouts=0; |
| 135 | static int disable_swap_pageouts=0; |
| 136 | |
| 137 | #if defined(NO_SWAPPING) |
| 138 | static int vm_swap_enabled=0; |
| 139 | static int vm_swap_idle_enabled=0; |
| 140 | #else |
| 141 | static int vm_swap_enabled=1; |
| 142 | static int vm_swap_idle_enabled=0; |
| 143 | #endif |
| 144 | |
| 145 | SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, |
| 146 | CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt"); |
| 147 | |
| 148 | SYSCTL_INT(_vm, OID_AUTO, max_launder, |
| 149 | CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); |
| 150 | |
| 151 | SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, |
| 152 | CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); |
| 153 | |
| 154 | SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, |
| 155 | CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); |
| 156 | |
| 157 | SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, |
| 158 | CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); |
| 159 | |
| 160 | SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, |
| 161 | CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); |
| 162 | |
| 163 | #if defined(NO_SWAPPING) |
| 164 | SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, |
| 165 | CTLFLAG_RD, &vm_swap_enabled, 0, ""); |
| 166 | SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, |
| 167 | CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); |
| 168 | #else |
| 169 | SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, |
| 170 | CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); |
| 171 | SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, |
| 172 | CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); |
| 173 | #endif |
| 174 | |
| 175 | SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, |
| 176 | CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); |
| 177 | |
| 178 | SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, |
| 179 | CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); |
| 180 | |
| 181 | static int pageout_lock_miss; |
| 182 | SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, |
| 183 | CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); |
| 184 | |
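|  | /* |
|  | * A single pageout I/O issued by vm_pageout_clean() clusters at most |
|  | * vm_pageout_page_count pages (VM_PAGEOUT_PAGE_COUNT by default). |
|  | */ |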
| 185 | #define VM_PAGEOUT_PAGE_COUNT 16 |
| 186 | int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; |
| 187 | |
| 188 | int vm_page_max_wired; /* XXX max # of wired pages system-wide */ |
| 189 | |
| 190 | #if !defined(NO_SWAPPING) |
| 191 | typedef void freeer_fcn_t (vm_map_t, vm_object_t, vm_pindex_t, int); |
| 192 | static void vm_pageout_map_deactivate_pages (vm_map_t, vm_pindex_t); |
| 193 | static freeer_fcn_t vm_pageout_object_deactivate_pages; |
| 194 | static void vm_req_vmdaemon (void); |
| 195 | #endif |
| 196 | static void vm_pageout_page_stats(int q); |
| 197 | |
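|  | /* |
|  | * PQAVERAGE() spreads a global page target across the PQ_L2_SIZE |
|  | * per-color queues: it rounds the per-queue share away from zero and |
|  | * biases it by one page, so any non-zero target asks each queue for |
|  | * at least one page. |
|  | */ |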
| 198 | static __inline int |
| 199 | PQAVERAGE(int n) |
| 200 | { |
| 201 | if (n >= 0) |
| 202 | return((n + (PQ_L2_SIZE - 1)) / PQ_L2_SIZE + 1); |
| 203 | else |
| 204 | return((n - (PQ_L2_SIZE - 1)) / PQ_L2_SIZE - 1); |
| 205 | } |
| 206 | |
| 207 | /* |
| 208 | * vm_pageout_clean: |
| 209 | * |
| 210 | * Clean the page and remove it from the laundry. The page must not be |
| 211 | * busy on-call. |
| 212 | * |
| 213 | * We set the busy bit to cause potential page faults on this page to |
| 214 | * block. Note the careful timing, however: the busy bit isn't set until |
| 215 | * late, and we cannot do anything that will mess with the page. |
| 216 | */ |
| 217 | static int |
| 218 | vm_pageout_clean(vm_page_t m) |
| 219 | { |
| 220 | vm_object_t object; |
| 221 | vm_page_t mc[2*vm_pageout_page_count]; |
| 222 | int pageout_count; |
| 223 | int error; |
| 224 | int ib, is, page_base; |
| 225 | vm_pindex_t pindex = m->pindex; |
| 226 | |
| 227 | object = m->object; |
| 228 | |
| 229 | /* |
| 230 | * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP |
| 231 | * with the new swapper, but we could have serious problems paging |
| 232 | * out other object types if there is insufficient memory. |
| 233 | * |
| 234 | * Unfortunately, checking free memory here is far too late, so the |
| 235 | * check has been moved up a procedural level. |
| 236 | */ |
| 237 | |
| 238 | /* |
| 239 | * Don't mess with the page if it's busy, held, or special |
| 240 | * |
| 241 | * XXX do we really need to check hold_count here? hold_count |
| 242 | * isn't supposed to mess with vm_page ops except to prevent the |
| 243 | * page from being reused. |
| 244 | */ |
| 245 | if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) { |
| 246 | vm_page_wakeup(m); |
| 247 | return 0; |
| 248 | } |
| 249 | |
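|  | /* |
|  | * Place the target page at the midpoint of mc[]. The reverse scan |
|  | * below fills slots downward from page_base while the forward scan |
|  | * fills upward, so the final cluster is the contiguous range |
|  | * mc[page_base .. page_base + pageout_count - 1] handed to |
|  | * vm_pageout_flush(). |
|  | */ |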
| 250 | mc[vm_pageout_page_count] = m; |
| 251 | pageout_count = 1; |
| 252 | page_base = vm_pageout_page_count; |
| 253 | ib = 1; |
| 254 | is = 1; |
| 255 | |
| 256 | /* |
| 257 | * Scan object for clusterable pages. |
| 258 | * |
| 259 | * We can cluster ONLY if: ->> the page is NOT |
| 260 | * clean, wired, busy, held, or mapped into a |
| 261 | * buffer, and one of the following: |
| 262 | * 1) The page is inactive, or a seldom used |
| 263 | * active page. |
| 264 | * -or- |
| 265 | * 2) we force the issue. |
| 266 | * |
| 267 | * During heavy mmap/modification loads the pageout |
| 268 | * daemon can really fragment the underlying file |
| 269 | * due to flushing pages out of order and not trying to |
| 270 | * align the clusters (which leaves sporadic out-of-order |
| 271 | * holes). To solve this problem we do the reverse scan |
| 272 | * first and attempt to align our cluster, then do a |
| 273 | * forward scan if room remains. |
| 274 | */ |
| 275 | |
| 276 | vm_object_hold(object); |
| 277 | more: |
| 278 | while (ib && pageout_count < vm_pageout_page_count) { |
| 279 | vm_page_t p; |
| 280 | |
| 281 | if (ib > pindex) { |
| 282 | ib = 0; |
| 283 | break; |
| 284 | } |
| 285 | |
| 286 | p = vm_page_lookup_busy_try(object, pindex - ib, TRUE, &error); |
| 287 | if (error || p == NULL) { |
| 288 | ib = 0; |
| 289 | break; |
| 290 | } |
| 291 | if ((p->queue - p->pc) == PQ_CACHE || |
| 292 | (p->flags & PG_UNMANAGED)) { |
| 293 | vm_page_wakeup(p); |
| 294 | ib = 0; |
| 295 | break; |
| 296 | } |
| 297 | vm_page_test_dirty(p); |
| 298 | if ((p->dirty & p->valid) == 0 || |
| 299 | p->queue - p->pc != PQ_INACTIVE || |
| 300 | p->wire_count != 0 || /* may be held by buf cache */ |
| 301 | p->hold_count != 0) { /* may be undergoing I/O */ |
| 302 | vm_page_wakeup(p); |
| 303 | ib = 0; |
| 304 | break; |
| 305 | } |
| 306 | mc[--page_base] = p; |
| 307 | ++pageout_count; |
| 308 | ++ib; |
| 309 | /* |
| 310 | * alignment boundary; stop here and switch directions. Do |
| 311 | * not clear ib. |
| 312 | */ |
| 313 | if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) |
| 314 | break; |
| 315 | } |
| 316 | |
| 317 | while (pageout_count < vm_pageout_page_count && |
| 318 | pindex + is < object->size) { |
| 319 | vm_page_t p; |
| 320 | |
| 321 | p = vm_page_lookup_busy_try(object, pindex + is, TRUE, &error); |
| 322 | if (error || p == NULL) |
| 323 | break; |
| 324 | if (((p->queue - p->pc) == PQ_CACHE) || |
| 325 | (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) { |
| 326 | vm_page_wakeup(p); |
| 327 | break; |
| 328 | } |
| 329 | vm_page_test_dirty(p); |
| 330 | if ((p->dirty & p->valid) == 0 || |
| 331 | p->queue - p->pc != PQ_INACTIVE || |
| 332 | p->wire_count != 0 || /* may be held by buf cache */ |
| 333 | p->hold_count != 0) { /* may be undergoing I/O */ |
| 334 | vm_page_wakeup(p); |
| 335 | break; |
| 336 | } |
| 337 | mc[page_base + pageout_count] = p; |
| 338 | ++pageout_count; |
| 339 | ++is; |
| 340 | } |
| 341 | |
| 342 | /* |
| 343 | * If we exhausted our forward scan, continue with the reverse scan |
| 344 | * when possible, even past a page boundary. This catches boundary |
| 345 | * conditions. |
| 346 | */ |
| 347 | if (ib && pageout_count < vm_pageout_page_count) |
| 348 | goto more; |
| 349 | |
| 350 | vm_object_drop(object); |
| 351 | |
| 352 | /* |
| 353 | * we allow reads during pageouts... |
| 354 | */ |
| 355 | return vm_pageout_flush(&mc[page_base], pageout_count, 0); |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | * vm_pageout_flush() - launder the given pages |
| 360 | * |
| 361 | * The given pages are laundered. Note that we set up for the start of |
| 362 | * I/O (i.e. busy the page), mark it read-only, and bump the object |
| 363 | * reference count all in here rather than in the parent. If we want |
| 364 | * the parent to do more sophisticated things we may have to change |
| 365 | * the ordering. |
| 366 | * |
| 367 | * The pages in the array must be busied by the caller and will be |
| 368 | * unbusied by this function. |
| 369 | */ |
| 370 | int |
| 371 | vm_pageout_flush(vm_page_t *mc, int count, int flags) |
| 372 | { |
| 373 | vm_object_t object; |
| 374 | int pageout_status[count]; |
| 375 | int numpagedout = 0; |
| 376 | int i; |
| 377 | |
| 378 | /* |
| 379 | * Initiate I/O. Bump the vm_page_t->busy counter. |
| 380 | */ |
| 381 | for (i = 0; i < count; i++) { |
| 382 | KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, |
| 383 | ("vm_pageout_flush page %p index %d/%d: partially " |
| 384 | "invalid page", mc[i], i, count)); |
| 385 | vm_page_io_start(mc[i]); |
| 386 | } |
| 387 | |
| 388 | /* |
| 389 | * We must make the pages read-only. This will also force the |
| 390 | * modified bit in the related pmaps to be cleared. The pager |
| 391 | * cannot clear the bit for us since the I/O completion code |
| 392 | * typically runs from an interrupt. The act of making the page |
| 393 | * read-only handles the case for us. |
| 394 | * |
| 395 | * Then we can unbusy the pages; we still hold a reference by virtue |
| 396 | * of our soft-busy. |
| 397 | */ |
| 398 | for (i = 0; i < count; i++) { |
| 399 | vm_page_protect(mc[i], VM_PROT_READ); |
| 400 | vm_page_wakeup(mc[i]); |
| 401 | } |
| 402 | |
| 403 | object = mc[0]->object; |
| 404 | vm_object_pip_add(object, count); |
| 405 | |
| 406 | vm_pager_put_pages(object, mc, count, |
| 407 | (flags | ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)), |
| 408 | pageout_status); |
| 409 | |
| 410 | for (i = 0; i < count; i++) { |
| 411 | vm_page_t mt = mc[i]; |
| 412 | |
| 413 | switch (pageout_status[i]) { |
| 414 | case VM_PAGER_OK: |
| 415 | numpagedout++; |
| 416 | break; |
| 417 | case VM_PAGER_PEND: |
| 418 | numpagedout++; |
| 419 | break; |
| 420 | case VM_PAGER_BAD: |
| 421 | /* |
| 422 | * Page outside of range of object. Right now we |
| 423 | * essentially lose the changes by pretending it |
| 424 | * worked. |
| 425 | */ |
| 426 | vm_page_busy_wait(mt, FALSE, "pgbad"); |
| 427 | pmap_clear_modify(mt); |
| 428 | vm_page_undirty(mt); |
| 429 | vm_page_wakeup(mt); |
| 430 | break; |
| 431 | case VM_PAGER_ERROR: |
| 432 | case VM_PAGER_FAIL: |
| 433 | /* |
| 434 | * A page typically cannot be paged out when we |
| 435 | * have run out of swap. We leave the page |
| 436 | * marked inactive and will try to page it out |
| 437 | * again later. |
| 438 | * |
| 439 | * Starvation of the active page list is used to |
| 440 | * determine when the system is massively memory |
| 441 | * starved. |
| 442 | */ |
| 443 | break; |
| 444 | case VM_PAGER_AGAIN: |
| 445 | break; |
| 446 | } |
| 447 | |
| 448 | /* |
| 449 | * If the operation is still going, leave the page busy to |
| 450 | * block all other accesses. Also, leave the paging in |
| 451 | * progress indicator set so that we don't attempt an object |
| 452 | * collapse. |
| 453 | * |
| 454 | * For any pages which have completed synchronously, |
| 455 | * deactivate the page if we are under a severe deficit. |
| 456 | * Do not try to enter them into the cache, though; they |
| 457 | * might still be read-heavy. |
| 458 | */ |
| 459 | if (pageout_status[i] != VM_PAGER_PEND) { |
| 460 | vm_page_busy_wait(mt, FALSE, "pgouw"); |
| 461 | if (vm_page_count_severe()) |
| 462 | vm_page_deactivate(mt); |
| 463 | #if 0 |
| 464 | if (!vm_page_count_severe() || !vm_page_try_to_cache(mt)) |
| 465 | vm_page_protect(mt, VM_PROT_READ); |
| 466 | #endif |
| 467 | vm_page_io_finish(mt); |
| 468 | vm_page_wakeup(mt); |
| 469 | vm_object_pip_wakeup(object); |
| 470 | } |
| 471 | } |
| 472 | return numpagedout; |
| 473 | } |
| 474 | |
| 475 | #if !defined(NO_SWAPPING) |
| 476 | /* |
| 477 | * Deactivate enough pages to satisfy the inactive target |
| 478 | * requirements, or, if vm_page_proc_limit is set, |
| 479 | * deactivate all of the pages in the object and its |
| 480 | * backing_objects. |
| 481 | * |
| 482 | * The map must be locked. |
| 483 | * The caller must hold the vm_object. |
| 484 | */ |
| 485 | static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *); |
| 486 | |
| 487 | static void |
| 488 | vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object, |
| 489 | vm_pindex_t desired, int map_remove_only) |
| 490 | { |
| 491 | struct rb_vm_page_scan_info info; |
| 492 | vm_object_t lobject; |
| 493 | vm_object_t tobject; |
| 494 | int remove_mode; |
| 495 | |
| 496 | ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); |
| 497 | lobject = object; |
| 498 | |
| 499 | while (lobject) { |
| 500 | if (pmap_resident_count(vm_map_pmap(map)) <= desired) |
| 501 | break; |
| 502 | if (lobject->type == OBJT_DEVICE || lobject->type == OBJT_PHYS) |
| 503 | break; |
| 504 | if (lobject->paging_in_progress) |
| 505 | break; |
| 506 | |
| 507 | remove_mode = map_remove_only; |
| 508 | if (lobject->shadow_count > 1) |
| 509 | remove_mode = 1; |
| 510 | |
| 511 | /* |
| 512 | * Scan the object's entire memory queue. We hold the |
| 513 | * object's token so the scan should not race anything. |
| 514 | */ |
| 515 | info.limit = remove_mode; |
| 516 | info.map = map; |
| 517 | info.desired = desired; |
| 518 | vm_page_rb_tree_RB_SCAN(&lobject->rb_memq, NULL, |
| 519 | vm_pageout_object_deactivate_pages_callback, |
| 520 | &info |
| 521 | ); |
| 522 | while ((tobject = lobject->backing_object) != NULL) { |
| 523 | KKASSERT(tobject != object); |
| 524 | vm_object_hold(tobject); |
| 525 | if (tobject == lobject->backing_object) |
| 526 | break; |
| 527 | vm_object_drop(tobject); |
| 528 | } |
| 529 | if (lobject != object) { |
| 530 | vm_object_lock_swap(); |
| 531 | vm_object_drop(lobject); |
| 532 | } |
| 533 | lobject = tobject; |
| 534 | } |
| 535 | if (lobject != object) |
| 536 | vm_object_drop(lobject); |
| 537 | } |
| 538 | |
| 539 | /* |
| 540 | * The caller must hold the vm_object. |
| 541 | */ |
| 542 | static int |
| 543 | vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data) |
| 544 | { |
| 545 | struct rb_vm_page_scan_info *info = data; |
| 546 | int actcount; |
| 547 | |
| 548 | if (pmap_resident_count(vm_map_pmap(info->map)) <= info->desired) { |
| 549 | return(-1); |
| 550 | } |
| 551 | mycpu->gd_cnt.v_pdpages++; |
| 552 | |
| 553 | if (vm_page_busy_try(p, TRUE)) |
| 554 | return(0); |
| 555 | if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) { |
| 556 | vm_page_wakeup(p); |
| 557 | return(0); |
| 558 | } |
| 559 | if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) { |
| 560 | vm_page_wakeup(p); |
| 561 | return(0); |
| 562 | } |
| 563 | |
| 564 | actcount = pmap_ts_referenced(p); |
| 565 | if (actcount) { |
| 566 | vm_page_flag_set(p, PG_REFERENCED); |
| 567 | } else if (p->flags & PG_REFERENCED) { |
| 568 | actcount = 1; |
| 569 | } |
| 570 | |
| 571 | vm_page_and_queue_spin_lock(p); |
| 572 | if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) { |
| 573 | vm_page_and_queue_spin_unlock(p); |
| 574 | vm_page_activate(p); |
| 575 | p->act_count += actcount; |
| 576 | vm_page_flag_clear(p, PG_REFERENCED); |
| 577 | } else if (p->queue - p->pc == PQ_ACTIVE) { |
| 578 | if ((p->flags & PG_REFERENCED) == 0) { |
| 579 | p->act_count -= min(p->act_count, ACT_DECLINE); |
| 580 | if (!info->limit && |
| 581 | (vm_pageout_algorithm || (p->act_count == 0))) { |
| 582 | vm_page_and_queue_spin_unlock(p); |
| 583 | vm_page_protect(p, VM_PROT_NONE); |
| 584 | vm_page_deactivate(p); |
| 585 | } else { |
| 586 | TAILQ_REMOVE(&vm_page_queues[p->queue].pl, |
| 587 | p, pageq); |
| 588 | TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl, |
| 589 | p, pageq); |
| 590 | vm_page_and_queue_spin_unlock(p); |
| 591 | } |
| 592 | } else { |
| 593 | vm_page_and_queue_spin_unlock(p); |
| 594 | vm_page_activate(p); |
| 595 | vm_page_flag_clear(p, PG_REFERENCED); |
| 596 | |
| 597 | vm_page_and_queue_spin_lock(p); |
| 598 | if (p->queue - p->pc == PQ_ACTIVE) { |
| 599 | if (p->act_count < (ACT_MAX - ACT_ADVANCE)) |
| 600 | p->act_count += ACT_ADVANCE; |
| 601 | TAILQ_REMOVE(&vm_page_queues[p->queue].pl, |
| 602 | p, pageq); |
| 603 | TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl, |
| 604 | p, pageq); |
| 605 | } |
| 606 | vm_page_and_queue_spin_unlock(p); |
| 607 | } |
| 608 | } else if (p->queue - p->pc == PQ_INACTIVE) { |
| 609 | vm_page_and_queue_spin_unlock(p); |
| 610 | vm_page_protect(p, VM_PROT_NONE); |
| 611 | } else { |
| 612 | vm_page_and_queue_spin_unlock(p); |
| 613 | } |
| 614 | vm_page_wakeup(p); |
| 615 | return(0); |
| 616 | } |
| 617 | |
| 618 | /* |
| 619 | * Deactivate some number of pages in a map; try to do it fairly, but |
| 620 | * that is really hard to do. |
| 621 | */ |
| 622 | static void |
| 623 | vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t desired) |
| 624 | { |
| 625 | vm_map_entry_t tmpe; |
| 626 | vm_object_t obj, bigobj; |
| 627 | int nothingwired; |
| 628 | |
| 629 | if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT)) { |
| 630 | return; |
| 631 | } |
| 632 | |
| 633 | bigobj = NULL; |
| 634 | nothingwired = TRUE; |
| 635 | |
| 636 | /* |
| 637 | * First, search out the biggest object and try to free pages from |
| 638 | * it. |
| 639 | */ |
| 640 | tmpe = map->header.next; |
| 641 | while (tmpe != &map->header) { |
| 642 | switch(tmpe->maptype) { |
| 643 | case VM_MAPTYPE_NORMAL: |
| 644 | case VM_MAPTYPE_VPAGETABLE: |
| 645 | obj = tmpe->object.vm_object; |
| 646 | if ((obj != NULL) && (obj->shadow_count <= 1) && |
| 647 | ((bigobj == NULL) || |
| 648 | (bigobj->resident_page_count < obj->resident_page_count))) { |
| 649 | bigobj = obj; |
| 650 | } |
| 651 | break; |
| 652 | default: |
| 653 | break; |
| 654 | } |
| 655 | if (tmpe->wired_count > 0) |
| 656 | nothingwired = FALSE; |
| 657 | tmpe = tmpe->next; |
| 658 | } |
| 659 | |
| 660 | if (bigobj) { |
| 661 | vm_object_hold(bigobj); |
| 662 | vm_pageout_object_deactivate_pages(map, bigobj, desired, 0); |
| 663 | vm_object_drop(bigobj); |
| 664 | } |
| 665 | |
| 666 | /* |
| 667 | * Next, hunt around for other pages to deactivate. We actually |
| 668 | * do this search sort of wrong -- .text first is not the best idea. |
| 669 | */ |
| 670 | tmpe = map->header.next; |
| 671 | while (tmpe != &map->header) { |
| 672 | if (pmap_resident_count(vm_map_pmap(map)) <= desired) |
| 673 | break; |
| 674 | switch(tmpe->maptype) { |
| 675 | case VM_MAPTYPE_NORMAL: |
| 676 | case VM_MAPTYPE_VPAGETABLE: |
| 677 | obj = tmpe->object.vm_object; |
| 678 | if (obj) { |
| 679 | vm_object_hold(obj); |
| 680 | vm_pageout_object_deactivate_pages(map, obj, desired, 0); |
| 681 | vm_object_drop(obj); |
| 682 | } |
| 683 | break; |
| 684 | default: |
| 685 | break; |
| 686 | } |
| 687 | tmpe = tmpe->next; |
| 688 | } |
| 689 | |
| 690 | /* |
| 691 | * Remove all mappings if a process is swapped out; this will free page |
| 692 | * table pages. |
| 693 | */ |
| 694 | if (desired == 0 && nothingwired) |
| 695 | pmap_remove(vm_map_pmap(map), |
| 696 | VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); |
| 697 | vm_map_unlock(map); |
| 698 | } |
| 699 | #endif |
| 700 | |
| 701 | /* |
| 702 | * Called when the pageout scan wants to free a page. We no longer |
| 703 | * try to cycle the vm_object here with a reference & dealloc, which can |
| 704 | * cause a non-trivial object collapse in a critical path. |
| 705 | * |
| 706 | * It is unclear why we cycled the ref_count in the past; perhaps to try |
| 707 | * to optimize shadow chain collapses, but I don't quite see why it would |
| 708 | * be necessary. An OBJ_DEAD object should terminate any and all vm_pages |
| 709 | * synchronously and not have to be kick-started. |
| 710 | */ |
| 711 | static void |
| 712 | vm_pageout_page_free(vm_page_t m) |
| 713 | { |
| 714 | vm_page_protect(m, VM_PROT_NONE); |
| 715 | vm_page_free(m); |
| 716 | } |
| 717 | |
| 718 | /* |
| 719 | * vm_pageout_scan does the dirty work for the pageout daemon. |
| 720 | */ |
| 721 | struct vm_pageout_scan_info { |
| 722 | struct proc *bigproc; |
| 723 | vm_offset_t bigsize; |
| 724 | }; |
| 725 | |
| 726 | static int vm_pageout_scan_callback(struct proc *p, void *data); |
| 727 | |
| 728 | static int |
| 729 | vm_pageout_scan_inactive(int pass, int q, int avail_shortage, |
| 730 | int *vnodes_skippedp) |
| 731 | { |
| 732 | vm_page_t m; |
| 733 | struct vm_page marker; |
| 734 | struct vnode *vpfailed; /* warning, allowed to be stale */ |
| 735 | int maxscan; |
| 736 | int delta = 0; |
| 737 | vm_object_t object; |
| 738 | int actcount; |
| 739 | int maxlaunder; |
| 740 | |
| 741 | /* |
| 742 | * Start scanning the inactive queue for pages we can move to the |
| 743 | * cache or free. The scan will stop when the target is reached or |
| 744 | * we have scanned the entire inactive queue. Note that m->act_count |
| 745 | * is not used to form decisions for the inactive queue, only for the |
| 746 | * active queue. |
| 747 | * |
| 748 | * maxlaunder limits the number of dirty pages we flush per scan. |
| 749 | * For most systems a smaller value (16 or 32) is more robust under |
| 750 | * extreme memory and disk pressure because any unnecessary writes |
| 751 | * to disk can result in extreme performance degradation. However, |
| 752 | * systems with excessive dirty pages (especially when MAP_NOSYNC is |
| 753 | * used) will die horribly with limited laundering. If the pageout |
| 754 | * daemon cannot clean enough pages in the first pass, we let it go |
| 755 | * all out in succeeding passes. |
| 756 | */ |
| 757 | if ((maxlaunder = vm_max_launder) <= 1) |
| 758 | maxlaunder = 1; |
| 759 | if (pass) |
| 760 | maxlaunder = 10000; |
| 761 | |
| 762 | /* |
| 763 | * Initialize our marker |
| 764 | */ |
| 765 | bzero(&marker, sizeof(marker)); |
| 766 | marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; |
| 767 | marker.queue = PQ_INACTIVE + q; |
| 768 | marker.pc = q; |
| 769 | marker.wire_count = 1; |
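|  | /* |
|  | * The marker is a dummy vm_page linked into the queue and advanced |
|  | * past each page we examine, so the scan can drop the queue spinlock |
|  | * (and potentially block) yet still resume at the correct position. |
|  | */ |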
| 770 | |
| 771 | /* |
| 772 | * Inactive queue scan. |
| 773 | * |
| 774 | * NOTE: The vm_page must be spinlocked before the queue to avoid |
| 775 | * deadlocks, so it is easiest to simply iterate the loop |
| 776 | * with the queue unlocked at the top. |
| 777 | */ |
| 778 | vpfailed = NULL; |
| 779 | |
| 780 | vm_page_queues_spin_lock(PQ_INACTIVE + q); |
| 781 | TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); |
| 782 | maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; |
| 783 | vm_page_queues_spin_unlock(PQ_INACTIVE + q); |
| 784 | |
| 785 | while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && |
| 786 | maxscan-- > 0 && avail_shortage - delta > 0) |
| 787 | { |
| 788 | vm_page_and_queue_spin_lock(m); |
| 789 | if (m != TAILQ_NEXT(&marker, pageq)) { |
| 790 | vm_page_and_queue_spin_unlock(m); |
| 791 | ++maxscan; |
| 792 | continue; |
| 793 | } |
| 794 | KKASSERT(m->queue - m->pc == PQ_INACTIVE); |
| 795 | TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, |
| 796 | &marker, pageq); |
| 797 | TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, |
| 798 | &marker, pageq); |
| 799 | mycpu->gd_cnt.v_pdpages++; |
| 800 | |
| 801 | /* |
| 802 | * Skip marker pages |
| 803 | */ |
| 804 | if (m->flags & PG_MARKER) { |
| 805 | vm_page_and_queue_spin_unlock(m); |
| 806 | continue; |
| 807 | } |
| 808 | |
| 809 | /* |
| 810 | * Try to busy the page. Don't mess with pages which are |
| 811 | * already busy or reorder them in the queue. |
| 812 | */ |
| 813 | if (vm_page_busy_try(m, TRUE)) { |
| 814 | vm_page_and_queue_spin_unlock(m); |
| 815 | continue; |
| 816 | } |
| 817 | vm_page_and_queue_spin_unlock(m); |
| 818 | KKASSERT(m->queue - m->pc == PQ_INACTIVE); |
| 819 | |
| 820 | lwkt_yield(); |
| 821 | |
| 822 | /* |
| 823 | * The page has been successfully busied and is now no |
| 824 | * longer spinlocked. The queue is no longer spinlocked |
| 825 | * either. |
| 826 | */ |
| 827 | |
| 828 | /* |
| 829 | * It is possible for a page to be busied ad-hoc (e.g. the |
| 830 | * pmap_collect() code) and wired and race against the |
| 831 | * allocation of a new page. vm_page_alloc() may be forced |
| 832 | * to deactivate the wired page in which case it winds up |
| 833 | * on the inactive queue and must be handled here. We |
| 834 | * correct the problem simply by unqueuing the page. |
| 835 | */ |
| 836 | if (m->wire_count) { |
| 837 | vm_page_unqueue_nowakeup(m); |
| 838 | vm_page_wakeup(m); |
| 839 | kprintf("WARNING: pagedaemon: wired page on " |
| 840 | "inactive queue %p\n", m); |
| 841 | continue; |
| 842 | } |
| 843 | |
| 844 | /* |
| 845 | * A held page may be undergoing I/O, so skip it. |
| 846 | */ |
| 847 | if (m->hold_count) { |
| 848 | vm_page_and_queue_spin_lock(m); |
| 849 | if (m->queue - m->pc == PQ_INACTIVE) { |
| 850 | TAILQ_REMOVE( |
| 851 | &vm_page_queues[PQ_INACTIVE + q].pl, |
| 852 | m, pageq); |
| 853 | TAILQ_INSERT_TAIL( |
| 854 | &vm_page_queues[PQ_INACTIVE + q].pl, |
| 855 | m, pageq); |
| 856 | } |
| 857 | vm_page_and_queue_spin_unlock(m); |
| 858 | ++vm_swapcache_inactive_heuristic; |
| 859 | vm_page_wakeup(m); |
| 860 | continue; |
| 861 | } |
| 862 | |
| 863 | if (m->object->ref_count == 0) { |
| 864 | /* |
| 865 | * If the object is not being used, we ignore previous |
| 866 | * references. |
| 867 | */ |
| 868 | vm_page_flag_clear(m, PG_REFERENCED); |
| 869 | pmap_clear_reference(m); |
| 870 | /* fall through to end */ |
| 871 | } else if (((m->flags & PG_REFERENCED) == 0) && |
| 872 | (actcount = pmap_ts_referenced(m))) { |
| 873 | /* |
| 874 | * Otherwise, if the page has been referenced while |
| 875 | * in the inactive queue, we bump the "activation |
| 876 | * count" upwards, making it less likely that the |
| 877 | * page will be added back to the inactive queue |
| 878 | * prematurely again. Here we check the page tables |
| 879 | * (or emulated bits, if any), given the upper level |
| 880 | * VM system not knowing anything about existing |
| 881 | * references. |
| 882 | */ |
| 883 | vm_page_activate(m); |
| 884 | m->act_count += (actcount + ACT_ADVANCE); |
| 885 | vm_page_wakeup(m); |
| 886 | continue; |
| 887 | } |
| 888 | |
| 889 | /* |
| 890 | * (m) is still busied. |
| 891 | * |
| 892 | * If the upper level VM system knows about any page |
| 893 | * references, we activate the page. We also set the |
| 894 | * "activation count" higher than normal so that we will less |
| 895 | * likely place pages back onto the inactive queue again. |
| 896 | */ |
| 897 | if ((m->flags & PG_REFERENCED) != 0) { |
| 898 | vm_page_flag_clear(m, PG_REFERENCED); |
| 899 | actcount = pmap_ts_referenced(m); |
| 900 | vm_page_activate(m); |
| 901 | m->act_count += (actcount + ACT_ADVANCE + 1); |
| 902 | vm_page_wakeup(m); |
| 903 | continue; |
| 904 | } |
| 905 | |
| 906 | /* |
| 907 | * If the upper level VM system doesn't know anything about |
| 908 | * the page being dirty, we have to check for it again. As |
| 909 | * far as the VM code knows, any partially dirty pages are |
| 910 | * fully dirty. |
| 911 | * |
| 912 | * Pages marked PG_WRITEABLE may be mapped into the user |
| 913 | * address space of a process running on another cpu. A |
| 914 | * user process (without holding the MP lock) running on |
| 915 | * another cpu may be able to touch the page while we are |
| 916 | * trying to remove it. vm_page_cache() will handle this |
| 917 | * case for us. |
| 918 | */ |
| 919 | if (m->dirty == 0) { |
| 920 | vm_page_test_dirty(m); |
| 921 | } else { |
| 922 | vm_page_dirty(m); |
| 923 | } |
| 924 | |
| 925 | if (m->valid == 0) { |
| 926 | /* |
| 927 | * Invalid pages can be easily freed |
| 928 | */ |
| 929 | vm_pageout_page_free(m); |
| 930 | mycpu->gd_cnt.v_dfree++; |
| 931 | ++delta; |
| 932 | } else if (m->dirty == 0) { |
| 933 | /* |
| 934 | * Clean pages can be placed onto the cache queue. |
| 935 | * This effectively frees them. |
| 936 | */ |
| 937 | vm_page_cache(m); |
| 938 | ++delta; |
| 939 | } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { |
| 940 | /* |
| 941 | * Dirty pages need to be paged out, but flushing |
| 942 | * a page is extremely expensive versus freeing |
| 943 | * a clean page. Rather than artificially limiting |
| 944 | * the number of pages we can flush, we instead give |
| 945 | * dirty pages extra priority on the inactive queue |
| 946 | * by forcing them to be cycled through the queue |
| 947 | * twice before being flushed, after which the |
| 948 | * (now clean) page will cycle through once more |
| 949 | * before being freed. This significantly extends |
| 950 | * the thrash point for a heavily loaded machine. |
| 951 | */ |
| 952 | vm_page_flag_set(m, PG_WINATCFLS); |
| 953 | vm_page_and_queue_spin_lock(m); |
| 954 | if (m->queue - m->pc == PQ_INACTIVE) { |
| 955 | TAILQ_REMOVE( |
| 956 | &vm_page_queues[PQ_INACTIVE + q].pl, |
| 957 | m, pageq); |
| 958 | TAILQ_INSERT_TAIL( |
| 959 | &vm_page_queues[PQ_INACTIVE + q].pl, |
| 960 | m, pageq); |
| 961 | } |
| 962 | vm_page_and_queue_spin_unlock(m); |
| 963 | ++vm_swapcache_inactive_heuristic; |
| 964 | vm_page_wakeup(m); |
| 965 | } else if (maxlaunder > 0) { |
| 966 | /* |
| 967 | * We always want to try to flush some dirty pages if |
| 968 | * we encounter them, to keep the system stable. |
| 969 | * Normally this number is small, but under extreme |
| 970 | * pressure where there are insufficient clean pages |
| 971 | * on the inactive queue, we may have to go all out. |
| 972 | */ |
| 973 | int swap_pageouts_ok; |
| 974 | struct vnode *vp = NULL; |
| 975 | |
| 976 | object = m->object; |
| 977 | |
| 978 | if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { |
| 979 | swap_pageouts_ok = 1; |
| 980 | } else { |
| 981 | swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); |
| 982 | swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && |
| 983 | vm_page_count_min(0)); |
| 984 | |
| 985 | } |
| 986 | |
| 987 | /* |
| 988 | * We don't bother paging objects that are "dead". |
| 989 | * Those objects are in a "rundown" state. |
| 990 | */ |
| 991 | if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { |
| 992 | vm_page_and_queue_spin_lock(m); |
| 993 | if (m->queue - m->pc == PQ_INACTIVE) { |
| 994 | TAILQ_REMOVE( |
| 995 | &vm_page_queues[PQ_INACTIVE + q].pl, |
| 996 | m, pageq); |
| 997 | TAILQ_INSERT_TAIL( |
| 998 | &vm_page_queues[PQ_INACTIVE + q].pl, |
| 999 | m, pageq); |
| 1000 | } |
| 1001 | vm_page_and_queue_spin_unlock(m); |
| 1002 | ++vm_swapcache_inactive_heuristic; |
| 1003 | vm_page_wakeup(m); |
| 1004 | continue; |
| 1005 | } |
| 1006 | |
| 1007 | /* |
| 1008 | * (m) is still busied. |
| 1009 | * |
| 1010 | * The object is already known NOT to be dead. It |
| 1011 | * is possible for the vget() to block the whole |
| 1012 | * pageout daemon, but the new low-memory handling |
| 1013 | * code should prevent it. |
| 1014 | * |
| 1015 | * The previous code skipped locked vnodes and, worse, |
| 1016 | * reordered pages in the queue. This results in |
| 1017 | * completely non-deterministic operation because, |
| 1018 | * quite often, a vm_fault has initiated an I/O and |
| 1019 | * is holding a locked vnode at just the point where |
| 1020 | * the pageout daemon is woken up. |
| 1021 | * |
| 1022 | * We can't wait forever for the vnode lock; we might |
| 1023 | * deadlock due to a vn_read() getting stuck in |
| 1024 | * vm_wait while holding this vnode. We skip the |
| 1025 | * vnode if we can't get it in a reasonable amount |
| 1026 | * of time. |
| 1027 | * |
| 1028 | * vpfailed is used to (try to) avoid the case where |
| 1029 | * a large number of pages are associated with a |
| 1030 | * locked vnode, which could cause the pageout daemon |
| 1031 | * to stall for an excessive amount of time. |
| 1032 | */ |
| 1033 | if (object->type == OBJT_VNODE) { |
| 1034 | int flags; |
| 1035 | |
| 1036 | vp = object->handle; |
| 1037 | flags = LK_EXCLUSIVE | LK_NOOBJ; |
| 1038 | if (vp == vpfailed) |
| 1039 | flags |= LK_NOWAIT; |
| 1040 | else |
| 1041 | flags |= LK_TIMELOCK; |
| 1042 | vm_page_hold(m); |
| 1043 | vm_page_wakeup(m); |
| 1044 | |
| 1045 | /* |
| 1046 | * We have unbusied (m) temporarily so we can |
| 1047 | * acquire the vp lock without deadlocking. |
| 1048 | * (m) is held to prevent destruction. |
| 1049 | */ |
| 1050 | if (vget(vp, flags) != 0) { |
| 1051 | vpfailed = vp; |
| 1052 | ++pageout_lock_miss; |
| 1053 | if (object->flags & OBJ_MIGHTBEDIRTY) |
| 1054 | ++*vnodes_skippedp; |
| 1055 | vm_page_unhold(m); |
| 1056 | continue; |
| 1057 | } |
| 1058 | |
| 1059 | /* |
| 1060 | * The page might have been moved to another |
| 1061 | * queue during potential blocking in vget() |
| 1062 | * above. The page might have been freed and |
| 1063 | * reused for another vnode. The object might |
| 1064 | * have been reused for another vnode. |
| 1065 | */ |
| 1066 | if (m->queue - m->pc != PQ_INACTIVE || |
| 1067 | m->object != object || |
| 1068 | object->handle != vp) { |
| 1069 | if (object->flags & OBJ_MIGHTBEDIRTY) |
| 1070 | ++*vnodes_skippedp; |
| 1071 | vput(vp); |
| 1072 | vm_page_unhold(m); |
| 1073 | continue; |
| 1074 | } |
| 1075 | |
| 1076 | /* |
| 1077 | * The page may have been busied during the |
| 1078 | * blocking in vget() above. We don't move the |
| 1079 | * page back onto the end of the queue so that |
| 1080 | * statistics are more correct if we don't. |
| 1081 | */ |
| 1082 | if (vm_page_busy_try(m, TRUE)) { |
| 1083 | vput(vp); |
| 1084 | vm_page_unhold(m); |
| 1085 | continue; |
| 1086 | } |
| 1087 | vm_page_unhold(m); |
| 1088 | |
| 1089 | /* |
| 1090 | * (m) is busied again |
| 1091 | * |
| 1092 | * We own the busy bit and remove our hold |
| 1093 | * bit. If the page is still held it |
| 1094 | * might be undergoing I/O, so skip it. |
| 1095 | */ |
| 1096 | if (m->hold_count) { |
| 1097 | vm_page_and_queue_spin_lock(m); |
| 1098 | if (m->queue - m->pc == PQ_INACTIVE) { |
| 1099 | TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, m, pageq); |
| 1100 | TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE + q].pl, m, pageq); |
| 1101 | } |
| 1102 | vm_page_and_queue_spin_unlock(m); |
| 1103 | ++vm_swapcache_inactive_heuristic; |
| 1104 | if (object->flags & OBJ_MIGHTBEDIRTY) |
| 1105 | ++*vnodes_skippedp; |
| 1106 | vm_page_wakeup(m); |
| 1107 | vput(vp); |
| 1108 | continue; |
| 1109 | } |
| 1110 | /* (m) is left busied as we fall through */ |
| 1111 | } |
| 1112 | |
| 1113 | /* |
| 1114 | * The page is busy and not held here. |
| 1115 | * |
| 1116 | * If a page is dirty, then it is either being washed |
| 1117 | * (but not yet cleaned) or it is still in the |
| 1118 | * laundry. If it is still in the laundry, then we |
| 1119 | * start the cleaning operation. |
| 1120 | * |
| 1121 | * decrement inactive_shortage on success to account |
| 1122 | * for the (future) cleaned page. Otherwise we |
| 1123 | * could wind up laundering or cleaning too many |
| 1124 | * pages. |
| 1125 | */ |
| 1126 | if (vm_pageout_clean(m) != 0) { |
| 1127 | ++delta; |
| 1128 | --maxlaunder; |
| 1129 | } |
| 1130 | /* clean ate busy, page no longer accessible */ |
| 1131 | if (vp != NULL) |
| 1132 | vput(vp); |
| 1133 | } else { |
| 1134 | vm_page_wakeup(m); |
| 1135 | } |
| 1136 | } |
| 1137 | vm_page_queues_spin_lock(PQ_INACTIVE + q); |
| 1138 | TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); |
| 1139 | vm_page_queues_spin_unlock(PQ_INACTIVE + q); |
| 1140 | return (delta); |
| 1141 | } |
| 1142 | |
| 1143 | static int |
| 1144 | vm_pageout_scan_active(int pass, int q, |
| 1145 | int avail_shortage, int inactive_shortage, |
| 1146 | int *recycle_countp) |
| 1147 | { |
| 1148 | struct vm_page marker; |
| 1149 | vm_page_t m; |
| 1150 | int actcount; |
| 1151 | int delta = 0; |
| 1152 | int maxscan; |
| 1153 | |
| 1154 | /* |
| 1155 | * We want to move pages from the active queue to the inactive |
| 1156 | * queue to get the inactive queue to the inactive target. If |
| 1157 | * we still have a page shortage from above we try to directly free |
| 1158 | * clean pages instead of moving them. |
| 1159 | * |
| 1160 | * If we do still have a shortage we keep track of the number of |
| 1161 | * pages we free or cache (recycle_count) as a measure of thrashing |
| 1162 | * between the active and inactive queues. |
| 1163 | * |
| 1164 | * If we were able to completely satisfy the free+cache targets |
| 1165 | * from the inactive pool we limit the number of pages we move |
| 1166 | * from the active pool to the inactive pool to 2x the pages we |
| 1167 | * had removed from the inactive pool (with a minimum of 1/5 the |
| 1168 | * inactive target). If we were not able to completely satisfy |
| 1169 | * the free+cache targets we go for the whole target aggressively. |
| 1170 | * |
| 1171 | * NOTE: Both variables can end up negative. |
| 1172 | * NOTE: We are still in a critical section. |
| 1173 | */ |
| 1174 | |
| 1175 | bzero(&marker, sizeof(marker)); |
| 1176 | marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; |
| 1177 | marker.queue = PQ_ACTIVE + q; |
| 1178 | marker.pc = q; |
| 1179 | marker.wire_count = 1; |
| 1180 | |
| 1181 | vm_page_queues_spin_lock(PQ_ACTIVE + q); |
| 1182 | TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); |
| 1183 | maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; |
| 1184 | vm_page_queues_spin_unlock(PQ_ACTIVE + q); |
| 1185 | |
| 1186 | while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && |
| 1187 | maxscan-- > 0 && (avail_shortage - delta > 0 || |
| 1188 | inactive_shortage > 0)) |
| 1189 | { |
| 1190 | vm_page_and_queue_spin_lock(m); |
| 1191 | if (m != TAILQ_NEXT(&marker, pageq)) { |
| 1192 | vm_page_and_queue_spin_unlock(m); |
| 1193 | ++maxscan; |
| 1194 | continue; |
| 1195 | } |
| 1196 | KKASSERT(m->queue - m->pc == PQ_ACTIVE); |
| 1197 | TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, |
| 1198 | &marker, pageq); |
| 1199 | TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, |
| 1200 | &marker, pageq); |
| 1201 | |
| 1202 | /* |
| 1203 | * Skip marker pages |
| 1204 | */ |
| 1205 | if (m->flags & PG_MARKER) { |
| 1206 | vm_page_and_queue_spin_unlock(m); |
| 1207 | continue; |
| 1208 | } |
| 1209 | |
| 1210 | /* |
| 1211 | * Try to busy the page. Don't mess with pages which are |
| 1212 | * already busy or reorder them in the queue. |
| 1213 | */ |
| 1214 | if (vm_page_busy_try(m, TRUE)) { |
| 1215 | vm_page_and_queue_spin_unlock(m); |
| 1216 | continue; |
| 1217 | } |
| 1218 | |
| 1219 | /* |
| 1220 | * Don't deactivate pages that are held, even if we can |
| 1221 | * busy them. (XXX why not?) |
| 1222 | */ |
| 1223 | if (m->hold_count != 0) { |
| 1224 | TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, |
| 1225 | m, pageq); |
| 1226 | TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE + q].pl, |
| 1227 | m, pageq); |
| 1228 | vm_page_and_queue_spin_unlock(m); |
| 1229 | vm_page_wakeup(m); |
| 1230 | continue; |
| 1231 | } |
| 1232 | vm_page_and_queue_spin_unlock(m); |
| 1233 | lwkt_yield(); |
| 1234 | |
| 1235 | /* |
| 1236 | * The page has been successfully busied and the page and |
| 1237 | * queue are no longer locked. |
| 1238 | */ |
| 1239 | |
| 1240 | /* |
| 1241 | * The count for pagedaemon pages is done after checking the |
| 1242 | * page for eligibility... |
| 1243 | */ |
| 1244 | mycpu->gd_cnt.v_pdpages++; |
| 1245 | |
| 1246 | /* |
| 1247 | * Check to see "how much" the page has been used and clear |
| 1248 | * the tracking access bits. If the object has no references |
| 1249 | * don't bother paying the expense. |
| 1250 | */ |
| 1251 | actcount = 0; |
| 1252 | if (m->object->ref_count != 0) { |
| 1253 | if (m->flags & PG_REFERENCED) |
| 1254 | ++actcount; |
| 1255 | actcount += pmap_ts_referenced(m); |
| 1256 | if (actcount) { |
| 1257 | m->act_count += ACT_ADVANCE + actcount; |
| 1258 | if (m->act_count > ACT_MAX) |
| 1259 | m->act_count = ACT_MAX; |
| 1260 | } |
| 1261 | } |
| 1262 | vm_page_flag_clear(m, PG_REFERENCED); |
| 1263 | |
| 1264 | /* |
| 1265 | * actcount is only valid if the object ref_count is non-zero. |
| 1266 | */ |
| 1267 | if (actcount && m->object->ref_count != 0) { |
| 1268 | vm_page_and_queue_spin_lock(m); |
| 1269 | if (m->queue - m->pc == PQ_ACTIVE) { |
| 1270 | TAILQ_REMOVE( |
| 1271 | &vm_page_queues[PQ_ACTIVE + q].pl, |
| 1272 | m, pageq); |
| 1273 | TAILQ_INSERT_TAIL( |
| 1274 | &vm_page_queues[PQ_ACTIVE + q].pl, |
| 1275 | m, pageq); |
| 1276 | } |
| 1277 | vm_page_and_queue_spin_unlock(m); |
| 1278 | vm_page_wakeup(m); |
| 1279 | } else { |
| 1280 | m->act_count -= min(m->act_count, ACT_DECLINE); |
| 1281 | if (vm_pageout_algorithm || |
| 1282 | m->object->ref_count == 0 || |
| 1283 | m->act_count < pass + 1 |
| 1284 | ) { |
| 1285 | /* |
| 1286 | * Deactivate the page. If we had a |
| 1287 | * shortage from our inactive scan try to |
| 1288 | * free (cache) the page instead. |
| 1289 | * |
| 1290 | * Don't just blindly cache the page if |
| 1291 | * we do not have a shortage from the |
| 1292 | * inactive scan; that could lead to |
| 1293 | * gigabytes being moved. |
| 1294 | */ |
| 1295 | --inactive_shortage; |
| 1296 | if (avail_shortage - delta > 0 || |
| 1297 | m->object->ref_count == 0) { |
| 1298 | if (avail_shortage - delta > 0) |
| 1299 | ++*recycle_countp; |
| 1300 | vm_page_protect(m, VM_PROT_NONE); |
| 1301 | if (m->dirty == 0 && |
| 1302 | avail_shortage - delta > 0) { |
| 1303 | vm_page_cache(m); |
| 1304 | } else { |
| 1305 | vm_page_deactivate(m); |
| 1306 | vm_page_wakeup(m); |
| 1307 | } |
| 1308 | } else { |
| 1309 | vm_page_deactivate(m); |
| 1310 | vm_page_wakeup(m); |
| 1311 | } |
| 1312 | ++delta; |
| 1313 | } else { |
| 1314 | vm_page_and_queue_spin_lock(m); |
| 1315 | if (m->queue - m->pc == PQ_ACTIVE) { |
| 1316 | TAILQ_REMOVE( |
| 1317 | &vm_page_queues[PQ_ACTIVE + q].pl, |
| 1318 | m, pageq); |
| 1319 | TAILQ_INSERT_TAIL( |
| 1320 | &vm_page_queues[PQ_ACTIVE + q].pl, |
| 1321 | m, pageq); |
| 1322 | } |
| 1323 | vm_page_and_queue_spin_unlock(m); |
| 1324 | vm_page_wakeup(m); |
| 1325 | } |
| 1326 | } |
| 1327 | } |
| 1328 | |
| 1329 | /* |
| 1330 | * Clean out our local marker. |
| 1331 | */ |
| 1332 | vm_page_queues_spin_lock(PQ_ACTIVE + q); |
| 1333 | TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); |
| 1334 | vm_page_queues_spin_unlock(PQ_ACTIVE + q); |
| 1335 | |
| 1336 | return (delta); |
| 1337 | } |
| 1338 | |
| 1339 | /* |
| 1340 | * The number of actually free pages can drop down to v_free_reserved; |
| 1341 | * we try to build the free count back above v_free_min. Note that |
| 1342 | * vm_paging_needed() also returns TRUE if v_free_count is not at |
| 1343 | * least v_free_min so that is the minimum we must build the free |
| 1344 | * count to. |
| 1345 | * |
| 1346 | * We use a slightly higher target to improve hysteresis, |
| 1347 | * ((v_free_target + v_free_min) / 2). Since v_free_target |
| 1348 | * is usually the same as v_cache_min, this maintains about |
| 1349 | * half as many pages in the free queue as in the cache queue, |
| 1350 | * providing pretty good pipelining for pageout operation. |
| 1351 | * |
| 1352 | * The system operator can manipulate vm.v_cache_min and |
| 1353 | * vm.v_free_target to tune the pageout daemon. Be sure |
| 1354 | * to keep vm.v_free_min < vm.v_free_target. |
| 1355 | * |
| 1356 | * Note that the original paging target is to get at least |
| 1357 | * (free_min + cache_min) into (free + cache). The slightly |
| 1358 | * higher target will shift additional pages from cache to free |
| 1359 | * without affecting the original paging target in order to |
| 1360 | * maintain better hysteresis and not have the free count always |
| 1361 | * be dead-on v_free_min. |
| 1362 | * |
| 1363 | * NOTE: we are still in a critical section. |
| 1364 | * |
| 1365 | * Pages moved from PQ_CACHE to totally free are not counted in the |
| 1366 | * pages_freed counter. |
| 1367 | */ |
| 1368 | static void |
| 1369 | vm_pageout_scan_cache(int avail_shortage, int vnodes_skipped, int recycle_count) |
| 1370 | { |
| 1371 | struct vm_pageout_scan_info info; |
| 1372 | vm_page_t m; |
| 1373 | |
| 1374 | while (vmstats.v_free_count < |
| 1375 | (vmstats.v_free_min + vmstats.v_free_target) / 2) { |
| 1376 | /* |
| 1377 | * This steals some code from vm/vm_page.c |
| 1378 | */ |
| 1379 | static int cache_rover = 0; |
| 1380 | |
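|  | /* |
|  | * cache_rover & PQ_L2_MASK picks which PQ_CACHE color to search |
|  | * first; stepping the rover by PQ_PRIME2 after each free spreads |
|  | * successive picks across the colored cache queues instead of |
|  | * draining a single one. |
|  | */ |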
| 1381 | m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK, FALSE); |
| 1382 | if (m == NULL) |
| 1383 | break; |
| 1384 | /* page is returned removed from its queue and spinlocked */ |
| 1385 | if (vm_page_busy_try(m, TRUE)) { |
| 1386 | vm_page_deactivate_locked(m); |
| 1387 | vm_page_spin_unlock(m); |
| 1388 | #ifdef INVARIANTS |
| 1389 | kprintf("Warning: busy page %p found in cache\n", m); |
| 1390 | #endif |
| 1391 | continue; |
| 1392 | } |
| 1393 | vm_page_spin_unlock(m); |
| 1394 | pagedaemon_wakeup(); |
| 1395 | lwkt_yield(); |
| 1396 | |
| 1397 | /* |
| 1398 | * Page has been successfully busied and it and its queue |
| 1399 | * are no longer spinlocked. |
| 1400 | */ |
| 1401 | if ((m->flags & PG_UNMANAGED) || |
| 1402 | m->hold_count || |
| 1403 | m->wire_count) { |
| 1404 | vm_page_deactivate(m); |
| 1405 | vm_page_wakeup(m); |
| 1406 | continue; |
| 1407 | } |
| 1408 | KKASSERT((m->flags & PG_MAPPED) == 0); |
| 1409 | KKASSERT(m->dirty == 0); |
| 1410 | cache_rover += PQ_PRIME2; |
| 1411 | vm_pageout_page_free(m); |
| 1412 | mycpu->gd_cnt.v_dfree++; |
| 1413 | } |
| 1414 | |
| 1415 | #if !defined(NO_SWAPPING) |
| 1416 | /* |
| 1417 | * Idle process swapout -- run once per second. |
| 1418 | */ |
| 1419 | if (vm_swap_idle_enabled) { |
| 1420 | static long lsec; |
| 1421 | if (time_second != lsec) { |
| 1422 | vm_pageout_req_swapout |= VM_SWAP_IDLE; |
| 1423 | vm_req_vmdaemon(); |
| 1424 | lsec = time_second; |
| 1425 | } |
| 1426 | } |
| 1427 | #endif |
| 1428 | |
| 1429 | /* |
| 1430 | * If we didn't get enough free pages and we have skipped a vnode |
| 1431 | * in a writeable object, wake up the sync daemon. Also kick off |
| 1432 | * swapout if we did not get enough free pages. |
| 1433 | */ |
| 1434 | if (vm_paging_target() > 0) { |
| 1435 | if (vnodes_skipped && vm_page_count_min(0)) |
| 1436 | speedup_syncer(); |
| 1437 | #if !defined(NO_SWAPPING) |
| 1438 | if (vm_swap_enabled && vm_page_count_target()) { |
| 1439 | vm_req_vmdaemon(); |
| 1440 | vm_pageout_req_swapout |= VM_SWAP_NORMAL; |
| 1441 | } |
| 1442 | #endif |
| 1443 | } |
| 1444 | |
| 1445 | /* |
| 1446 | * Handle catastrophic conditions. Under good conditions we should |
| 1447 | * be at the target, well beyond our minimum. If we could not even |
| 1448 | * reach our minimum the system is under heavy stress. |
| 1449 | * |
| 1450 | * Determine whether we have run out of memory. This occurs when |
| 1451 | * swap_pager_full is TRUE and the only pages left in the page |
| 1452 | * queues are dirty. We will still likely have page shortages. |
| 1453 | * |
| 1454 | * - swap_pager_full is set if insufficient swap was |
| 1455 | * available to satisfy a requested pageout. |
| 1456 | * |
| 1457 | * - the inactive queue is bloated (4 x size of active queue), |
| 1458 | * meaning it is unable to get rid of dirty pages. |
| 1459 | * |
| 1460 | * - vm_page_count_min() without counting pages recycled from the |
| 1461 | * active queue (recycle_count) means we could not recover |
| 1462 | * enough pages to meet bare minimum needs. This test only |
| 1463 | * works if the inactive queue is bloated. |
| 1464 | * |
| 1465 | * - due to a positive avail_shortage we shifted the remaining |
| 1466 | * dirty pages from the active queue to the inactive queue |
| 1467 | * trying to find clean ones to free. |
| 1468 | */ |
| 1469 | if (swap_pager_full && vm_page_count_min(recycle_count)) |
| 1470 | kprintf("Warning: system low on memory+swap!\n"); |
| 1471 | if (swap_pager_full && vm_page_count_min(recycle_count) && |
| 1472 | vmstats.v_inactive_count > vmstats.v_active_count * 4 && |
| 1473 | avail_shortage > 0) { |
| 1474 | /* |
| 1475 | * Kill something. |
| 1476 | */ |
| 1477 | info.bigproc = NULL; |
| 1478 | info.bigsize = 0; |
| 1479 | allproc_scan(vm_pageout_scan_callback, &info); |
| 1480 | if (info.bigproc != NULL) { |
| 1481 | killproc(info.bigproc, "out of swap space"); |
| 1482 | info.bigproc->p_nice = PRIO_MIN; |
| 1483 | info.bigproc->p_usched->resetpriority( |
| 1484 | FIRST_LWP_IN_PROC(info.bigproc)); |
| 1485 | wakeup(&vmstats.v_free_count); |
| 1486 | PRELE(info.bigproc); |
| 1487 | } |
| 1488 | } |
| 1489 | } |
| 1490 | |
| 1491 | /* |
| 1492 | * The caller must hold proc_token. |
| 1493 | */ |
| 1494 | static int |
| 1495 | vm_pageout_scan_callback(struct proc *p, void *data) |
| 1496 | { |
| 1497 | struct vm_pageout_scan_info *info = data; |
| 1498 | vm_offset_t size; |
| 1499 | |
| 1500 | /* |
| 1501 | * Never kill system processes or init. If we have configured swap |
| 1502 | * then try to avoid killing low-numbered pids. |
| 1503 | */ |
| 1504 | if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || |
| 1505 | ((p->p_pid < 48) && (vm_swap_size != 0))) { |
| 1506 | return (0); |
| 1507 | } |
| 1508 | |
| 1509 | /* |
| 1510 | * If the process is in a non-running type state, |
| 1511 | * don't touch it. |
| 1512 | */ |
| 1513 | if (p->p_stat != SACTIVE && p->p_stat != SSTOP) |
| 1514 | return (0); |
| 1515 | |
| 1516 | /* |
| 1517 | * Get the approximate process size. Note that anonymous pages |
| 1518 | * with backing swap will be counted twice, but there should not |
| 1519 | * be too many such pages due to the stress the VM system is |
| 1520 | * under at this point. |
| 1521 | */ |
| 1522 | size = vmspace_anonymous_count(p->p_vmspace) + |
| 1523 | vmspace_swap_count(p->p_vmspace); |
| 1524 | |
| 1525 | /* |
| 1526 | * If this process is bigger than the biggest one |
| 1527 | * so far, remember it. |
| 1528 | */ |
| 1529 | if (info->bigsize < size) { |
| 1530 | if (info->bigproc) |
| 1531 | PRELE(info->bigproc); |
| 1532 | PHOLD(p); |
| 1533 | info->bigproc = p; |
| 1534 | info->bigsize = size; |
| 1535 | } |
| 1536 | lwkt_yield(); |
| 1537 | return(0); |
| 1538 | } |
| 1539 | |
| 1540 | /* |
| 1541 | * This routine tries to maintain the pseudo-LRU active queue |
| 1542 | * so that during long periods of time when there is no paging, |
| 1543 | * some statistic accumulation still occurs. This code |
| 1544 | * helps the situation where paging just starts to occur. |
| 1545 | */ |
| 1546 | static void |
| 1547 | vm_pageout_page_stats(int q) |
| 1548 | { |
| 1549 | static int fullintervalcount = 0; |
| 1550 | struct vm_page marker; |
| 1551 | vm_page_t m; |
| 1552 | int pcount, tpcount; /* Number of pages to check */ |
| 1553 | int page_shortage; |
| 1554 | |
| 1555 | page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + |
| 1556 | vmstats.v_free_min) - |
| 1557 | (vmstats.v_free_count + vmstats.v_inactive_count + |
| 1558 | vmstats.v_cache_count); |
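/*
 * Worked example (illustrative, hypothetical numbers): if the three
 * targets above sum to 10000 pages while free+inactive+cache
 * currently hold only 8000 pages, page_shortage is 2000 and the
 * stats scan below is worth running; otherwise we return right away.
 */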
| 1559 | |
| 1560 | if (page_shortage <= 0) |
| 1561 | return; |
| 1562 | |
| 1563 | pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; |
| 1564 | fullintervalcount += vm_pageout_stats_interval; |
| 1565 | if (fullintervalcount < vm_pageout_full_stats_interval) { |
| 1566 | tpcount = (vm_pageout_stats_max * pcount) / |
| 1567 | vmstats.v_page_count + 1; |
| 1568 | if (pcount > tpcount) |
| 1569 | pcount = tpcount; |
| 1570 | } else { |
| 1571 | fullintervalcount = 0; |
| 1572 | } |
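/*
 * In other words, until a full-stats interval has accumulated we only
 * examine this queue's proportional share of roughly
 * vm_pageout_stats_max pages. Illustrative (hypothetical) numbers:
 * with vm_pageout_stats_max = 6000, pcount = 20000 pages on this
 * queue and v_page_count = 240000, tpcount = 6000 * 20000 / 240000 + 1
 * = 501, so only ~501 of the 20000 pages are checked this pass. Once
 * fullintervalcount reaches vm_pageout_full_stats_interval the whole
 * queue is scanned and the counter resets.
 */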
| 1573 | |
| 1574 | bzero(&marker, sizeof(marker)); |
| 1575 | marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; |
| 1576 | marker.queue = PQ_ACTIVE + q; |
| 1577 | marker.pc = q; |
| 1578 | marker.wire_count = 1; |
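/*
 * The marker is a fictitious, permanently busied page threaded onto
 * the active queue. It records our position so the queue spinlock
 * can be dropped while an individual page is processed; after each
 * page we move the marker past it and resume the scan from
 * TAILQ_NEXT(&marker, pageq).
 */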
| 1579 | |
| 1580 | vm_page_queues_spin_lock(PQ_ACTIVE + q); |
| 1581 | TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); |
| 1582 | vm_page_queues_spin_unlock(PQ_ACTIVE + q); |
| 1583 | |
| 1584 | while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && |
| 1585 | pcount-- > 0) |
| 1586 | { |
| 1587 | int actcount; |
| 1588 | |
| 1589 | vm_page_and_queue_spin_lock(m); |
| 1590 | if (m != TAILQ_NEXT(&marker, pageq)) { |
| 1591 | vm_page_and_queue_spin_unlock(m); |
| 1592 | ++pcount; |
| 1593 | continue; |
| 1594 | } |
| 1595 | KKASSERT(m->queue - m->pc == PQ_ACTIVE); |
| 1596 | TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); |
| 1597 | TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, |
| 1598 | &marker, pageq); |
| 1599 | |
| 1600 | /* |
| 1601 | * Ignore markers |
| 1602 | */ |
| 1603 | if (m->flags & PG_MARKER) { |
| 1604 | vm_page_and_queue_spin_unlock(m); |
| 1605 | continue; |
| 1606 | } |
| 1607 | |
| 1608 | /* |
| 1609 | * Ignore pages we can't busy |
| 1610 | */ |
| 1611 | if (vm_page_busy_try(m, TRUE)) { |
| 1612 | vm_page_and_queue_spin_unlock(m); |
| 1613 | continue; |
| 1614 | } |
| 1615 | vm_page_and_queue_spin_unlock(m); |
| 1616 | KKASSERT(m->queue - m->pc == PQ_ACTIVE); |
| 1617 | |
| 1618 | /* |
| 1619 | * We now have a safely busied page, the page and queue |
| 1620 | * spinlocks have been released. |
| 1621 | * |
| 1622 | * Ignore held pages |
| 1623 | */ |
| 1624 | if (m->hold_count) { |
| 1625 | vm_page_wakeup(m); |
| 1626 | continue; |
| 1627 | } |
| 1628 | |
| 1629 | /* |
| 1630 | * Calculate activity |
| 1631 | */ |
| 1632 | actcount = 0; |
| 1633 | if (m->flags & PG_REFERENCED) { |
| 1634 | vm_page_flag_clear(m, PG_REFERENCED); |
| 1635 | actcount += 1; |
| 1636 | } |
| 1637 | actcount += pmap_ts_referenced(m); |
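/*
 * pmap_ts_referenced() is expected to return the number of mappings
 * whose hardware referenced bit was found set (clearing the bits as
 * it goes), so actcount reflects recent use of the page across all
 * of its pte mappings plus the software PG_REFERENCED flag above.
 */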
| 1638 | |
| 1639 | /* |
| 1640 | * Update act_count and move page to end of queue. |
| 1641 | */ |
| 1642 | if (actcount) { |
| 1643 | m->act_count += ACT_ADVANCE + actcount; |
| 1644 | if (m->act_count > ACT_MAX) |
| 1645 | m->act_count = ACT_MAX; |
| 1646 | vm_page_and_queue_spin_lock(m); |
| 1647 | if (m->queue - m->pc == PQ_ACTIVE) { |
| 1648 | TAILQ_REMOVE( |
| 1649 | &vm_page_queues[PQ_ACTIVE + q].pl, |
| 1650 | m, pageq); |
| 1651 | TAILQ_INSERT_TAIL( |
| 1652 | &vm_page_queues[PQ_ACTIVE + q].pl, |
| 1653 | m, pageq); |
| 1654 | } |
| 1655 | vm_page_and_queue_spin_unlock(m); |
| 1656 | vm_page_wakeup(m); |
| 1657 | continue; |
| 1658 | } |
| 1659 | |
| 1660 | if (m->act_count == 0) { |
| 1661 | /* |
| 1662 | * We turn off page access so that we have
| 1663 | * more accurate RSS stats. We don't do this
| 1664 | * in the normal page deactivation when the
| 1665 | * system is under VM load, because the
| 1666 | * cost of the large number of page protect
| 1667 | * operations would be higher than the
| 1668 | * benefit of doing them.
| 1669 | * |
| 1670 | * We use the marker to save our place so |
| 1671 | * we can release the spin lock. Both (m)
| 1672 | * and (next) will be invalid. |
| 1673 | */ |
| 1674 | vm_page_protect(m, VM_PROT_NONE); |
| 1675 | vm_page_deactivate(m); |
| 1676 | } else { |
| 1677 | m->act_count -= min(m->act_count, ACT_DECLINE); |
| 1678 | vm_page_and_queue_spin_lock(m); |
| 1679 | if (m->queue - m->pc == PQ_ACTIVE) { |
| 1680 | TAILQ_REMOVE( |
| 1681 | &vm_page_queues[PQ_ACTIVE + q].pl, |
| 1682 | m, pageq); |
| 1683 | TAILQ_INSERT_TAIL( |
| 1684 | &vm_page_queues[PQ_ACTIVE + q].pl, |
| 1685 | m, pageq); |
| 1686 | } |
| 1687 | vm_page_and_queue_spin_unlock(m); |
| 1688 | } |
| 1689 | vm_page_wakeup(m); |
| 1690 | } |
| 1691 | |
| 1692 | /* |
| 1693 | * Remove our local marker |
| 1694 | */ |
| 1695 | vm_page_queues_spin_lock(PQ_ACTIVE + q); |
| 1696 | TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); |
| 1697 | vm_page_queues_spin_unlock(PQ_ACTIVE + q); |
| 1698 | } |
| 1699 | |
| 1700 | static int |
| 1701 | vm_pageout_free_page_calc(vm_size_t count) |
| 1702 | { |
| 1703 | if (count < vmstats.v_page_count) |
| 1704 | return 0; |
| 1705 | /* |
| 1706 | * free_reserved needs to include enough for the largest swap pager |
| 1707 | * structures plus enough for any pv_entry structs when paging. |
| 1708 | * |
| 1709 | * v_free_min normal allocations |
| 1710 | * v_free_reserved system allocations |
| 1711 | * v_pageout_free_min allocations by pageout daemon |
| 1712 | * v_interrupt_free_min low level allocations (e.g swap structures) |
| 1713 | */ |
| 1714 | if (vmstats.v_page_count > 1024) |
| 1715 | vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200; |
| 1716 | else |
| 1717 | vmstats.v_free_min = 64; |
| 1718 | vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7; |
| 1719 | vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0; |
| 1720 | vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7; |
| 1721 | vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7; |
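/*
 * Worked example (illustrative only): with v_page_count = 262144
 * (1GB of 4KB pages), v_free_min = 64 + (262144 - 1024) / 200 = 1369
 * pages, giving v_free_reserved = 691, v_free_severe = 684,
 * v_pageout_free_min = 349 and v_interrupt_free_min = 178 pages.
 */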
| 1722 | |
| 1723 | return 1; |
| 1724 | } |
| 1725 | |
| 1727 | /* |
| 1728 | * vm_pageout is the high level pageout daemon. |
| 1729 | * |
| 1730 | * No requirements. |
| 1731 | */ |
| 1732 | static void |
| 1733 | vm_pageout_thread(void) |
| 1734 | { |
| 1735 | int pass; |
| 1736 | int q; |
| 1737 | |
| 1738 | /* |
| 1739 | * Initialize some paging parameters. |
| 1740 | */ |
| 1741 | curthread->td_flags |= TDF_SYSTHREAD; |
| 1742 | |
| 1743 | if (vmstats.v_page_count < 2000) |
| 1744 | vm_pageout_page_count = 8; |
| 1745 | |
| 1746 | vm_pageout_free_page_calc(vmstats.v_page_count); |
| 1747 | |
| 1748 | /* |
| 1749 | * v_free_target and v_cache_min control pageout hysteresis. Note |
| 1750 | * that these are more a measure of the VM cache queue hysteresis |
| 1751 | * than of the VM free queue. Specifically, v_free_target is the
| 1752 | * high water mark (free+cache pages). |
| 1753 | * |
| 1754 | * v_free_reserved + v_cache_min (mostly means v_cache_min) is the |
| 1755 | * low water mark, while v_free_min is the stop. v_cache_min must |
| 1756 | * be big enough to handle memory needs while the pageout daemon |
| 1757 | * is signalled and run to free more pages. |
| 1758 | */ |
| 1759 | if (vmstats.v_free_count > 6144) |
| 1760 | vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved; |
| 1761 | else |
| 1762 | vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved; |
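/*
 * Continuing the illustrative 1GB example: with v_free_min = 1369 and
 * v_free_reserved = 691 the large-memory branch above yields
 * v_free_target = 4 * 1369 + 691 = 6167 pages, roughly 24MB of
 * free+cache headroom.
 */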
| 1763 | |
| 1764 | /* |
| 1765 | * NOTE: With the new buffer cache b_act_count we want the default |
| 1766 | * inactive target to be a percentage of available memory. |
| 1767 | * |
| 1768 | * The inactive target essentially determines the minimum |
| 1769 | * number of 'temporary' pages capable of caching one-time-use |
| 1770 | * files when the VM system is otherwise full of pages |
| 1771 | * belonging to multi-time-use files or active program data. |
| 1772 | * |
| 1773 | * NOTE: The inactive target is aggressively pursued only if the
| 1774 | * inactive queue becomes too small. If the inactive queue |
| 1775 | * is large enough to satisfy page movement to free+cache |
| 1776 | * then it is repopulated more slowly from the active queue. |
| 1777 | * This allows a general inactive_target default to be set. |
| 1778 | * |
| 1779 | * There is an issue here for processes which sit mostly idle |
| 1780 | * 'overnight', such as sshd, tcsh, and X. Any movement from |
| 1781 | * the active queue will eventually cause such pages to |
| 1782 | * recycle, causing a lot of paging in the morning.
| 1783 | * To reduce the incidence of this, pages cycled out of the
| 1784 | * buffer cache are moved directly to the inactive queue if |
| 1785 | * they were only used once or twice. |
| 1786 | * |
| 1787 | * The vfs.vm_cycle_point sysctl can be used to adjust this. |
| 1788 | * Increasing the value (up to 64) increases the number of |
| 1789 | * buffer recyclements which go directly to the inactive queue. |
| 1790 | */ |
| 1791 | if (vmstats.v_free_count > 2048) { |
| 1792 | vmstats.v_cache_min = vmstats.v_free_target; |
| 1793 | vmstats.v_cache_max = 2 * vmstats.v_cache_min; |
| 1794 | } else { |
| 1795 | vmstats.v_cache_min = 0; |
| 1796 | vmstats.v_cache_max = 0; |
| 1797 | } |
| 1798 | vmstats.v_inactive_target = vmstats.v_free_count / 4; |
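/*
 * Since this runs at pageout daemon startup, when nearly all pages
 * are still free, v_inactive_target works out to roughly a quarter
 * of physical memory, while v_cache_min/v_cache_max track the free
 * target computed above (and twice it, respectively) on machines
 * with more than 2048 free pages.
 */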
| 1799 | |
| 1800 | /* XXX does not really belong here */ |
| 1801 | if (vm_page_max_wired == 0) |
| 1802 | vm_page_max_wired = vmstats.v_free_count / 3; |
| 1803 | |
| 1804 | if (vm_pageout_stats_max == 0) |
| 1805 | vm_pageout_stats_max = vmstats.v_free_target; |
| 1806 | |
| 1807 | /* |
| 1808 | * Set interval in seconds for stats scan. |
| 1809 | */ |
| 1810 | if (vm_pageout_stats_interval == 0) |
| 1811 | vm_pageout_stats_interval = 5; |
| 1812 | if (vm_pageout_full_stats_interval == 0) |
| 1813 | vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; |
| 1814 | |
| 1816 | /* |
| 1817 | * Set maximum free per pass |
| 1818 | */ |
| 1819 | if (vm_pageout_stats_free_max == 0) |
| 1820 | vm_pageout_stats_free_max = 5; |
| 1821 | |
| 1822 | swap_pager_swap_init(); |
| 1823 | pass = 0; |
| 1824 | |
| 1825 | /* |
| 1826 | * The pageout daemon is never done, so loop forever. |
| 1827 | */ |
| 1828 | while (TRUE) { |
| 1829 | int error; |
| 1830 | int delta1; |
| 1831 | int delta2; |
| 1832 | int avail_shortage; |
| 1833 | int inactive_shortage; |
| 1834 | int vnodes_skipped = 0; |
| 1835 | int recycle_count = 0; |
| 1836 | int tmp; |
| 1837 | |
| 1838 | /* |
| 1839 | * Wait for an action request. If we timeout check to |
| 1840 | * see if paging is needed (in case the normal wakeup |
| 1841 | * code raced us). |
| 1842 | */ |
| 1843 | if (vm_pages_needed == 0) { |
| 1844 | error = tsleep(&vm_pages_needed, |
| 1845 | 0, "psleep", |
| 1846 | vm_pageout_stats_interval * hz); |
| 1847 | if (error && |
| 1848 | vm_paging_needed() == 0 && |
| 1849 | vm_pages_needed == 0) { |
| 1850 | for (q = 0; q < PQ_L2_SIZE; ++q) |
| 1851 | vm_pageout_page_stats(q); |
| 1852 | continue; |
| 1853 | } |
| 1854 | vm_pages_needed = 1; |
| 1855 | } |
| 1856 | |
| 1857 | mycpu->gd_cnt.v_pdwakeups++; |
| 1858 | |
| 1859 | /* |
| 1860 | * Do whatever cleanup that the pmap code can. |
| 1861 | */ |
| 1862 | pmap_collect(); |
| 1863 | |
| 1864 | /* |
| 1865 | * Scan for pageout. Try to avoid thrashing the system |
| 1866 | * with activity. |
| 1867 | * |
| 1868 | * Calculate our target for the number of free+cache pages we |
| 1869 | * want to get to. This is higher than the number that causes
| 1870 | * allocations to stall (severe) in order to provide hysteresis, |
| 1871 | * and if we don't make it all the way but get to the minimum |
| 1872 | * we're happy. Goose it a bit if there are multiple
| 1873 | * requests for memory. |
| 1874 | */ |
| 1875 | avail_shortage = vm_paging_target() + vm_pageout_deficit; |
| 1876 | vm_pageout_deficit = 0; |
| 1877 | delta1 = 0; |
| 1878 | if (avail_shortage > 0) { |
| 1879 | for (q = 0; q < PQ_L2_SIZE; ++q) { |
| 1880 | delta1 += vm_pageout_scan_inactive( |
| 1881 | pass, q, |
| 1882 | PQAVERAGE(avail_shortage), |
| 1883 | &vnodes_skipped); |
| 1884 | } |
| 1885 | avail_shortage -= delta1; |
| 1886 | } |
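/*
 * PQAVERAGE() spreads the overall shortage across the PQ_L2_SIZE
 * page queues (roughly avail_shortage / PQ_L2_SIZE per queue), so
 * each per-queue scan works toward its share of the target; delta1
 * accumulates the pages those scans actually freed or cached.
 */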
| 1887 | |
| 1888 | /* |
| 1889 | * Figure out how many active pages we must deactivate. If |
| 1890 | * we were able to reach our target with just the inactive |
| 1891 | * scan above we limit the number of active pages we |
| 1892 | * deactivate to reduce unnecessary work. |
| 1893 | */ |
| 1894 | inactive_shortage = vmstats.v_inactive_target - |
| 1895 | vmstats.v_inactive_count; |
| 1896 | |
| 1897 | /* |
| 1898 | * If we were unable to free sufficient inactive pages to |
| 1899 | * satisfy the free/cache queue requirements then simply |
| 1900 | * reaching the inactive target may not be good enough. |
| 1901 | * Try to deactivate pages in excess of the target based |
| 1902 | * on the shortfall. |
| 1903 | * |
| 1904 | * However to prevent thrashing the VM system do not |
| 1905 | * deactivate more than an additional 1/10 the inactive |
| 1906 | * target's worth of active pages. |
| 1907 | */ |
| 1908 | if (avail_shortage > 0) { |
| 1909 | tmp = avail_shortage * 2; |
| 1910 | if (tmp > vmstats.v_inactive_target / 10) |
| 1911 | tmp = vmstats.v_inactive_target / 10; |
| 1912 | inactive_shortage += tmp; |
| 1913 | } |
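/*
 * Illustrative example (hypothetical numbers): if 100 pages of
 * shortfall remain after the inactive scan and v_inactive_target is
 * 5000, tmp = 200, which is below the 500-page cap, so up to 200
 * additional active pages may be deactivated this pass.
 */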
| 1914 | |
| 1915 | if (avail_shortage > 0 || inactive_shortage > 0) { |
| 1916 | delta2 = 0; |
| 1917 | for (q = 0; q < PQ_L2_SIZE; ++q) { |
| 1918 | delta2 += vm_pageout_scan_active( |
| 1919 | pass, q, |
| 1920 | PQAVERAGE(avail_shortage), |
| 1921 | PQAVERAGE(inactive_shortage), |
| 1922 | &recycle_count); |
| 1923 | } |
| 1924 | inactive_shortage -= delta2; |
| 1925 | avail_shortage -= delta2; |
| 1926 | } |
| 1927 | |
| 1928 | /* |
| 1929 | * Finally free enough cache pages to meet our free page |
| 1930 | * requirement and take more drastic measures if we are |
| 1931 | * still in trouble. |
| 1932 | */ |
| 1933 | vm_pageout_scan_cache(avail_shortage, vnodes_skipped, |
| 1934 | recycle_count); |
| 1935 | |
| 1936 | /* |
| 1937 | * Wait for more work. |
| 1938 | */ |
| 1939 | if (avail_shortage > 0) { |
| 1940 | ++pass; |
| 1941 | if (swap_pager_full) { |
| 1942 | /* |
| 1943 | * Running out of memory, catastrophic back-off |
| 1944 | * to one-second intervals. |
| 1945 | */ |
| 1946 | tsleep(&vm_pages_needed, 0, "pdelay", hz); |
| 1947 | } else if (pass < 10 && vm_pages_needed > 1) { |
| 1948 | /* |
| 1949 | * Normal operation, additional processes |
| 1950 | * have already kicked us. Retry immediately. |
| 1951 | */ |
| 1952 | } else if (pass < 10) { |
| 1953 | /* |
| 1954 | * Normal operation, fewer processes. Delay |
| 1955 | * a bit but allow wakeups. |
| 1956 | */ |
| 1957 | vm_pages_needed = 0; |
| 1958 | tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); |
| 1959 | vm_pages_needed = 1; |
| 1960 | } else { |
| 1961 | /* |
| 1962 | * We've taken too many passes, forced delay. |
| 1963 | */ |
| 1964 | tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); |
| 1965 | } |
| 1966 | } else { |
| 1967 | /* |
| 1968 | * Interlocked wakeup of waiters (non-optional) |
| 1969 | */ |
| 1970 | pass = 0; |
| 1971 | if (vm_pages_needed && !vm_page_count_min(0)) { |
| 1972 | wakeup(&vmstats.v_free_count); |
| 1973 | vm_pages_needed = 0; |
| 1974 | } |
| 1975 | } |
| 1976 | } |
| 1977 | } |
| 1978 | |
| 1979 | static struct kproc_desc page_kp = { |
| 1980 | "pagedaemon", |
| 1981 | vm_pageout_thread, |
| 1982 | &pagethread |
| 1983 | }; |
| 1984 | SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) |
| 1985 | |
| 1986 | |
| 1987 | /* |
| 1988 | * Called after allocating a page out of the cache or free queue |
| 1989 | * to possibly wake the pagedaemon up to replenish our supply.
| 1990 | * |
| 1991 | * We try to generate some hysteresis by waking the pagedaemon up |
| 1992 | * when our free+cache pages go below the free_min+cache_min level. |
| 1993 | * The pagedaemon tries to get the count back up to at least the |
| 1994 | * minimum, and through to the target level if possible. |
| 1995 | * |
| 1996 | * If the pagedaemon is already active bump vm_pages_needed as a hint |
| 1997 | * that there are even more requests pending. |
| 1998 | * |
| 1999 | * SMP races ok? |
| 2000 | * No requirements. |
| 2001 | */ |
| 2002 | void |
| 2003 | pagedaemon_wakeup(void) |
| 2004 | { |
| 2005 | if (vm_paging_needed() && curthread != pagethread) { |
| 2006 | if (vm_pages_needed == 0) { |
| 2007 | vm_pages_needed = 1; /* SMP race ok */ |
| 2008 | wakeup(&vm_pages_needed); |
| 2009 | } else if (vm_page_count_min(0)) { |
| 2010 | ++vm_pages_needed; /* SMP race ok */ |
| 2011 | } |
| 2012 | } |
| 2013 | } |
| 2014 | |
| 2015 | #if !defined(NO_SWAPPING) |
| 2016 | |
| 2017 | /* |
| 2018 | * SMP races ok? |
| 2019 | * No requirements. |
| 2020 | */ |
| 2021 | static void |
| 2022 | vm_req_vmdaemon(void) |
| 2023 | { |
| 2024 | static int lastrun = 0; |
| 2025 | |
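/*
 * Rate-limit wakeups of vm_daemon to roughly once per second:
 * lastrun remembers the tick count of the previous wakeup and the
 * (ticks < lastrun) test handles wrap of the tick counter.
 */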
| 2026 | if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { |
| 2027 | wakeup(&vm_daemon_needed); |
| 2028 | lastrun = ticks; |
| 2029 | } |
| 2030 | } |
| 2031 | |
| 2032 | static int vm_daemon_callback(struct proc *p, void *data __unused); |
| 2033 | |
| 2034 | /* |
| 2035 | * No requirements. |
| 2036 | */ |
| 2037 | static void |
| 2038 | vm_daemon(void) |
| 2039 | { |
| 2040 | /* |
| 2041 | * XXX vm_daemon_needed specific token? |
| 2042 | */ |
| 2043 | while (TRUE) { |
| 2044 | tsleep(&vm_daemon_needed, 0, "psleep", 0); |
| 2045 | if (vm_pageout_req_swapout) { |
| 2046 | swapout_procs(vm_pageout_req_swapout); |
| 2047 | vm_pageout_req_swapout = 0; |
| 2048 | } |
| 2049 | /* |
| 2050 | * scan the processes for exceeding their rlimits, or for
| 2051 | * being swapped out -- deactivate pages
| 2052 | */ |
| 2053 | allproc_scan(vm_daemon_callback, NULL); |
| 2054 | } |
| 2055 | } |
| 2056 | |
| 2057 | /* |
| 2058 | * Caller must hold proc_token. |
| 2059 | */ |
| 2060 | static int |
| 2061 | vm_daemon_callback(struct proc *p, void *data __unused) |
| 2062 | { |
| 2063 | vm_pindex_t limit, size; |
| 2064 | |
| 2065 | /* |
| 2066 | * if this is a system process or the process is already
| 2067 | * exiting, skip it.
| 2068 | */ |
| 2069 | if (p->p_flags & (P_SYSTEM | P_WEXIT)) |
| 2070 | return (0); |
| 2071 | |
| 2072 | /* |
| 2073 | * if the process is not in a normally runnable or
| 2074 | * stopped state, don't touch it.
| 2075 | */ |
| 2076 | if (p->p_stat != SACTIVE && p->p_stat != SSTOP) |
| 2077 | return (0); |
| 2078 | |
| 2079 | /* |
| 2080 | * get a limit |
| 2081 | */ |
| 2082 | limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, |
| 2083 | p->p_rlimit[RLIMIT_RSS].rlim_max)); |
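/*
 * OFF_TO_IDX() converts the byte-denominated RSS limit into a page
 * count and qmin() selects the smaller of the soft and hard limits,
 * so 'limit' is the largest resident page count we will tolerate
 * before deactivating the excess below.
 */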
| 2084 | |
| 2085 | /* |
| 2086 | * let processes that are swapped out really be |
| 2087 | * swapped out. Set the limit to nothing to get as |
| 2088 | * many pages out to swap as possible. |
| 2089 | */ |
| 2090 | if (p->p_flags & P_SWAPPEDOUT) |
| 2091 | limit = 0; |
| 2092 | |
| 2093 | lwkt_gettoken(&p->p_vmspace->vm_map.token); |
| 2094 | size = vmspace_resident_count(p->p_vmspace); |
| 2095 | if (limit >= 0 && size >= limit) { |
| 2096 | vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, limit); |
| 2097 | } |
| 2098 | lwkt_reltoken(&p->p_vmspace->vm_map.token); |
| 2099 | return (0); |
| 2100 | } |
| 2101 | |
| 2102 | #endif |