gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* (MPSAFE)
	3	*
	4	* Copyright (c) 1991 Regents of the University of California.
	5	* All rights reserved.
	6	*
	7	* This code is derived from software contributed to Berkeley by
	8	* The Mach Operating System project at Carnegie-Mellon University.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 4. Neither the name of the University nor the names of its contributors
	19	* may be used to endorse or promote products derived from this software
	20	* without specific prior written permission.
	21	*
	22	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	23	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	24	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	25	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	26	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	27	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	28	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	29	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	30	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	31	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
	35	* $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
	36	*/
	37
	38	/*
	39	* Copyright (c) 1987, 1990 Carnegie-Mellon University.
	40	* All rights reserved.
	41	*
	42	* Authors: Avadis Tevanian, Jr., Michael Wayne Young
	43	*
	44	* Permission to use, copy, modify and distribute this software and
	45	* its documentation is hereby granted, provided that both the copyright
	46	* notice and this permission notice appear in all copies of the
	47	* software, derivative works or modified versions, and any portions
	48	* thereof, and that both notices appear in supporting documentation.
	49	*
	50	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	51	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
	52	* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	53	*
	54	* Carnegie Mellon requests users of this software to return to
	55	*
	56	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	57	* School of Computer Science
	58	* Carnegie Mellon University
	59	* Pittsburgh PA 15213-3890
	60	*
	61	* any improvements or extensions that they make and grant Carnegie the
	62	* rights to redistribute these changes.
	63	*/
	64	/*
	65	* Resident memory management module. The module manipulates 'VM pages'.
	66	* A VM page is the core building block for memory management.
	67	*/
	68
	69	#include <sys/param.h>
	70	#include <sys/systm.h>
	71	#include <sys/malloc.h>
	72	#include <sys/proc.h>
	73	#include <sys/vmmeter.h>
	74	#include <sys/vnode.h>
	75	#include <sys/kernel.h>
	76	#include <sys/alist.h>
	77	#include <sys/sysctl.h>
	78
	79	#include <vm/vm.h>
	80	#include <vm/vm_param.h>
	81	#include <sys/lock.h>
	82	#include <vm/vm_kern.h>
	83	#include <vm/pmap.h>
	84	#include <vm/vm_map.h>
	85	#include <vm/vm_object.h>
	86	#include <vm/vm_page.h>
	87	#include <vm/vm_pageout.h>
	88	#include <vm/vm_pager.h>
	89	#include <vm/vm_extern.h>
	90	#include <vm/swap_pager.h>
	91
	92	#include <machine/inttypes.h>
	93	#include <machine/md_var.h>
	94
	95	#include <vm/vm_page2.h>
	96	#include <sys/spinlock2.h>
	97
	98	#define VMACTION_HSIZE 256
	99	#define VMACTION_HMASK (VMACTION_HSIZE - 1)
	100
	101	static void vm_page_queue_init(void);
	102	static void vm_page_free_wakeup(void);
	103	static vm_page_t vm_page_select_cache(u_short pg_color);
	104	static vm_page_t _vm_page_list_find2(int basequeue, int index);
	105	static void _vm_page_deactivate_locked(vm_page_t m, int athead);
	106
	107	/*
	108	* Array of tailq lists
	109	*/
	110	__cachealign struct vpgqueues vm_page_queues[PQ_COUNT];
	111
	112	LIST_HEAD(vm_page_action_list, vm_page_action);
	113	struct vm_page_action_list action_list[VMACTION_HSIZE];
	114	static volatile int vm_pages_waiting;
	115
	116	static struct alist vm_contig_alist;
	117	static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
	118	static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin);
	119
	120	static u_long vm_dma_reserved = 0;
	121	TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
	122	SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
	123	"Memory reserved for DMA");
	124	SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
	125	&vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
	126
	127	static int vm_contig_verbose = 0;
	128	TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
	129
	130	RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
	131	vm_pindex_t, pindex);
	132
	133	static void
	134	vm_page_queue_init(void)
	135	{
	136	int i;
	137
	138	for (i = 0; i < PQ_L2_SIZE; i++)
	139	vm_page_queues[PQ_FREE+i].cnt = &vmstats.v_free_count;
	140	for (i = 0; i < PQ_L2_SIZE; i++)
	141	vm_page_queues[PQ_CACHE+i].cnt = &vmstats.v_cache_count;
	142	for (i = 0; i < PQ_L2_SIZE; i++)
	143	vm_page_queues[PQ_INACTIVE+i].cnt = &vmstats.v_inactive_count;
	144	for (i = 0; i < PQ_L2_SIZE; i++)
	145	vm_page_queues[PQ_ACTIVE+i].cnt = &vmstats.v_active_count;
	146	for (i = 0; i < PQ_L2_SIZE; i++)
	147	vm_page_queues[PQ_HOLD+i].cnt = &vmstats.v_active_count;
	148	/* PQ_NONE has no queue */
	149
	150	for (i = 0; i < PQ_COUNT; i++) {
	151	TAILQ_INIT(&vm_page_queues[i].pl);
	152	spin_init(&vm_page_queues[i].spin);
	153	}
	154
	155	for (i = 0; i < VMACTION_HSIZE; i++)
	156	LIST_INIT(&action_list[i]);
	157	}
	158
	159	/*
	160	* note: place in initialized data section? Is this necessary?
	161	*/
	162	long first_page = 0;
	163	int vm_page_array_size = 0;
	164	int vm_page_zero_count = 0;
	165	vm_page_t vm_page_array = NULL;
	166	vm_paddr_t vm_low_phys_reserved;
	167
	168	/*
	169	* (low level boot)
	170	*
	171	* Sets the page size, perhaps based upon the memory size.
	172	* Must be called before any use of page-size dependent functions.
	173	*/
	174	void
	175	vm_set_page_size(void)
	176	{
	177	if (vmstats.v_page_size == 0)
	178	vmstats.v_page_size = PAGE_SIZE;
	179	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
	180	panic("vm_set_page_size: page size not a power of two");
	181	}
	182
	183	/*
	184	* (low level boot)
	185	*
	186	* Add a new page to the freelist for use by the system. New pages
	187	* are added to both the head and tail of the associated free page
	188	* queue in a bottom-up fashion, so both zero'd and non-zero'd page
	189	* requests pull 'recent' adds (higher physical addresses) first.
	190	*
	191	* Beware that the page zeroing daemon will also be running soon after
	192	* boot, moving pages from the head to the tail of the PQ_FREE queues.
	193	*
	194	* Must be called in a critical section.
	195	*/
	196	static void
	197	vm_add_new_page(vm_paddr_t pa)
	198	{
	199	struct vpgqueues *vpq;
	200	vm_page_t m;
	201
	202	m = PHYS_TO_VM_PAGE(pa);
	203	m->phys_addr = pa;
	204	m->flags = 0;
	205	m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
	206	/*
	207	* Twist for cpu localization in addition to page coloring, so
	208	* different cpus selecting by m->queue get different page colors.
	209	*/
	210	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE) & PQ_L2_MASK;
	211	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE)) & PQ_L2_MASK;
	212	/*
	213	* Reserve a certain number of contiguous low memory pages for
	214	* contigmalloc() to use.
	215	*/
	216	if (pa < vm_low_phys_reserved) {
	217	atomic_add_int(&vmstats.v_page_count, 1);
	218	atomic_add_int(&vmstats.v_dma_pages, 1);
	219	m->queue = PQ_NONE;
	220	m->wire_count = 1;
	221	atomic_add_int(&vmstats.v_wire_count, 1);
	222	alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
	223	return;
	224	}
	225
	226	/*
	227	* General page
	228	*/
	229	m->queue = m->pc + PQ_FREE;
	230	KKASSERT(m->dirty == 0);
	231
	232	atomic_add_int(&vmstats.v_page_count, 1);
	233	atomic_add_int(&vmstats.v_free_count, 1);
	234	vpq = &vm_page_queues[m->queue];
	235	if ((vpq->flipflop & 15) == 0) {
	236	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	237	m->flags \|= PG_ZERO;
	238	TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
	239	atomic_add_int(&vm_page_zero_count, 1);
	240	} else {
	241	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
	242	}
	243	++vpq->flipflop;
	244	++vpq->lcnt;
	245	}
	246
	247	/*
	248	* (low level boot)
	249	*
	250	* Initializes the resident memory module.
	251	*
	252	* Preallocates memory for critical VM structures and arrays prior to
	253	* kernel_map becoming available.
	254	*
	255	* Memory is allocated from (virtual2_start, virtual2_end) if available,
	256	* otherwise memory is allocated from (virtual_start, virtual_end).
	257	*
	258	* On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
	259	* large enough to hold vm_page_array & other structures for machines with
	260	* large amounts of ram, so we want to use virtual2* when available.
	261	*/
	262	void
	263	vm_page_startup(void)
	264	{
	265	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
	266	vm_offset_t mapped;
	267	vm_size_t npages;
	268	vm_paddr_t page_range;
	269	vm_paddr_t new_end;
	270	int i;
	271	vm_paddr_t pa;
	272	int nblocks;
	273	vm_paddr_t last_pa;
	274	vm_paddr_t end;
	275	vm_paddr_t biggestone, biggestsize;
	276	vm_paddr_t total;
	277
	278	total = 0;
	279	biggestsize = 0;
	280	biggestone = 0;
	281	nblocks = 0;
	282	vaddr = round_page(vaddr);
	283
	284	for (i = 0; phys_avail[i + 1]; i += 2) {
	285	phys_avail[i] = round_page64(phys_avail[i]);
	286	phys_avail[i + 1] = trunc_page64(phys_avail[i + 1]);
	287	}
	288
	289	for (i = 0; phys_avail[i + 1]; i += 2) {
	290	vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
	291
	292	if (size > biggestsize) {
	293	biggestone = i;
	294	biggestsize = size;
	295	}
	296	++nblocks;
	297	total += size;
	298	}
	299
	300	end = phys_avail[biggestone+1];
	301	end = trunc_page(end);
	302
	303	/*
	304	* Initialize the queue headers for the free queue, the active queue
	305	* and the inactive queue.
	306	*/
	307	vm_page_queue_init();
	308
	309	#if !defined(_KERNEL_VIRTUAL)
	310	/*
	311	* VKERNELs don't support minidumps and as such don't need
	312	* vm_page_dump
	313	*
	314	* Allocate a bitmap to indicate that a random physical page
	315	* needs to be included in a minidump.
	316	*
	317	* The amd64 port needs this to indicate which direct map pages
	318	* need to be dumped, via calls to dump_add_page()/dump_drop_page().
	319	*
	320	* However, i386 still needs this workspace internally within the
	321	* minidump code. In theory, they are not needed on i386, but are
	322	* included should the sf_buf code decide to use them.
	323	*/
	324	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE;
	325	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
	326	end -= vm_page_dump_size;
	327	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
	328	VM_PROT_READ \| VM_PROT_WRITE);
	329	bzero((void *)vm_page_dump, vm_page_dump_size);
	330	#endif
	331	/*
	332	* Compute the number of pages of memory that will be available for
	333	* use (taking into account the overhead of a page structure per
	334	* page).
	335	*/
	336	first_page = phys_avail[0] / PAGE_SIZE;
	337	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
	338	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
	339
	340	#ifndef _KERNEL_VIRTUAL
	341	/*
	342	* (only applies to real kernels)
	343	*
	344	* Initialize the contiguous reserve map. We initially reserve up
	345	* to 1/4 available physical memory or 65536 pages (~256MB), whichever
	346	* is lower.
	347	*
	348	* Once device initialization is complete we return most of the
	349	* reserved memory back to the normal page queues but leave some
	350	* in reserve for things like usb attachments.
	351	*/
	352	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
	353	if (vm_low_phys_reserved > total / 4)
	354	vm_low_phys_reserved = total / 4;
	355	if (vm_dma_reserved == 0) {
	356	vm_dma_reserved = 16 * 1024 * 1024; /* 16MB */
	357	if (vm_dma_reserved > total / 16)
	358	vm_dma_reserved = total / 16;
	359	}
	360	#endif
	361	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
	362	ALIST_RECORDS_65536);
	363
	364	/*
	365	* Initialize the mem entry structures now, and put them in the free
	366	* queue.
	367	*/
	368	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
	369	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ \| VM_PROT_WRITE);
	370	vm_page_array = (vm_page_t)mapped;
	371
	372	#if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
	373	/*
	374	* since pmap_map on amd64 returns stuff out of a direct-map region,
	375	* we have to manually add these pages to the minidump tracking so
	376	* that they can be dumped, including the vm_page_array.
	377	*/
	378	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
	379	dump_add_page(pa);
	380	#endif
	381
	382	/*
	383	* Clear all of the page structures
	384	*/
	385	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
	386	vm_page_array_size = page_range;
	387
	388	/*
	389	* Construct the free queue(s) in ascending order (by physical
	390	* address) so that the first 16MB of physical memory is allocated
	391	* last rather than first. On large-memory machines, this avoids
	392	* the exhaustion of low physical memory before isa_dmainit has run.
	393	*/
	394	vmstats.v_page_count = 0;
	395	vmstats.v_free_count = 0;
	396	for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
	397	pa = phys_avail[i];
	398	if (i == biggestone)
	399	last_pa = new_end;
	400	else
	401	last_pa = phys_avail[i + 1];
	402	while (pa < last_pa && npages-- > 0) {
	403	vm_add_new_page(pa);
	404	pa += PAGE_SIZE;
	405	}
	406	}
	407	if (virtual2_start)
	408	virtual2_start = vaddr;
	409	else
	410	virtual_start = vaddr;
	411	}
	412
	413	/*
	414	* We tended to reserve a ton of memory for contigmalloc(). Now that most
	415	* drivers have initialized we want to return most the remaining free
	416	* reserve back to the VM page queues so they can be used for normal
	417	* allocations.
	418	*
	419	* We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
	420	*/
	421	static void
	422	vm_page_startup_finish(void *dummy __unused)
	423	{
	424	alist_blk_t blk;
	425	alist_blk_t rblk;
	426	alist_blk_t count;
	427	alist_blk_t xcount;
	428	alist_blk_t bfree;
	429	vm_page_t m;
	430
	431	spin_lock(&vm_contig_spin);
	432	for (;;) {
	433	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
	434	if (bfree <= vm_dma_reserved / PAGE_SIZE)
	435	break;
	436	if (count == 0)
	437	break;
	438
	439	/*
	440	* Figure out how much of the initial reserve we have to
	441	* free in order to reach our target.
	442	*/
	443	bfree -= vm_dma_reserved / PAGE_SIZE;
	444	if (count > bfree) {
	445	blk += count - bfree;
	446	count = bfree;
	447	}
	448
	449	/*
	450	* Calculate the nearest power of 2 <= count.
	451	*/
	452	for (xcount = 1; xcount <= count; xcount <<= 1)
	453	;
	454	xcount >>= 1;
	455	blk += count - xcount;
	456	count = xcount;
	457
	458	/*
	459	* Allocate the pages from the alist, then free them to
	460	* the normal VM page queues.
	461	*
	462	* Pages allocated from the alist are wired. We have to
	463	* busy, unwire, and free them. We must also adjust
	464	* vm_low_phys_reserved before freeing any pages to prevent
	465	* confusion.
	466	*/
	467	rblk = alist_alloc(&vm_contig_alist, blk, count);
	468	if (rblk != blk) {
	469	kprintf("vm_page_startup_finish: Unable to return "
	470	"dma space @0x%08x/%d -> 0x%08x\n",
	471	blk, count, rblk);
	472	break;
	473	}
	474	atomic_add_int(&vmstats.v_dma_pages, -count);
	475	spin_unlock(&vm_contig_spin);
	476
	477	m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
	478	vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
	479	while (count) {
	480	vm_page_busy_wait(m, FALSE, "cpgfr");
	481	vm_page_unwire(m, 0);
	482	vm_page_free(m);
	483	--count;
	484	++m;
	485	}
	486	spin_lock(&vm_contig_spin);
	487	}
	488	spin_unlock(&vm_contig_spin);
	489
	490	/*
	491	* Print out how much DMA space drivers have already allocated and
	492	* how much is left over.
	493	*/
	494	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
	495	(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
	496	(PAGE_SIZE / 1024),
	497	(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
	498	}
	499	SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
	500	vm_page_startup_finish, NULL)
	501
	502
	503	/*
	504	* Scan comparison function for Red-Black tree scans. An inclusive
	505	* (start,end) is expected. Other fields are not used.
	506	*/
	507	int
	508	rb_vm_page_scancmp(struct vm_page p, void data)
	509	{
	510	struct rb_vm_page_scan_info *info = data;
	511
	512	if (p->pindex < info->start_pindex)
	513	return(-1);
	514	if (p->pindex > info->end_pindex)
	515	return(1);
	516	return(0);
	517	}
	518
	519	int
	520	rb_vm_page_compare(struct vm_page p1, struct vm_page p2)
	521	{
	522	if (p1->pindex < p2->pindex)
	523	return(-1);
	524	if (p1->pindex > p2->pindex)
	525	return(1);
	526	return(0);
	527	}
	528
	529	/*
	530	* Each page queue has its own spin lock, which is fairly optimal for
	531	* allocating and freeing pages at least.
	532	*
	533	* The caller must hold the vm_page_spin_lock() before locking a vm_page's
	534	* queue spinlock via this function. Also note that m->queue cannot change
	535	* unless both the page and queue are locked.
	536	*/
	537	static __inline
	538	void
	539	_vm_page_queue_spin_lock(vm_page_t m)
	540	{
	541	u_short queue;
	542
	543	queue = m->queue;
	544	if (queue != PQ_NONE) {
	545	spin_lock(&vm_page_queues[queue].spin);
	546	KKASSERT(queue == m->queue);
	547	}
	548	}
	549
	550	static __inline
	551	void
	552	_vm_page_queue_spin_unlock(vm_page_t m)
	553	{
	554	u_short queue;
	555
	556	queue = m->queue;
	557	cpu_ccfence();
	558	if (queue != PQ_NONE)
	559	spin_unlock(&vm_page_queues[queue].spin);
	560	}
	561
	562	static __inline
	563	void
	564	_vm_page_queues_spin_lock(u_short queue)
	565	{
	566	cpu_ccfence();
	567	if (queue != PQ_NONE)
	568	spin_lock(&vm_page_queues[queue].spin);
	569	}
	570
	571
	572	static __inline
	573	void
	574	_vm_page_queues_spin_unlock(u_short queue)
	575	{
	576	cpu_ccfence();
	577	if (queue != PQ_NONE)
	578	spin_unlock(&vm_page_queues[queue].spin);
	579	}
	580
	581	void
	582	vm_page_queue_spin_lock(vm_page_t m)
	583	{
	584	_vm_page_queue_spin_lock(m);
	585	}
	586
	587	void
	588	vm_page_queues_spin_lock(u_short queue)
	589	{
	590	_vm_page_queues_spin_lock(queue);
	591	}
	592
	593	void
	594	vm_page_queue_spin_unlock(vm_page_t m)
	595	{
	596	_vm_page_queue_spin_unlock(m);
	597	}
	598
	599	void
	600	vm_page_queues_spin_unlock(u_short queue)
	601	{
	602	_vm_page_queues_spin_unlock(queue);
	603	}
	604
	605	/*
	606	* This locks the specified vm_page and its queue in the proper order
	607	* (page first, then queue). The queue may change so the caller must
	608	* recheck on return.
	609	*/
	610	static __inline
	611	void
	612	_vm_page_and_queue_spin_lock(vm_page_t m)
	613	{
	614	vm_page_spin_lock(m);
	615	_vm_page_queue_spin_lock(m);
	616	}
	617
	618	static __inline
	619	void
	620	_vm_page_and_queue_spin_unlock(vm_page_t m)
	621	{
	622	_vm_page_queues_spin_unlock(m->queue);
	623	vm_page_spin_unlock(m);
	624	}
	625
	626	void
	627	vm_page_and_queue_spin_unlock(vm_page_t m)
	628	{
	629	_vm_page_and_queue_spin_unlock(m);
	630	}
	631
	632	void
	633	vm_page_and_queue_spin_lock(vm_page_t m)
	634	{
	635	_vm_page_and_queue_spin_lock(m);
	636	}
	637
	638	/*
	639	* Helper function removes vm_page from its current queue.
	640	* Returns the base queue the page used to be on.
	641	*
	642	* The vm_page and the queue must be spinlocked.
	643	* This function will unlock the queue but leave the page spinlocked.
	644	*/
	645	static __inline u_short
	646	_vm_page_rem_queue_spinlocked(vm_page_t m)
	647	{
	648	struct vpgqueues *pq;
	649	u_short queue;
	650
	651	queue = m->queue;
	652	if (queue != PQ_NONE) {
	653	pq = &vm_page_queues[queue];
	654	TAILQ_REMOVE(&pq->pl, m, pageq);
	655	atomic_add_int(pq->cnt, -1);
	656	pq->lcnt--;
	657	m->queue = PQ_NONE;
	658	vm_page_queues_spin_unlock(queue);
	659	if ((queue - m->pc) == PQ_FREE && (m->flags & PG_ZERO))
	660	atomic_subtract_int(&vm_page_zero_count, 1);
	661	if ((queue - m->pc) == PQ_CACHE \|\| (queue - m->pc) == PQ_FREE)
	662	return (queue - m->pc);
	663	}
	664	return queue;
	665	}
	666
	667	/*
	668	* Helper function places the vm_page on the specified queue.
	669	*
	670	* The vm_page must be spinlocked.
	671	* This function will return with both the page and the queue locked.
	672	*/
	673	static __inline void
	674	_vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
	675	{
	676	struct vpgqueues *pq;
	677
	678	KKASSERT(m->queue == PQ_NONE);
	679
	680	if (queue != PQ_NONE) {
	681	vm_page_queues_spin_lock(queue);
	682	pq = &vm_page_queues[queue];
	683	++pq->lcnt;
	684	atomic_add_int(pq->cnt, 1);
	685	m->queue = queue;
	686
	687	/*
	688	* Put zero'd pages on the end ( where we look for zero'd pages
	689	* first ) and non-zerod pages at the head.
	690	*/
	691	if (queue - m->pc == PQ_FREE) {
	692	if (m->flags & PG_ZERO) {
	693	TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
	694	atomic_add_int(&vm_page_zero_count, 1);
	695	} else {
	696	TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
	697	}
	698	} else if (athead) {
	699	TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
	700	} else {
	701	TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
	702	}
	703	/* leave the queue spinlocked */
	704	}
	705	}
	706
	707	/*
	708	* Wait until page is no longer PG_BUSY or (if also_m_busy is TRUE)
	709	* m->busy is zero. Returns TRUE if it had to sleep, FALSE if we
	710	* did not. Only one sleep call will be made before returning.
	711	*
	712	* This function does NOT busy the page and on return the page is not
	713	* guaranteed to be available.
	714	*/
	715	void
	716	vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
	717	{
	718	u_int32_t flags;
	719
	720	for (;;) {
	721	flags = m->flags;
	722	cpu_ccfence();
	723
	724	if ((flags & PG_BUSY) == 0 &&
	725	(also_m_busy == 0 \|\| (flags & PG_SBUSY) == 0)) {
	726	break;
	727	}
	728	tsleep_interlock(m, 0);
	729	if (atomic_cmpset_int(&m->flags, flags,
	730	flags \| PG_WANTED \| PG_REFERENCED)) {
	731	tsleep(m, PINTERLOCKED, msg, 0);
	732	break;
	733	}
	734	}
	735	}
	736
	737	/*
	738	* Wait until PG_BUSY can be set, then set it. If also_m_busy is TRUE we
	739	* also wait for m->busy to become 0 before setting PG_BUSY.
	740	*/
	741	void
	742	VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
	743	int also_m_busy, const char *msg
	744	VM_PAGE_DEBUG_ARGS)
	745	{
	746	u_int32_t flags;
	747
	748	for (;;) {
	749	flags = m->flags;
	750	cpu_ccfence();
	751	if (flags & PG_BUSY) {
	752	tsleep_interlock(m, 0);
	753	if (atomic_cmpset_int(&m->flags, flags,
	754	flags \| PG_WANTED \| PG_REFERENCED)) {
	755	tsleep(m, PINTERLOCKED, msg, 0);
	756	}
	757	} else if (also_m_busy && (flags & PG_SBUSY)) {
	758	tsleep_interlock(m, 0);
	759	if (atomic_cmpset_int(&m->flags, flags,
	760	flags \| PG_WANTED \| PG_REFERENCED)) {
	761	tsleep(m, PINTERLOCKED, msg, 0);
	762	}
	763	} else {
	764	if (atomic_cmpset_int(&m->flags, flags,
	765	flags \| PG_BUSY)) {
	766	#ifdef VM_PAGE_DEBUG
	767	m->busy_func = func;
	768	m->busy_line = lineno;
	769	#endif
	770	break;
	771	}
	772	}
	773	}
	774	}
	775
	776	/*
	777	* Attempt to set PG_BUSY. If also_m_busy is TRUE we only succeed if m->busy
	778	* is also 0.
	779	*
	780	* Returns non-zero on failure.
	781	*/
	782	int
	783	VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
	784	VM_PAGE_DEBUG_ARGS)
	785	{
	786	u_int32_t flags;
	787
	788	for (;;) {
	789	flags = m->flags;
	790	cpu_ccfence();
	791	if (flags & PG_BUSY)
	792	return TRUE;
	793	if (also_m_busy && (flags & PG_SBUSY))
	794	return TRUE;
	795	if (atomic_cmpset_int(&m->flags, flags, flags \| PG_BUSY)) {
	796	#ifdef VM_PAGE_DEBUG
	797	m->busy_func = func;
	798	m->busy_line = lineno;
	799	#endif
	800	return FALSE;
	801	}
	802	}
	803	}
	804
	805	/*
	806	* Clear the PG_BUSY flag and return non-zero to indicate to the caller
	807	* that a wakeup() should be performed.
	808	*
	809	* The vm_page must be spinlocked and will remain spinlocked on return.
	810	* The related queue must NOT be spinlocked (which could deadlock us).
	811	*
	812	* (inline version)
	813	*/
	814	static __inline
	815	int
	816	_vm_page_wakeup(vm_page_t m)
	817	{
	818	u_int32_t flags;
	819
	820	for (;;) {
	821	flags = m->flags;
	822	cpu_ccfence();
	823	if (atomic_cmpset_int(&m->flags, flags,
	824	flags & ~(PG_BUSY \| PG_WANTED))) {
	825	break;
	826	}
	827	}
	828	return(flags & PG_WANTED);
	829	}
	830
	831	/*
	832	* Clear the PG_BUSY flag and wakeup anyone waiting for the page. This
	833	* is typically the last call you make on a page before moving onto
	834	* other things.
	835	*/
	836	void
	837	vm_page_wakeup(vm_page_t m)
	838	{
	839	KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!"));
	840	vm_page_spin_lock(m);
	841	if (_vm_page_wakeup(m)) {
	842	vm_page_spin_unlock(m);
	843	wakeup(m);
	844	} else {
	845	vm_page_spin_unlock(m);
	846	}
	847	}
	848
	849	/*
	850	* Holding a page keeps it from being reused. Other parts of the system
	851	* can still disassociate the page from its current object and free it, or
	852	* perform read or write I/O on it and/or otherwise manipulate the page,
	853	* but if the page is held the VM system will leave the page and its data
	854	* intact and not reuse the page for other purposes until the last hold
	855	* reference is released. (see vm_page_wire() if you want to prevent the
	856	* page from being disassociated from its object too).
	857	*
	858	* The caller must still validate the contents of the page and, if necessary,
	859	* wait for any pending I/O (e.g. vm_page_sleep_busy() loop) to complete
	860	* before manipulating the page.
	861	*
	862	* XXX get vm_page_spin_lock() here and move FREE->HOLD if necessary
	863	*/
	864	void
	865	vm_page_hold(vm_page_t m)
	866	{
	867	vm_page_spin_lock(m);
	868	atomic_add_int(&m->hold_count, 1);
	869	if (m->queue - m->pc == PQ_FREE) {
	870	_vm_page_queue_spin_lock(m);
	871	_vm_page_rem_queue_spinlocked(m);
	872	_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
	873	_vm_page_queue_spin_unlock(m);
	874	}
	875	vm_page_spin_unlock(m);
	876	}
	877
	878	/*
	879	* The opposite of vm_page_hold(). A page can be freed while being held,
	880	* which places it on the PQ_HOLD queue. If we are able to busy the page
	881	* after the hold count drops to zero we will move the page to the
	882	* appropriate PQ_FREE queue by calling vm_page_free_toq().
	883	*/
	884	void
	885	vm_page_unhold(vm_page_t m)
	886	{
	887	vm_page_spin_lock(m);
	888	atomic_add_int(&m->hold_count, -1);
	889	if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
	890	_vm_page_queue_spin_lock(m);
	891	_vm_page_rem_queue_spinlocked(m);
	892	_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 0);
	893	_vm_page_queue_spin_unlock(m);
	894	}
	895	vm_page_spin_unlock(m);
	896	}
	897
	898	/*
	899	* Inserts the given vm_page into the object and object list.
	900	*
	901	* The pagetables are not updated but will presumably fault the page
	902	* in if necessary, or if a kernel page the caller will at some point
	903	* enter the page into the kernel's pmap. We are not allowed to block
	904	* here so we can't do this anyway.
	905	*
	906	* This routine may not block.
	907	* This routine must be called with the vm_object held.
	908	* This routine must be called with a critical section held.
	909	*
	910	* This routine returns TRUE if the page was inserted into the object
	911	* successfully, and FALSE if the page already exists in the object.
	912	*/
	913	int
	914	vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
	915	{
	916	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	917	if (m->object != NULL)
	918	panic("vm_page_insert: already inserted");
	919
	920	object->generation++;
	921
	922	/*
	923	* Record the object/offset pair in this page and add the
	924	* pv_list_count of the page to the object.
	925	*
	926	* The vm_page spin lock is required for interactions with the pmap.
	927	*/
	928	vm_page_spin_lock(m);
	929	m->object = object;
	930	m->pindex = pindex;
	931	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
	932	m->object = NULL;
	933	m->pindex = 0;
	934	vm_page_spin_unlock(m);
	935	return FALSE;
	936	}
	937	object->resident_page_count++;
	938	/* atomic_add_int(&object->agg_pv_list_count, m->md.pv_list_count); */
	939	vm_page_spin_unlock(m);
	940
	941	/*
	942	* Since we are inserting a new and possibly dirty page,
	943	* update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
	944	*/
	945	if ((m->valid & m->dirty) \|\| (m->flags & PG_WRITEABLE))
	946	vm_object_set_writeable_dirty(object);
	947
	948	/*
	949	* Checks for a swap assignment and sets PG_SWAPPED if appropriate.
	950	*/
	951	swap_pager_page_inserted(m);
	952	return TRUE;
	953	}
	954
	955	/*
	956	* Removes the given vm_page_t from the (object,index) table
	957	*
	958	* The underlying pmap entry (if any) is NOT removed here.
	959	* This routine may not block.
	960	*
	961	* The page must be BUSY and will remain BUSY on return.
	962	* No other requirements.
	963	*
	964	* NOTE: FreeBSD side effect was to unbusy the page on return. We leave
	965	* it busy.
	966	*/
	967	void
	968	vm_page_remove(vm_page_t m)
	969	{
	970	vm_object_t object;
	971
	972	if (m->object == NULL) {
	973	return;
	974	}
	975
	976	if ((m->flags & PG_BUSY) == 0)
	977	panic("vm_page_remove: page not busy");
	978
	979	object = m->object;
	980
	981	vm_object_hold(object);
	982
	983	/*
	984	* Remove the page from the object and update the object.
	985	*
	986	* The vm_page spin lock is required for interactions with the pmap.
	987	*/
	988	vm_page_spin_lock(m);
	989	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
	990	object->resident_page_count--;
	991	/* atomic_add_int(&object->agg_pv_list_count, -m->md.pv_list_count); */
	992	m->object = NULL;
	993	vm_page_spin_unlock(m);
	994
	995	object->generation++;
	996
	997	vm_object_drop(object);
	998	}
	999
	1000	/*
	1001	* Locate and return the page at (object, pindex), or NULL if the
	1002	* page could not be found.
	1003	*
	1004	* The caller must hold the vm_object token.
	1005	*/
	1006	vm_page_t
	1007	vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
	1008	{
	1009	vm_page_t m;
	1010
	1011	/*
	1012	* Search the hash table for this object/offset pair
	1013	*/
	1014	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	1015	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
	1016	KKASSERT(m == NULL \|\| (m->object == object && m->pindex == pindex));
	1017	return(m);
	1018	}
	1019
	1020	vm_page_t
	1021	VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
	1022	vm_pindex_t pindex,
	1023	int also_m_busy, const char *msg
	1024	VM_PAGE_DEBUG_ARGS)
	1025	{
	1026	u_int32_t flags;
	1027	vm_page_t m;
	1028
	1029	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	1030	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
	1031	while (m) {
	1032	KKASSERT(m->object == object && m->pindex == pindex);
	1033	flags = m->flags;
	1034	cpu_ccfence();
	1035	if (flags & PG_BUSY) {
	1036	tsleep_interlock(m, 0);
	1037	if (atomic_cmpset_int(&m->flags, flags,
	1038	flags \| PG_WANTED \| PG_REFERENCED)) {
	1039	tsleep(m, PINTERLOCKED, msg, 0);
	1040	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
	1041	pindex);
	1042	}
	1043	} else if (also_m_busy && (flags & PG_SBUSY)) {
	1044	tsleep_interlock(m, 0);
	1045	if (atomic_cmpset_int(&m->flags, flags,
	1046	flags \| PG_WANTED \| PG_REFERENCED)) {
	1047	tsleep(m, PINTERLOCKED, msg, 0);
	1048	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
	1049	pindex);
	1050	}
	1051	} else if (atomic_cmpset_int(&m->flags, flags,
	1052	flags \| PG_BUSY)) {
	1053	#ifdef VM_PAGE_DEBUG
	1054	m->busy_func = func;
	1055	m->busy_line = lineno;
	1056	#endif
	1057	break;
	1058	}
	1059	}
	1060	return m;
	1061	}
	1062
	1063	/*
	1064	* Attempt to lookup and busy a page.
	1065	*
	1066	* Returns NULL if the page could not be found
	1067	*
	1068	* Returns a vm_page and error == TRUE if the page exists but could not
	1069	* be busied.
	1070	*
	1071	* Returns a vm_page and error == FALSE on success.
	1072	*/
	1073	vm_page_t
	1074	VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
	1075	vm_pindex_t pindex,
	1076	int also_m_busy, int *errorp
	1077	VM_PAGE_DEBUG_ARGS)
	1078	{
	1079	u_int32_t flags;
	1080	vm_page_t m;
	1081
	1082	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	1083	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
	1084	*errorp = FALSE;
	1085	while (m) {
	1086	KKASSERT(m->object == object && m->pindex == pindex);
	1087	flags = m->flags;
	1088	cpu_ccfence();
	1089	if (flags & PG_BUSY) {
	1090	*errorp = TRUE;
	1091	break;
	1092	}
	1093	if (also_m_busy && (flags & PG_SBUSY)) {
	1094	*errorp = TRUE;
	1095	break;
	1096	}
	1097	if (atomic_cmpset_int(&m->flags, flags, flags \| PG_BUSY)) {
	1098	#ifdef VM_PAGE_DEBUG
	1099	m->busy_func = func;
	1100	m->busy_line = lineno;
	1101	#endif
	1102	break;
	1103	}
	1104	}
	1105	return m;
	1106	}
	1107
	1108	/*
	1109	* Caller must hold the related vm_object
	1110	*/
	1111	vm_page_t
	1112	vm_page_next(vm_page_t m)
	1113	{
	1114	vm_page_t next;
	1115
	1116	next = vm_page_rb_tree_RB_NEXT(m);
	1117	if (next && next->pindex != m->pindex + 1)
	1118	next = NULL;
	1119	return (next);
	1120	}
	1121
	1122	/*
	1123	* vm_page_rename()
	1124	*
	1125	* Move the given vm_page from its current object to the specified
	1126	* target object/offset. The page must be busy and will remain so
	1127	* on return.
	1128	*
	1129	* new_object must be held.
	1130	* This routine might block. XXX ?
	1131	*
	1132	* NOTE: Swap associated with the page must be invalidated by the move. We
	1133	* have to do this for several reasons: (1) we aren't freeing the
	1134	* page, (2) we are dirtying the page, (3) the VM system is probably
	1135	* moving the page from object A to B, and will then later move
	1136	* the backing store from A to B and we can't have a conflict.
	1137	*
	1138	* NOTE: We always dirty the page. It is necessary both for the
	1139	* fact that we moved it, and because we may be invalidating
	1140	* swap. If the page is on the cache, we have to deactivate it
	1141	* or vm_page_dirty() will panic. Dirty pages are not allowed
	1142	* on the cache.
	1143	*/
	1144	void
	1145	vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
	1146	{
	1147	KKASSERT(m->flags & PG_BUSY);
	1148	ASSERT_LWKT_TOKEN_HELD(vm_object_token(new_object));
	1149	if (m->object) {
	1150	ASSERT_LWKT_TOKEN_HELD(vm_object_token(m->object));
	1151	vm_page_remove(m);
	1152	}
	1153	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
	1154	panic("vm_page_rename: target exists (%p,%"PRIu64")",
	1155	new_object, new_pindex);
	1156	}
	1157	if (m->queue - m->pc == PQ_CACHE)
	1158	vm_page_deactivate(m);
	1159	vm_page_dirty(m);
	1160	}
	1161
	1162	/*
	1163	* vm_page_unqueue() without any wakeup. This routine is used when a page
	1164	* is being moved between queues or otherwise is to remain BUSYied by the
	1165	* caller.
	1166	*
	1167	* This routine may not block.
	1168	*/
	1169	void
	1170	vm_page_unqueue_nowakeup(vm_page_t m)
	1171	{
	1172	vm_page_and_queue_spin_lock(m);
	1173	(void)_vm_page_rem_queue_spinlocked(m);
	1174	vm_page_spin_unlock(m);
	1175	}
	1176
	1177	/*
	1178	* vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon
	1179	* if necessary.
	1180	*
	1181	* This routine may not block.
	1182	*/
	1183	void
	1184	vm_page_unqueue(vm_page_t m)
	1185	{
	1186	u_short queue;
	1187
	1188	vm_page_and_queue_spin_lock(m);
	1189	queue = _vm_page_rem_queue_spinlocked(m);
	1190	if (queue == PQ_FREE \|\| queue == PQ_CACHE) {
	1191	vm_page_spin_unlock(m);
	1192	pagedaemon_wakeup();
	1193	} else {
	1194	vm_page_spin_unlock(m);
	1195	}
	1196	}
	1197
	1198	/*
	1199	* vm_page_list_find()
	1200	*
	1201	* Find a page on the specified queue with color optimization.
	1202	*
	1203	* The page coloring optimization attempts to locate a page that does
	1204	* not overload other nearby pages in the object in the cpu's L1 or L2
	1205	* caches. We need this optimization because cpu caches tend to be
	1206	* physical caches, while object spaces tend to be virtual.
	1207	*
	1208	* On MP systems each PQ_FREE and PQ_CACHE color queue has its own spinlock
	1209	* and the algorithm is adjusted to localize allocations on a per-core basis.
	1210	* This is done by 'twisting' the colors.
	1211	*
	1212	* The page is returned spinlocked and removed from its queue (it will
	1213	* be on PQ_NONE), or NULL. The page is not PG_BUSY'd. The caller
	1214	* is responsible for dealing with the busy-page case (usually by
	1215	* deactivating the page and looping).
	1216	*
	1217	* NOTE: This routine is carefully inlined. A non-inlined version
	1218	* is available for outside callers but the only critical path is
	1219	* from within this source file.
	1220	*
	1221	* NOTE: This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
	1222	* represent stable storage, allowing us to order our locks vm_page
	1223	* first, then queue.
	1224	*/
	1225	static __inline
	1226	vm_page_t
	1227	_vm_page_list_find(int basequeue, int index, boolean_t prefer_zero)
	1228	{
	1229	vm_page_t m;
	1230
	1231	for (;;) {
	1232	if (prefer_zero)
	1233	m = TAILQ_LAST(&vm_page_queues[basequeue+index].pl, pglist);
	1234	else
	1235	m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
	1236	if (m == NULL) {
	1237	m = _vm_page_list_find2(basequeue, index);
	1238	return(m);
	1239	}
	1240	vm_page_and_queue_spin_lock(m);
	1241	if (m->queue == basequeue + index) {
	1242	_vm_page_rem_queue_spinlocked(m);
	1243	/* vm_page_t spin held, no queue spin */
	1244	break;
	1245	}
	1246	vm_page_and_queue_spin_unlock(m);
	1247	}
	1248	return(m);
	1249	}
	1250
	1251	static vm_page_t
	1252	_vm_page_list_find2(int basequeue, int index)
	1253	{
	1254	int i;
	1255	vm_page_t m = NULL;
	1256	struct vpgqueues *pq;
	1257
	1258	pq = &vm_page_queues[basequeue];
	1259
	1260	/*
	1261	* Note that for the first loop, index+i and index-i wind up at the
	1262	* same place. Even though this is not totally optimal, we've already
	1263	* blown it by missing the cache case so we do not care.
	1264	*/
	1265	for (i = PQ_L2_SIZE / 2; i > 0; --i) {
	1266	for (;;) {
	1267	m = TAILQ_FIRST(&pq[(index + i) & PQ_L2_MASK].pl);
	1268	if (m) {
	1269	_vm_page_and_queue_spin_lock(m);
	1270	if (m->queue ==
	1271	basequeue + ((index + i) & PQ_L2_MASK)) {
	1272	_vm_page_rem_queue_spinlocked(m);
	1273	return(m);
	1274	}
	1275	_vm_page_and_queue_spin_unlock(m);
	1276	continue;
	1277	}
	1278	m = TAILQ_FIRST(&pq[(index - i) & PQ_L2_MASK].pl);
	1279	if (m) {
	1280	_vm_page_and_queue_spin_lock(m);
	1281	if (m->queue ==
	1282	basequeue + ((index - i) & PQ_L2_MASK)) {
	1283	_vm_page_rem_queue_spinlocked(m);
	1284	return(m);
	1285	}
	1286	_vm_page_and_queue_spin_unlock(m);
	1287	continue;
	1288	}
	1289	break; /* next i */
	1290	}
	1291	}
	1292	return(m);
	1293	}
	1294
	1295	/*
	1296	* Returns a vm_page candidate for allocation. The page is not busied so
	1297	* it can move around. The caller must busy the page (and typically
	1298	* deactivate it if it cannot be busied!)
	1299	*
	1300	* Returns a spinlocked vm_page that has been removed from its queue.
	1301	*/
	1302	vm_page_t
	1303	vm_page_list_find(int basequeue, int index, boolean_t prefer_zero)
	1304	{
	1305	return(_vm_page_list_find(basequeue, index, prefer_zero));
	1306	}
	1307
	1308	/*
	1309	* Find a page on the cache queue with color optimization, remove it
	1310	* from the queue, and busy it. The returned page will not be spinlocked.
	1311	*
	1312	* A candidate failure will be deactivated. Candidates can fail due to
	1313	* being busied by someone else, in which case they will be deactivated.
	1314	*
	1315	* This routine may not block.
	1316	*
	1317	*/
	1318	static vm_page_t
	1319	vm_page_select_cache(u_short pg_color)
	1320	{
	1321	vm_page_t m;
	1322
	1323	for (;;) {
	1324	m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK, FALSE);
	1325	if (m == NULL)
	1326	break;
	1327	/*
	1328	* (m) has been removed from its queue and spinlocked
	1329	*/
	1330	if (vm_page_busy_try(m, TRUE)) {
	1331	_vm_page_deactivate_locked(m, 0);
	1332	vm_page_spin_unlock(m);
	1333	#ifdef INVARIANTS
	1334	kprintf("Warning: busy page %p found in cache\n", m);
	1335	#endif
	1336	} else {
	1337	/*
	1338	* We successfully busied the page
	1339	*/
	1340	if ((m->flags & (PG_UNMANAGED \| PG_NEED_COMMIT)) == 0 &&
	1341	m->hold_count == 0 &&
	1342	m->wire_count == 0 &&
	1343	(m->dirty & m->valid) == 0) {
	1344	vm_page_spin_unlock(m);
	1345	pagedaemon_wakeup();
	1346	return(m);
	1347	}
	1348
	1349	/*
	1350	* The page cannot be recycled, deactivate it.
	1351	*/
	1352	_vm_page_deactivate_locked(m, 0);
	1353	if (_vm_page_wakeup(m)) {
	1354	vm_page_spin_unlock(m);
	1355	wakeup(m);
	1356	} else {
	1357	vm_page_spin_unlock(m);
	1358	}
	1359	}
	1360	}
	1361	return (m);
	1362	}
	1363
	1364	/*
	1365	* Find a free or zero page, with specified preference. We attempt to
	1366	* inline the nominal case and fall back to _vm_page_select_free()
	1367	* otherwise. A busied page is removed from the queue and returned.
	1368	*
	1369	* This routine may not block.
	1370	*/
	1371	static __inline vm_page_t
	1372	vm_page_select_free(u_short pg_color, boolean_t prefer_zero)
	1373	{
	1374	vm_page_t m;
	1375
	1376	for (;;) {
	1377	m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK,
	1378	prefer_zero);
	1379	if (m == NULL)
	1380	break;
	1381	if (vm_page_busy_try(m, TRUE)) {
	1382	/*
	1383	* Various mechanisms such as a pmap_collect can
	1384	* result in a busy page on the free queue. We
	1385	* have to move the page out of the way so we can
	1386	* retry the allocation. If the other thread is not
	1387	* allocating the page then m->valid will remain 0 and
	1388	* the pageout daemon will free the page later on.
	1389	*
	1390	* Since we could not busy the page, however, we
	1391	* cannot make assumptions as to whether the page
	1392	* will be allocated by the other thread or not,
	1393	* so all we can do is deactivate it to move it out
	1394	* of the way. In particular, if the other thread
	1395	* wires the page it may wind up on the inactive
	1396	* queue and the pageout daemon will have to deal
	1397	* with that case too.
	1398	*/
	1399	_vm_page_deactivate_locked(m, 0);
	1400	vm_page_spin_unlock(m);
	1401	#ifdef INVARIANTS
	1402	kprintf("Warning: busy page %p found in cache\n", m);
	1403	#endif
	1404	} else {
	1405	/*
	1406	* Theoretically if we are able to busy the page
	1407	* atomic with the queue removal (using the vm_page
	1408	* lock) nobody else should be able to mess with the
	1409	* page before us.
	1410	*/
	1411	KKASSERT((m->flags & (PG_UNMANAGED \|
	1412	PG_NEED_COMMIT)) == 0);
	1413	KKASSERT(m->hold_count == 0);
	1414	KKASSERT(m->wire_count == 0);
	1415	vm_page_spin_unlock(m);
	1416	pagedaemon_wakeup();
	1417
	1418	/* return busied and removed page */
	1419	return(m);
	1420	}
	1421	}
	1422	return(m);
	1423	}
	1424
	/*
	 * This implements a per-cpu cache of free, zero'd, ready-to-go pages.
	 * The idea is to populate this cache prior to acquiring any locks so
	 * we don't wind up potentially zeroing VM pages (under heavy loads) while
	 * holding potentially contending locks.
	 *
	 * Note that we allocate the page uninserted into anything and use a pindex
	 * of 0, the vm_page_alloc() will effectively add gd_cpuid so these
	 * allocations should wind up being uncontended.  However, we still want
	 * to rove across PQ_L2_SIZE.
	 *
	 * NOTE: The whole body is currently compiled out (#if 0), so this
	 *	 function is a no-op as built.
	 */
	void
	vm_page_pcpu_cache(void)
	{
	#if 0
		globaldata_t gd = mycpu;
		vm_page_t m;

		if (gd->gd_vmpg_count < GD_MINVMPG) {
			crit_enter_gd(gd);
			while (gd->gd_vmpg_count < GD_MAXVMPG) {
				m = vm_page_alloc(NULL, ticks & ~ncpus2_mask,
						  VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK |
						  VM_ALLOC_ZERO);
				/*
				 * vm_page_alloc() returns NULL when it cannot
				 * find a page; stop refilling rather than
				 * dereferencing NULL or spinning here.
				 */
				if (m == NULL)
					break;
				if (gd->gd_vmpg_count < GD_MAXVMPG) {
					if ((m->flags & PG_ZERO) == 0) {
						pmap_zero_page(VM_PAGE_TO_PHYS(m));
						vm_page_flag_set(m, PG_ZERO);
					}
					gd->gd_vmpg_array[gd->gd_vmpg_count++] = m;
				} else {
					/* cache filled up meanwhile, give it back */
					vm_page_free(m);
				}
			}
			crit_exit_gd(gd);
		}
	#endif
	}
	1463
	1464	/*
	1465	* vm_page_alloc()
	1466	*
	1467	* Allocate and return a memory cell associated with this VM object/offset
	1468	* pair. If object is NULL an unassociated page will be allocated and
		* pindex is merely stored in m->pindex.
	1469	*
	1470	* The returned page will be busied and removed from its queues. NULL is
	1471	* returned when no page can be taken from the free or cache queues, or
	1472	* when a page is found to already exist at the specified (object, pindex)
		* and VM_ALLOC_NULL_OK was given (without it that collision panics).
	1473	*
		* At least one of VM_ALLOC_NORMAL, VM_ALLOC_QUICK, VM_ALLOC_SYSTEM or
		* VM_ALLOC_INTERRUPT must be set in page_req (asserted below).
		* VM_ALLOC_SYSTEM is added automatically for TDF_SYSTHREAD threads.
		*
	1474	* VM_ALLOC_NORMAL allow use of cache pages, nominal free drain
	1475	* VM_ALLOC_QUICK like normal but cannot use cache
	1476	* VM_ALLOC_SYSTEM greater free drain
	1477	* VM_ALLOC_INTERRUPT allow free list to be completely drained
	1478	* VM_ALLOC_ZERO advisory request for pre-zero'd page only
	1479	* VM_ALLOC_FORCE_ZERO treated exactly like VM_ALLOC_ZERO by this routine
	1480	* VM_ALLOC_NULL_OK ok to return NULL on insertion collision
	1481	* (see vm_page_grab())
	1482	* VM_ALLOC_USE_GD ok to use per-gd cache (the code consuming this
		* flag is currently compiled out with #if 0)
	1483	*
	1484	* The object must be held if not NULL
	1485	* NOTE(review): this header used to state both that the routine can
		* block and that it may not block -- confirm which one holds.
	1486	*
	1487	* Additional special handling is required when called from an interrupt
	1488	* (VM_ALLOC_INTERRUPT). We are not allowed to mess with the page cache
	1489	* in this case.
	1490	*/
	1491	vm_page_t
	1492	vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
	1493	{
	1494	globaldata_t gd = mycpu;
	1495	vm_object_t obj;
	1496	vm_page_t m;
	1497	u_short pg_color;
	1498
	1499	#if 0
	1500	/*
	1501	* Special per-cpu free VM page cache. The pages are pre-busied
	1502	* and pre-zerod for us.
	1503	*/
	1504	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
	1505	crit_enter_gd(gd);
	1506	if (gd->gd_vmpg_count) {
	1507	m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
	1508	crit_exit_gd(gd);
	1509	goto done;
	1510	}
	1511	crit_exit_gd(gd);
	1512	}
	1513	#endif
	1514	m = NULL;
	1515
	1516	/*
	1517	* Cpu twist - cpu localization algorithm
		*
		* The starting color is derived from the current cpu id, the page
		* index and (when there is an object) the object's base color.
	1518	*/
	1519	if (object) {
	1520	pg_color = gd->gd_cpuid + (pindex & ~ncpus_fit_mask) +
	1521	(object->pg_color & ~ncpus_fit_mask);
	1522	} else {
	1523	pg_color = gd->gd_cpuid + (pindex & ~ncpus_fit_mask);
	1524	}
		/* The caller must select at least one allocation class. */
	1525	KKASSERT(page_req &
	1526	(VM_ALLOC_NORMAL\|VM_ALLOC_QUICK\|
	1527	VM_ALLOC_INTERRUPT\|VM_ALLOC_SYSTEM));
	1528
	1529	/*
	1530	* Certain system threads (pageout daemon, buf_daemon's) are
	1531	* allowed to eat deeper into the free page list.
	1532	*/
	1533	if (curthread->td_flags & TDF_SYSTHREAD)
	1534	page_req \|= VM_ALLOC_SYSTEM;
	1535
		/*
		* Three-way decision, re-evaluated on every pass through 'loop':
		* (1) enough free pages for this request class -> take a free page,
		* (2) otherwise VM_ALLOC_NORMAL may recycle a cache page into the
		* free queue and retry, (3) otherwise fail with NULL.
		*/
	1536	loop:
	1537	if (vmstats.v_free_count > vmstats.v_free_reserved \|\|
	1538	((page_req & VM_ALLOC_INTERRUPT) && vmstats.v_free_count > 0) \|\|
	1539	((page_req & VM_ALLOC_SYSTEM) && vmstats.v_cache_count == 0 &&
	1540	vmstats.v_free_count > vmstats.v_interrupt_free_min)
	1541	) {
	1542	/*
	1543	* The free queue has sufficient free pages to take one out.
	1544	*/
	1545	if (page_req & (VM_ALLOC_ZERO \| VM_ALLOC_FORCE_ZERO))
	1546	m = vm_page_select_free(pg_color, TRUE);
	1547	else
	1548	m = vm_page_select_free(pg_color, FALSE);
	1549	} else if (page_req & VM_ALLOC_NORMAL) {
	1550	/*
	1551	* Allocatable from the cache (non-interrupt only). On
	1552	* success, we must free the page and try again, thus
	1553	* ensuring that vmstats.v_*_free_min counters are replenished.
	1554	*/
	1555	#ifdef INVARIANTS
	1556	if (curthread->td_preempted) {
	1557	kprintf("vm_page_alloc(): warning, attempt to allocate"
	1558	" cache page from preempting interrupt\n");
	1559	m = NULL;
	1560	} else {
	1561	m = vm_page_select_cache(pg_color);
	1562	}
	1563	#else
	1564	m = vm_page_select_cache(pg_color);
	1565	#endif
	1566	/*
	1567	* On success move the page into the free queue and loop.
	1568	*
	1569	* Only do this if we can safely acquire the vm_object lock,
	1570	* because this is effectively a random page and the caller
	1571	* might be holding the lock shared, we don't want to
	1572	* deadlock.
		*
		* If the object cannot be held without blocking the page is
		* deactivated and un-busied instead, and we retry regardless.
	1573	*/
	1574	if (m != NULL) {
	1575	KASSERT(m->dirty == 0,
	1576	("Found dirty cache page %p", m));
	1577	if ((obj = m->object) != NULL) {
	1578	if (vm_object_hold_try(obj)) {
	1579	vm_page_protect(m, VM_PROT_NONE);
	1580	vm_page_free(m);
	1581	/* m->object NULL here */
	1582	vm_object_drop(obj);
	1583	} else {
	1584	vm_page_deactivate(m);
	1585	vm_page_wakeup(m);
	1586	}
	1587	} else {
	1588	vm_page_protect(m, VM_PROT_NONE);
	1589	vm_page_free(m);
	1590	}
	1591	goto loop;
	1592	}
	1593
	1594	/*
	1595	* On failure return NULL
		* (after recording the deficit and kicking the pageout daemon)
	1596	*/
	1597	#if defined(DIAGNOSTIC)
	1598	if (vmstats.v_cache_count > 0)
	1599	kprintf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", vmstats.v_cache_count);
	1600	#endif
	1601	vm_pageout_deficit++;
	1602	pagedaemon_wakeup();
	1603	return (NULL);
	1604	} else {
	1605	/*
	1606	* No pages available, wakeup the pageout daemon and give up.
	1607	*/
	1608	vm_pageout_deficit++;
	1609	pagedaemon_wakeup();
	1610	return (NULL);
	1611	}
	1612
	1613	/*
	1614	* v_free_count can race so loop if we don't find the expected
	1615	* page.
		*
		* Only the free-queue branch above falls through to here; the
		* other two branches either jump back to 'loop' or return.
	1616	*/
	1617	if (m == NULL)
	1618	goto loop;
	1619
	1620	/*
	1621	* Good page found. The page has already been busied for us and
	1622	* removed from its queues.
	1623	*/
	1624	KASSERT(m->dirty == 0,
	1625	("vm_page_alloc: free/cache page %p was dirty", m));
	1626	KKASSERT(m->queue == PQ_NONE);
	1627
	1628	#if 0
	1629	done:
	1630	#endif
	1631	/*
	1632	* Initialize the structure, inheriting some flags but clearing
	1633	* all the rest. The page has already been busied for us.
		*
		* Only PG_ZERO, PG_BUSY and PG_SBUSY survive the clear.
	1634	*/
	1635	vm_page_flag_clear(m, ~(PG_ZERO \| PG_BUSY \| PG_SBUSY));
	1636	KKASSERT(m->wire_count == 0);
	1637	KKASSERT(m->busy == 0);
	1638	m->act_count = 0;
	1639	m->valid = 0;
	1640
	1641	/*
	1642	* Caller must be holding the object lock (asserted by
	1643	* vm_page_insert()).
	1644	*
	1645	* NOTE: Inserting a page here does not insert it into any pmaps
	1646	* (which could cause us to block allocating memory).
	1647	*
	1648	* NOTE: If no object an unassociated page is allocated, m->pindex
	1649	* can be used by the caller for any purpose.
		*
		* NOTE: If the insertion collides with an existing page the new
		* page is freed again and NULL is returned, but only when the
		* caller passed VM_ALLOC_NULL_OK; otherwise we panic.
	1650	*/
	1651	if (object) {
	1652	if (vm_page_insert(m, object, pindex) == FALSE) {
	1653	kprintf("PAGE RACE (%p:%d,%"PRIu64")\n",
	1654	object, object->type, pindex);
	1655	vm_page_free(m);
	1656	m = NULL;
	1657	if ((page_req & VM_ALLOC_NULL_OK) == 0)
	1658	panic("PAGE RACE");
	1659	}
	1660	} else {
	1661	m->pindex = pindex;
	1662	}
	1663
	1664	/*
	1665	* Don't wakeup too often - wakeup the pageout daemon when
	1666	* we would be nearly out of memory.
	1667	*/
	1668	pagedaemon_wakeup();
	1669
	1670	/*
	1671	* A PG_BUSY page is returned.
		* (or NULL after an insertion collision with VM_ALLOC_NULL_OK)
	1672	*/
	1673	return (m);
	1674	}
	1675
	1676	/*
	1677	* Attempt to allocate contiguous physical memory with the specified
	1678	* requirements.
	1679	*/
	1680	vm_page_t
	1681	vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
	1682	unsigned long alignment, unsigned long boundary,
	1683	unsigned long size)
	1684	{
	1685	alist_blk_t blk;
	1686
	1687	alignment >>= PAGE_SHIFT;
	1688	if (alignment == 0)
	1689	alignment = 1;
	1690	boundary >>= PAGE_SHIFT;
	1691	if (boundary == 0)
	1692	boundary = 1;
	1693	size = (size + PAGE_MASK) >> PAGE_SHIFT;
	1694
	1695	spin_lock(&vm_contig_spin);
	1696	blk = alist_alloc(&vm_contig_alist, 0, size);
	1697	if (blk == ALIST_BLOCK_NONE) {
	1698	spin_unlock(&vm_contig_spin);
	1699	if (bootverbose) {
	1700	kprintf("vm_page_alloc_contig: %ldk nospace\n",
	1701	(size + PAGE_MASK) * (PAGE_SIZE / 1024));
	1702	}
	1703	return(NULL);
	1704	}
	1705	if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
	1706	alist_free(&vm_contig_alist, blk, size);
	1707	spin_unlock(&vm_contig_spin);
	1708	if (bootverbose) {
	1709	kprintf("vm_page_alloc_contig: %ldk high "
	1710	"%016jx failed\n",
	1711	(size + PAGE_MASK) * (PAGE_SIZE / 1024),
	1712	(intmax_t)high);
	1713	}
	1714	return(NULL);
	1715	}
	1716	spin_unlock(&vm_contig_spin);
	1717	if (vm_contig_verbose) {
	1718	kprintf("vm_page_alloc_contig: %016jx/%ldk\n",
	1719	(intmax_t)(vm_paddr_t)blk << PAGE_SHIFT,
	1720	(size + PAGE_MASK) * (PAGE_SIZE / 1024));
	1721	}
	1722	return (PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT));
	1723	}
	1724
	1725	/*
	1726	* Free contiguously allocated pages. The pages will be wired but not busy.
	1727	* When freeing to the alist we leave them wired and not busy.
	1728	*/
	1729	void
	1730	vm_page_free_contig(vm_page_t m, unsigned long size)
	1731	{
	1732	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
	1733	vm_pindex_t start = pa >> PAGE_SHIFT;
	1734	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
	1735
	1736	if (vm_contig_verbose) {
	1737	kprintf("vm_page_free_contig: %016jx/%ldk\n",
	1738	(intmax_t)pa, size / 1024);
	1739	}
	1740	if (pa < vm_low_phys_reserved) {
	1741	KKASSERT(pa + size <= vm_low_phys_reserved);
	1742	spin_lock(&vm_contig_spin);
	1743	alist_free(&vm_contig_alist, start, pages);
	1744	spin_unlock(&vm_contig_spin);
	1745	} else {
	1746	while (pages) {
	1747	vm_page_busy_wait(m, FALSE, "cpgfr");
	1748	vm_page_unwire(m, 0);
	1749	vm_page_free(m);
	1750	--pages;
	1751	++m;
	1752	}
	1753
	1754	}
	1755	}
	1756
	1757
	/*
	 * Sleep, repeatedly if need be, until vm_page_count_min(0) no longer
	 * reports a shortage.  Meant for nominal heavy memory use kernel
	 * operations.
	 *
	 * WARNING!  Be sure never to call this in any vm_pageout code path,
	 *	     which will trivially deadlock the system.
	 */
	void
	vm_wait_nominal(void)
	{
		for (;;) {
			if (!vm_page_count_min(0))
				break;
			vm_wait(0);
		}
	}
	1771
	/*
	 * Report whether vm_wait_nominal() would block right now:
	 * 1 if vm_page_count_min(0) indicates a shortage, 0 otherwise.
	 */
	int
	vm_test_nominal(void)
	{
		return (vm_page_count_min(0) ? 1 : 0);
	}
	1782
	1783	/*
	1784	* Block until free pages are available for allocation, called in various
	1785	* places before memory allocations.
	1786	*
	1787	* The caller may loop if vm_page_count_min() == FALSE so we cannot be
	1788	* more generous then that.
	1789	*/
	1790	void
	1791	vm_wait(int timo)
	1792	{
	1793	/*
	1794	* never wait forever
	1795	*/
	1796	if (timo == 0)
	1797	timo = hz;
	1798	lwkt_gettoken(&vm_token);
	1799
	1800	if (curthread == pagethread) {
	1801	/*
	1802	* The pageout daemon itself needs pages, this is bad.
	1803	*/
	1804	if (vm_page_count_min(0)) {
	1805	vm_pageout_pages_needed = 1;
	1806	tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
	1807	}
	1808	} else {
	1809	/*
	1810	* Wakeup the pageout daemon if necessary and wait.
	1811	*/
	1812	if (vm_page_count_target()) {
	1813	if (vm_pages_needed == 0) {
	1814	vm_pages_needed = 1;
	1815	wakeup(&vm_pages_needed);
	1816	}
	1817	++vm_pages_waiting; /* SMP race ok */
	1818	tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
	1819	}
	1820	}
	1821	lwkt_reltoken(&vm_token);
	1822	}
	1823
	1824	/*
	1825	* Block until free pages are available for allocation
	1826	*
	1827	* Called only from vm_fault so that processes page faulting can be
	1828	* easily tracked.
	1829	*/
	1830	void
	1831	vm_waitpfault(void)
	1832	{
	1833	/*
	1834	* Wakeup the pageout daemon if necessary and wait.
	1835	*/
	1836	if (vm_page_count_target()) {
	1837	lwkt_gettoken(&vm_token);
	1838	if (vm_page_count_target()) {
	1839	if (vm_pages_needed == 0) {
	1840	vm_pages_needed = 1;
	1841	wakeup(&vm_pages_needed);
	1842	}
	1843	++vm_pages_waiting; /* SMP race ok */
	1844	tsleep(&vmstats.v_free_count, 0, "pfault", hz);
	1845	}
	1846	lwkt_reltoken(&vm_token);
	1847	}
	1848	}
	1849
	1850	/*
	1851	* Put the specified page on the active list (if appropriate). Ensure
	1852	* that act_count is at least ACT_INIT but do not otherwise mess with it.
	1853	*
	1854	* The caller should be holding the page busied ? XXX
	1855	* This routine may not block.
	1856	*/
	1857	void
	1858	vm_page_activate(vm_page_t m)
	1859	{
	1860	u_short oqueue;
	1861
	1862	vm_page_spin_lock(m);
	1863	if (m->queue - m->pc != PQ_ACTIVE) {
	1864	_vm_page_queue_spin_lock(m);
	1865	oqueue = _vm_page_rem_queue_spinlocked(m);
	1866	/* page is left spinlocked, queue is unlocked */
	1867
	1868	if (oqueue == PQ_CACHE)
	1869	mycpu->gd_cnt.v_reactivated++;
	1870	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
	1871	if (m->act_count < ACT_INIT)
	1872	m->act_count = ACT_INIT;
	1873	_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
	1874	}
	1875	_vm_page_and_queue_spin_unlock(m);
	1876	if (oqueue == PQ_CACHE \|\| oqueue == PQ_FREE)
	1877	pagedaemon_wakeup();
	1878	} else {
	1879	if (m->act_count < ACT_INIT)
	1880	m->act_count = ACT_INIT;
	1881	vm_page_spin_unlock(m);
	1882	}
	1883	}
	1884
	1885	/*
	1886	* Helper routine for vm_page_free_toq() and vm_page_cache(). This
	1887	* routine is called when a page has been added to the cache or free
	1888	* queues.
	1889	*
	1890	* This routine may not block.
	1891	*/
	1892	static __inline void
	1893	vm_page_free_wakeup(void)
	1894	{
	1895	/*
	1896	* If the pageout daemon itself needs pages, then tell it that
	1897	* there are some free.
	1898	*/
	1899	if (vm_pageout_pages_needed &&
	1900	vmstats.v_cache_count + vmstats.v_free_count >=
	1901	vmstats.v_pageout_free_min
	1902	) {
	1903	wakeup(&vm_pageout_pages_needed);
	1904	vm_pageout_pages_needed = 0;
	1905	}
	1906
	1907	/*
	1908	* Wakeup processes that are waiting on memory.
	1909	*
	1910	* NOTE: vm_paging_target() is the pageout daemon's target, while
	1911	* vm_page_count_target() is somewhere inbetween. We want
	1912	* to wake processes up prior to the pageout daemon reaching
	1913	* its target to provide some hysteresis.
	1914	*/
	1915	if (vm_pages_waiting) {
	1916	if (!vm_page_count_target()) {
	1917	/*
	1918	* Plenty of pages are free, wakeup everyone.
	1919	*/
	1920	vm_pages_waiting = 0;
	1921	wakeup(&vmstats.v_free_count);
	1922	++mycpu->gd_cnt.v_ppwakeups;
	1923	} else if (!vm_page_count_min(0)) {
	1924	/*
	1925	* Some pages are free, wakeup someone.
	1926	*/
	1927	int wcount = vm_pages_waiting;
	1928	if (wcount > 0)
	1929	--wcount;
	1930	vm_pages_waiting = wcount;
	1931	wakeup_one(&vmstats.v_free_count);
	1932	++mycpu->gd_cnt.v_ppwakeups;
	1933	}
	1934	}
	1935	}
	1936
	1937	/*
	1938	* Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
	1939	* it from its VM object.
	1940	*
	1941	* The vm_page must be PG_BUSY on entry. PG_BUSY will be released on
	1942	* return (the page will have been freed).
	1943	*/
	1944	void
	1945	vm_page_free_toq(vm_page_t m)
	1946	{
	1947	mycpu->gd_cnt.v_tfree++;
	1948	KKASSERT((m->flags & PG_MAPPED) == 0);
	1949	KKASSERT(m->flags & PG_BUSY);
	1950
	1951	if (m->busy \|\| ((m->queue - m->pc) == PQ_FREE)) {
	1952	kprintf("vm_page_free: pindex(%lu), busy(%d), "
	1953	"PG_BUSY(%d), hold(%d)\n",
	1954	(u_long)m->pindex, m->busy,
	1955	((m->flags & PG_BUSY) ? 1 : 0), m->hold_count);
	1956	if ((m->queue - m->pc) == PQ_FREE)
	1957	panic("vm_page_free: freeing free page");
	1958	else
	1959	panic("vm_page_free: freeing busy page");
	1960	}
	1961
	1962	/*
	1963	* Remove from object, spinlock the page and its queues and
	1964	* remove from any queue. No queue spinlock will be held
	1965	* after this section (because the page was removed from any
	1966	* queue).
	1967	*/
	1968	vm_page_remove(m);
	1969	vm_page_and_queue_spin_lock(m);
	1970	_vm_page_rem_queue_spinlocked(m);
	1971
	1972	/*
	1973	* No further management of fictitious pages occurs beyond object
	1974	* and queue removal.
	1975	*/
	1976	if ((m->flags & PG_FICTITIOUS) != 0) {
	1977	vm_page_spin_unlock(m);
	1978	vm_page_wakeup(m);
	1979	return;
	1980	}
	1981
	1982	m->valid = 0;
	1983	vm_page_undirty(m);
	1984
	1985	if (m->wire_count != 0) {
	1986	if (m->wire_count > 1) {
	1987	panic(
	1988	"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
	1989	m->wire_count, (long)m->pindex);
	1990	}
	1991	panic("vm_page_free: freeing wired page");
	1992	}
	1993
	1994	/*
	1995	* Clear the UNMANAGED flag when freeing an unmanaged page.
	1996	* Clear the NEED_COMMIT flag
	1997	*/
	1998	if (m->flags & PG_UNMANAGED)
	1999	vm_page_flag_clear(m, PG_UNMANAGED);
	2000	if (m->flags & PG_NEED_COMMIT)
	2001	vm_page_flag_clear(m, PG_NEED_COMMIT);
	2002
	2003	if (m->hold_count != 0) {
	2004	vm_page_flag_clear(m, PG_ZERO);
	2005	_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
	2006	} else {
	2007	_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 0);
	2008	}
	2009
	2010	/*
	2011	* This sequence allows us to clear PG_BUSY while still holding
	2012	* its spin lock, which reduces contention vs allocators. We
	2013	* must not leave the queue locked or _vm_page_wakeup() may
	2014	* deadlock.
	2015	*/
	2016	_vm_page_queue_spin_unlock(m);
	2017	if (_vm_page_wakeup(m)) {
	2018	vm_page_spin_unlock(m);
	2019	wakeup(m);
	2020	} else {
	2021	vm_page_spin_unlock(m);
	2022	}
	2023	vm_page_free_wakeup();
	2024	}
	2025
	2026	/*
	2027	* vm_page_free_fromq_fast()
	2028	*
	2029	* Remove a non-zero page from one of the free queues; the page is removed for
	2030	* zeroing, so do not issue a wakeup.
	2031	*/
	2032	vm_page_t
	2033	vm_page_free_fromq_fast(void)
	2034	{
	2035	static int qi;
	2036	vm_page_t m;
	2037	int i;
	2038
	2039	for (i = 0; i < PQ_L2_SIZE; ++i) {
	2040	m = vm_page_list_find(PQ_FREE, qi, FALSE);
	2041	/* page is returned spinlocked and removed from its queue */
	2042	if (m) {
	2043	if (vm_page_busy_try(m, TRUE)) {
	2044	/*
	2045	* We were unable to busy the page, deactivate
	2046	* it and loop.
	2047	*/
	2048	_vm_page_deactivate_locked(m, 0);
	2049	vm_page_spin_unlock(m);
	2050	} else if (m->flags & PG_ZERO) {
	2051	/*
	2052	* The page is PG_ZERO, requeue it and loop
	2053	*/
	2054	_vm_page_add_queue_spinlocked(m,
	2055	PQ_FREE + m->pc,
	2056	0);
	2057	vm_page_queue_spin_unlock(m);
	2058	if (_vm_page_wakeup(m)) {
	2059	vm_page_spin_unlock(m);
	2060	wakeup(m);
	2061	} else {
	2062	vm_page_spin_unlock(m);
	2063	}
	2064	} else {
	2065	/*
	2066	* The page is not PG_ZERO'd so return it.
	2067	*/
	2068	vm_page_spin_unlock(m);
	2069	KKASSERT((m->flags & (PG_UNMANAGED \|
	2070	PG_NEED_COMMIT)) == 0);
	2071	KKASSERT(m->hold_count == 0);
	2072	KKASSERT(m->wire_count == 0);
	2073	break;
	2074	}
	2075	m = NULL;
	2076	}
	2077	qi = (qi + PQ_PRIME2) & PQ_L2_MASK;
	2078	}
	2079	return (m);
	2080	}
	2081
	2082	/*
	2083	* vm_page_unmanage()
	2084	*
	2085	* Prevent PV management from being done on the page. The page is
	2086	* removed from the paging queues as if it were wired, and as a
	2087	* consequence of no longer being managed the pageout daemon will not
	2088	* touch it (since there is no way to locate the pte mappings for the
	2089	* page). madvise() calls that mess with the pmap will also no longer
	2090	* operate on the page.
	2091	*
	2092	* Beyond that the page is still reasonably 'normal'. Freeing the page
	2093	* will clear the flag.
	2094	*
	2095	* This routine is used by OBJT_PHYS objects - objects using unswappable
	2096	* physical memory as backing store rather then swap-backed memory and
	2097	* will eventually be extended to support 4MB unmanaged physical
	2098	* mappings.
	2099	*
	2100	* Caller must be holding the page busy.
	2101	*/
	2102	void
	2103	vm_page_unmanage(vm_page_t m)
	2104	{
	2105	KKASSERT(m->flags & PG_BUSY);
	2106	if ((m->flags & PG_UNMANAGED) == 0) {
	2107	if (m->wire_count == 0)
	2108	vm_page_unqueue(m);
	2109	}
	2110	vm_page_flag_set(m, PG_UNMANAGED);
	2111	}
	2112
	2113	/*
	2114	* Mark this page as wired down by yet another map, removing it from
	2115	* paging queues as necessary.
	2116	*
	2117	* Caller must be holding the page busy.
	2118	*/
	2119	void
	2120	vm_page_wire(vm_page_t m)
	2121	{
	2122	/*
	2123	* Only bump the wire statistics if the page is not already wired,
	2124	* and only unqueue the page if it is on some queue (if it is unmanaged
	2125	* it is already off the queues). Don't do anything with fictitious
	2126	* pages because they are always wired.
	2127	*/
	2128	KKASSERT(m->flags & PG_BUSY);
	2129	if ((m->flags & PG_FICTITIOUS) == 0) {
	2130	if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
	2131	if ((m->flags & PG_UNMANAGED) == 0)
	2132	vm_page_unqueue(m);
	2133	atomic_add_int(&vmstats.v_wire_count, 1);
	2134	}
	2135	KASSERT(m->wire_count != 0,
	2136	("vm_page_wire: wire_count overflow m=%p", m));
	2137	}
	2138	}
	2139
	2140	/*
	2141	* Release one wiring of this page, potentially enabling it to be paged again.
	2142	*
	2143	* Many pages placed on the inactive queue should actually go
	2144	* into the cache, but it is difficult to figure out which. What
	2145	* we do instead, if the inactive target is well met, is to put
	2146	* clean pages at the head of the inactive queue instead of the tail.
	2147	* This will cause them to be moved to the cache more quickly and
	2148	* if not actively re-referenced, freed more quickly. If we just
	2149	* stick these pages at the end of the inactive queue, heavy filesystem
	2150	* meta-data accesses can cause an unnecessary paging load on memory bound
	2151	* processes. This optimization causes one-time-use metadata to be
	2152	* reused more quickly.
	2153	*
	2154	* Pages marked PG_NEED_COMMIT are always activated and never placed on
	2155	* the inactive queue. This helps the pageout daemon determine memory
	2156	* pressure and act on out-of-memory situations more quickly.
	2157	*
	2158	* BUT, if we are in a low-memory situation we have no choice but to
	2159	* put clean pages on the cache queue.
	2160	*
	2161	* A number of routines use vm_page_unwire() to guarantee that the page
	2162	* will go into either the inactive or active queues, and will NEVER
	2163	* be placed in the cache - for example, just after dirtying a page.
	2164	* dirty pages in the cache are not allowed.
	2165	*
	2166	* The page queues must be locked.
	2167	* This routine may not block.
	2168	*/
	2169	void
	2170	vm_page_unwire(vm_page_t m, int activate)
	2171	{
	2172	KKASSERT(m->flags & PG_BUSY);
	2173	if (m->flags & PG_FICTITIOUS) {
	2174	/* do nothing */
	2175	} else if (m->wire_count <= 0) {
	2176	panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
	2177	} else {
	2178	if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
	2179	atomic_add_int(&vmstats.v_wire_count, -1);
	2180	if (m->flags & PG_UNMANAGED) {
	2181	;
	2182	} else if (activate \|\| (m->flags & PG_NEED_COMMIT)) {
	2183	vm_page_spin_lock(m);
	2184	_vm_page_add_queue_spinlocked(m,
	2185	PQ_ACTIVE + m->pc, 0);
	2186	_vm_page_and_queue_spin_unlock(m);
	2187	} else {
	2188	vm_page_spin_lock(m);
	2189	vm_page_flag_clear(m, PG_WINATCFLS);
	2190	_vm_page_add_queue_spinlocked(m,
	2191	PQ_INACTIVE + m->pc, 0);
	2192	++vm_swapcache_inactive_heuristic;
	2193	_vm_page_and_queue_spin_unlock(m);
	2194	}
	2195	}
	2196	}
	2197	}
	2198
	2199	/*
	2200	* Move the specified page to the inactive queue. If the page has
	2201	* any associated swap, the swap is deallocated.
	2202	*
	2203	* Normally athead is 0 resulting in LRU operation. athead is set
	2204	* to 1 if we want this page to be 'as if it were placed in the cache',
	2205	* except without unmapping it from the process address space.
	2206	*
	2207	* vm_page's spinlock must be held on entry and will remain held on return.
	2208	* This routine may not block.
	2209	*/
	2210	static void
	2211	_vm_page_deactivate_locked(vm_page_t m, int athead)
	2212	{
	2213	u_short oqueue;
	2214
	2215	/*
	2216	* Ignore if already inactive.
	2217	*/
	2218	if (m->queue - m->pc == PQ_INACTIVE)
	2219	return;
	2220	_vm_page_queue_spin_lock(m);
	2221	oqueue = _vm_page_rem_queue_spinlocked(m);
	2222
	2223	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
	2224	if (oqueue == PQ_CACHE)
	2225	mycpu->gd_cnt.v_reactivated++;
	2226	vm_page_flag_clear(m, PG_WINATCFLS);
	2227	_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
	2228	if (athead == 0)
	2229	++vm_swapcache_inactive_heuristic;
	2230	}
	2231	_vm_page_queue_spin_unlock(m);
	2232	/* leaves vm_page spinlocked */
	2233	}
	2234
	2235	/*
	2236	* Attempt to deactivate a page.
	2237	*
	2238	* No requirements.
	2239	*/
	2240	void
	2241	vm_page_deactivate(vm_page_t m)
	2242	{
	2243	vm_page_spin_lock(m);
	2244	_vm_page_deactivate_locked(m, 0);
	2245	vm_page_spin_unlock(m);
	2246	}
	2247
	2248	void
	2249	vm_page_deactivate_locked(vm_page_t m)
	2250	{
	2251	_vm_page_deactivate_locked(m, 0);
	2252	}
	2253
	2254	/*
	2255	* Attempt to move a page to PQ_CACHE.
	2256	*
	2257	* Returns 0 on failure, 1 on success
	2258	*
	2259	* The page should NOT be busied by the caller. This function will validate
	2260	* whether the page can be safely moved to the cache.
	2261	*/
	2262	int
	2263	vm_page_try_to_cache(vm_page_t m)
	2264	{
	2265	vm_page_spin_lock(m);
	2266	if (vm_page_busy_try(m, TRUE)) {
	2267	vm_page_spin_unlock(m);
	2268	return(0);
	2269	}
	2270	if (m->dirty \|\| m->hold_count \|\| m->wire_count \|\|
	2271	(m->flags & (PG_UNMANAGED \| PG_NEED_COMMIT))) {
	2272	if (_vm_page_wakeup(m)) {
	2273	vm_page_spin_unlock(m);
	2274	wakeup(m);
	2275	} else {
	2276	vm_page_spin_unlock(m);
	2277	}
	2278	return(0);
	2279	}
	2280	vm_page_spin_unlock(m);
	2281
	2282	/*
	2283	* Page busied by us and no longer spinlocked. Dirty pages cannot
	2284	* be moved to the cache.
	2285	*/
	2286	vm_page_test_dirty(m);
	2287	if (m->dirty) {
	2288	vm_page_wakeup(m);
	2289	return(0);
	2290	}
	2291	vm_page_cache(m);
	2292	return(1);
	2293	}
	2294
	2295	/*
	2296	* Attempt to free the page. If we cannot free it, we do nothing.
	2297	* 1 is returned on success, 0 on failure.
	2298	*
	2299	* No requirements.
	2300	*/
	2301	int
	2302	vm_page_try_to_free(vm_page_t m)
	2303	{
	2304	vm_page_spin_lock(m);
	2305	if (vm_page_busy_try(m, TRUE)) {
	2306	vm_page_spin_unlock(m);
	2307	return(0);
	2308	}
	2309
	2310	/*
	2311	* The page can be in any state, including already being on the free
	2312	* queue. Check to see if it really can be freed.
	2313	*/
	2314	if (m->dirty \|\| /* can't free if it is dirty */
	2315	m->hold_count \|\| /* or held (XXX may be wrong) */
	2316	m->wire_count \|\| /* or wired */
	2317	(m->flags & (PG_UNMANAGED \| /* or unmanaged */
	2318	PG_NEED_COMMIT)) \|\| /* or needs a commit */
	2319	m->queue - m->pc == PQ_FREE \|\| /* already on PQ_FREE */
	2320	m->queue - m->pc == PQ_HOLD) { /* already on PQ_HOLD */
	2321	if (_vm_page_wakeup(m)) {
	2322	vm_page_spin_unlock(m);
	2323	wakeup(m);
	2324	} else {
	2325	vm_page_spin_unlock(m);
	2326	}
	2327	return(0);
	2328	}
	2329	vm_page_spin_unlock(m);
	2330
	2331	/*
	2332	* We can probably free the page.
	2333	*
	2334	* Page busied by us and no longer spinlocked. Dirty pages will
	2335	* not be freed by this function. We have to re-test the
	2336	* dirty bit after cleaning out the pmaps.
	2337	*/
	2338	vm_page_test_dirty(m);
	2339	if (m->dirty) {
	2340	vm_page_wakeup(m);
	2341	return(0);
	2342	}
	2343	vm_page_protect(m, VM_PROT_NONE);
	2344	if (m->dirty) {
	2345	vm_page_wakeup(m);
	2346	return(0);
	2347	}
	2348	vm_page_free(m);
	2349	return(1);
	2350	}
	2351
	2352	/*
	2353	* vm_page_cache
	2354	*
	2355	* Put the specified page onto the page cache queue (if appropriate).
	2356	*
	2357	* The page must be busy, and this routine will release the busy and
	2358	* possibly even free the page.
	2359	*/
	2360	void
	2361	vm_page_cache(vm_page_t m)
	2362	{
	2363	if ((m->flags & (PG_UNMANAGED \| PG_NEED_COMMIT)) \|\|
	2364	m->busy \|\| m->wire_count \|\| m->hold_count) {
	2365	kprintf("vm_page_cache: attempting to cache busy/held page\n");
	2366	vm_page_wakeup(m);
	2367	return;
	2368	}
	2369
	2370	/*
	2371	* Already in the cache (and thus not mapped)
	2372	*/
	2373	if ((m->queue - m->pc) == PQ_CACHE) {
	2374	KKASSERT((m->flags & PG_MAPPED) == 0);
	2375	vm_page_wakeup(m);
	2376	return;
	2377	}
	2378
	2379	/*
	2380	* Caller is required to test m->dirty, but note that the act of
	2381	* removing the page from its maps can cause it to become dirty
	2382	* on an SMP system due to another cpu running in usermode.
	2383	*/
	2384	if (m->dirty) {
	2385	panic("vm_page_cache: caching a dirty page, pindex: %ld",
	2386	(long)m->pindex);
	2387	}
	2388
	2389	/*
	2390	* Remove all pmaps and indicate that the page is not
	2391	* writeable or mapped. Our vm_page_protect() call may
	2392	* have blocked (especially w/ VM_PROT_NONE), so recheck
	2393	* everything.
	2394	*/
	2395	vm_page_protect(m, VM_PROT_NONE);
	2396	if ((m->flags & (PG_UNMANAGED \| PG_MAPPED)) \|\|
	2397	m->busy \|\| m->wire_count \|\| m->hold_count) {
	2398	vm_page_wakeup(m);
	2399	} else if (m->dirty \|\| (m->flags & PG_NEED_COMMIT)) {
	2400	vm_page_deactivate(m);
	2401	vm_page_wakeup(m);
	2402	} else {
	2403	_vm_page_and_queue_spin_lock(m);
	2404	_vm_page_rem_queue_spinlocked(m);
	2405	_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
	2406	_vm_page_queue_spin_unlock(m);
	2407	if (_vm_page_wakeup(m)) {
	2408	vm_page_spin_unlock(m);
	2409	wakeup(m);
	2410	} else {
	2411	vm_page_spin_unlock(m);
	2412	}
	2413	vm_page_free_wakeup();
	2414	}
	2415	}
	2416
	2417	/*
	2418	* vm_page_dontneed()
	2419	*
	2420	* Cache, deactivate, or do nothing as appropriate. This routine
	2421	* is typically used by madvise() MADV_DONTNEED.
	2422	*
	2423	* Generally speaking we want to move the page into the cache so
	2424	* it gets reused quickly. However, this can result in a silly syndrome
	2425	* due to the page recycling too quickly. Small objects will not be
	2426	* fully cached. On the otherhand, if we move the page to the inactive
	2427	* queue we wind up with a problem whereby very large objects
	2428	* unnecessarily blow away our inactive and cache queues.
	2429	*
	2430	* The solution is to move the pages based on a fixed weighting. We
	2431	* either leave them alone, deactivate them, or move them to the cache,
	2432	* where moving them to the cache has the highest weighting.
	2433	* By forcing some pages into other queues we eventually force the
	2434	* system to balance the queues, potentially recovering other unrelated
	2435	* space from active. The idea is to not force this to happen too
	2436	* often.
	2437	*
	2438	* The page must be busied.
	2439	*/
	2440	void
	2441	vm_page_dontneed(vm_page_t m)
	2442	{
	2443	static int dnweight;
	2444	int dnw;
	2445	int head;
	2446
	2447	dnw = ++dnweight;
	2448
	2449	/*
	2450	* occassionally leave the page alone
	2451	*/
	2452	if ((dnw & 0x01F0) == 0 \|\|
	2453	m->queue - m->pc == PQ_INACTIVE \|\|
	2454	m->queue - m->pc == PQ_CACHE
	2455	) {
	2456	if (m->act_count >= ACT_INIT)
	2457	--m->act_count;
	2458	return;
	2459	}
	2460
	2461	/*
	2462	* If vm_page_dontneed() is inactivating a page, it must clear
	2463	* the referenced flag; otherwise the pagedaemon will see references
	2464	* on the page in the inactive queue and reactivate it. Until the
	2465	* page can move to the cache queue, madvise's job is not done.
	2466	*/
	2467	vm_page_flag_clear(m, PG_REFERENCED);
	2468	pmap_clear_reference(m);
	2469
	2470	if (m->dirty == 0)
	2471	vm_page_test_dirty(m);
	2472
	2473	if (m->dirty \|\| (dnw & 0x0070) == 0) {
	2474	/*
	2475	* Deactivate the page 3 times out of 32.
	2476	*/
	2477	head = 0;
	2478	} else {
	2479	/*
	2480	* Cache the page 28 times out of every 32. Note that
	2481	* the page is deactivated instead of cached, but placed
	2482	* at the head of the queue instead of the tail.
	2483	*/
	2484	head = 1;
	2485	}
	2486	vm_page_spin_lock(m);
	2487	_vm_page_deactivate_locked(m, head);
	2488	vm_page_spin_unlock(m);
	2489	}
	2490
	2491	/*
	2492	* These routines manipulate the 'soft busy' count for a page. A soft busy
	2493	* is almost like PG_BUSY except that it allows certain compatible operations
	2494	* to occur on the page while it is busy. For example, a page undergoing a
	2495	* write can still be mapped read-only.
	2496	*
	2497	* Because vm_pages can overlap buffers m->busy can be > 1. m->busy is only
	2498	* adjusted while the vm_page is PG_BUSY so the flash will occur when the
	2499	* busy bit is cleared.
	2500	*/
	2501	void
	2502	vm_page_io_start(vm_page_t m)
	2503	{
	2504	KASSERT(m->flags & PG_BUSY, ("vm_page_io_start: page not busy!!!"));
	2505	atomic_add_char(&m->busy, 1);
	2506	vm_page_flag_set(m, PG_SBUSY);
	2507	}
	2508
	2509	void
	2510	vm_page_io_finish(vm_page_t m)
	2511	{
	2512	KASSERT(m->flags & PG_BUSY, ("vm_page_io_finish: page not busy!!!"));
	2513	atomic_subtract_char(&m->busy, 1);
	2514	if (m->busy == 0)
	2515	vm_page_flag_clear(m, PG_SBUSY);
	2516	}
	2517
	2518	/*
	2519	* Indicate that a clean VM page requires a filesystem commit and cannot
	2520	* be reused. Used by tmpfs.
	2521	*/
	2522	void
	2523	vm_page_need_commit(vm_page_t m)
	2524	{
	2525	vm_page_flag_set(m, PG_NEED_COMMIT);
	2526	}
	2527
	2528	void
	2529	vm_page_clear_commit(vm_page_t m)
	2530	{
	2531	vm_page_flag_clear(m, PG_NEED_COMMIT);
	2532	}
	2533
	2534	/*
	2535	* Grab a page, blocking if it is busy and allocating a page if necessary.
	2536	* A busy page is returned or NULL. The page may or may not be valid and
	2537	* might not be on a queue (the caller is responsible for the disposition of
	2538	* the page).
	2539	*
	2540	* If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
	2541	* page will be zero'd and marked valid.
	2542	*
	2543	* If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
	2544	* valid even if it already exists.
	2545	*
	2546	* If VM_ALLOC_RETRY is specified this routine will never return NULL. Also
	2547	* note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
	2548	* VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
	2549	*
	2550	* This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
	2551	* always returned if we had blocked.
	2552	*
	2553	* This routine may not be called from an interrupt.
	2554	*
	2555	* PG_ZERO is ALWAYS cleared by this routine.
	2556	*
	2557	* No other requirements.
	2558	*/
	2559	vm_page_t
	2560	vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
	2561	{
	2562	vm_page_t m;
	2563	int error;
	2564
	2565	KKASSERT(allocflags &
	2566	(VM_ALLOC_NORMAL\|VM_ALLOC_INTERRUPT\|VM_ALLOC_SYSTEM));
	2567	vm_object_hold(object);
	2568	for (;;) {
	2569	m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
	2570	if (error) {
	2571	vm_page_sleep_busy(m, TRUE, "pgrbwt");
	2572	if ((allocflags & VM_ALLOC_RETRY) == 0) {
	2573	m = NULL;
	2574	break;
	2575	}
	2576	/* retry */
	2577	} else if (m == NULL) {
	2578	if (allocflags & VM_ALLOC_RETRY)
	2579	allocflags \|= VM_ALLOC_NULL_OK;
	2580	m = vm_page_alloc(object, pindex,
	2581	allocflags & ~VM_ALLOC_RETRY);
	2582	if (m)
	2583	break;
	2584	vm_wait(0);
	2585	if ((allocflags & VM_ALLOC_RETRY) == 0)
	2586	goto failed;
	2587	} else {
	2588	/* m found */
	2589	break;
	2590	}
	2591	}
	2592
	2593	/*
	2594	* If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
	2595	*
	2596	* If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
	2597	* valid even if already valid.
	2598	*/
	2599	if (m->valid == 0) {
	2600	if (allocflags & (VM_ALLOC_ZERO \| VM_ALLOC_FORCE_ZERO)) {
	2601	if ((m->flags & PG_ZERO) == 0)
	2602	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	2603	m->valid = VM_PAGE_BITS_ALL;
	2604	}
	2605	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
	2606	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	2607	m->valid = VM_PAGE_BITS_ALL;
	2608	}
	2609	vm_page_flag_clear(m, PG_ZERO);
	2610	failed:
	2611	vm_object_drop(object);
	2612	return(m);
	2613	}
	2614
	2615	/*
	2616	* Mapping function for valid bits or for dirty bits in
	2617	* a page. May not block.
	2618	*
	2619	* Inputs are required to range within a page.
	2620	*
	2621	* No requirements.
	2622	* Non blocking.
	2623	*/
	2624	int
	2625	vm_page_bits(int base, int size)
	2626	{
	2627	int first_bit;
	2628	int last_bit;
	2629
	2630	KASSERT(
	2631	base + size <= PAGE_SIZE,
	2632	("vm_page_bits: illegal base/size %d/%d", base, size)
	2633	);
	2634
	2635	if (size == 0) /* handle degenerate case */
	2636	return(0);
	2637
	2638	first_bit = base >> DEV_BSHIFT;
	2639	last_bit = (base + size - 1) >> DEV_BSHIFT;
	2640
	2641	return ((2 << last_bit) - (1 << first_bit));
	2642	}
	2643
	2644	/*
	2645	* Sets portions of a page valid and clean. The arguments are expected
	2646	* to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
	2647	* of any partial chunks touched by the range. The invalid portion of
	2648	* such chunks will be zero'd.
	2649	*
	2650	* NOTE: When truncating a buffer vnode_pager_setsize() will automatically
	2651	* align base to DEV_BSIZE so as not to mark clean a partially
	2652	* truncated device block. Otherwise the dirty page status might be
	2653	* lost.
	2654	*
	2655	* This routine may not block.
	2656	*
	2657	* (base + size) must be less then or equal to PAGE_SIZE.
	2658	*/
	2659	static void
	2660	_vm_page_zero_valid(vm_page_t m, int base, int size)
	2661	{
	2662	int frag;
	2663	int endoff;
	2664
	2665	if (size == 0) /* handle degenerate case */
	2666	return;
	2667
	2668	/*
	2669	* If the base is not DEV_BSIZE aligned and the valid
	2670	* bit is clear, we have to zero out a portion of the
	2671	* first block.
	2672	*/
	2673
	2674	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
	2675	(m->valid & (1 << (base >> DEV_BSHIFT))) == 0
	2676	) {
	2677	pmap_zero_page_area(
	2678	VM_PAGE_TO_PHYS(m),
	2679	frag,
	2680	base - frag
	2681	);
	2682	}
	2683
	2684	/*
	2685	* If the ending offset is not DEV_BSIZE aligned and the
	2686	* valid bit is clear, we have to zero out a portion of
	2687	* the last block.
	2688	*/
	2689
	2690	endoff = base + size;
	2691
	2692	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
	2693	(m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
	2694	) {
	2695	pmap_zero_page_area(
	2696	VM_PAGE_TO_PHYS(m),
	2697	endoff,
	2698	DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
	2699	);
	2700	}
	2701	}
	2702
	2703	/*
	2704	* Set valid, clear dirty bits. If validating the entire
	2705	* page we can safely clear the pmap modify bit. We also
	2706	* use this opportunity to clear the PG_NOSYNC flag. If a process
	2707	* takes a write fault on a MAP_NOSYNC memory area the flag will
	2708	* be set again.
	2709	*
	2710	* We set valid bits inclusive of any overlap, but we can only
	2711	* clear dirty bits for DEV_BSIZE chunks that are fully within
	2712	* the range.
	2713	*
	2714	* Page must be busied?
	2715	* No other requirements.
	2716	*/
	2717	void
	2718	vm_page_set_valid(vm_page_t m, int base, int size)
	2719	{
	2720	_vm_page_zero_valid(m, base, size);
	2721	m->valid \|= vm_page_bits(base, size);
	2722	}
	2723
	2724
	2725	/*
	2726	* Set valid bits and clear dirty bits.
	2727	*
	2728	* NOTE: This function does not clear the pmap modified bit.
	2729	* Also note that e.g. NFS may use a byte-granular base
	2730	* and size.
	2731	*
	2732	* WARNING: Page must be busied? But vfs_clean_one_page() will call
	2733	* this without necessarily busying the page (via bdwrite()).
	2734	* So for now vm_token must also be held.
	2735	*
	2736	* No other requirements.
	2737	*/
	2738	void
	2739	vm_page_set_validclean(vm_page_t m, int base, int size)
	2740	{
	2741	int pagebits;
	2742
	2743	_vm_page_zero_valid(m, base, size);
	2744	pagebits = vm_page_bits(base, size);
	2745	m->valid \|= pagebits;
	2746	m->dirty &= ~pagebits;
	2747	if (base == 0 && size == PAGE_SIZE) {
	2748	/pmap_clear_modify(m);/
	2749	vm_page_flag_clear(m, PG_NOSYNC);
	2750	}
	2751	}
	2752
	2753	/*
	2754	* Set valid & dirty. Used by buwrite()
	2755	*
	2756	* WARNING: Page must be busied? But vfs_dirty_one_page() will
	2757	* call this function in buwrite() so for now vm_token must
	2758	* be held.
	2759	*
	2760	* No other requirements.
	2761	*/
	2762	void
	2763	vm_page_set_validdirty(vm_page_t m, int base, int size)
	2764	{
	2765	int pagebits;
	2766
	2767	pagebits = vm_page_bits(base, size);
	2768	m->valid \|= pagebits;
	2769	m->dirty \|= pagebits;
	2770	if (m->object)
	2771	vm_object_set_writeable_dirty(m->object);
	2772	}
	2773
	2774	/*
	2775	* Clear dirty bits.
	2776	*
	2777	* NOTE: This function does not clear the pmap modified bit.
	2778	* Also note that e.g. NFS may use a byte-granular base
	2779	* and size.
	2780	*
	2781	* Page must be busied?
	2782	* No other requirements.
	2783	*/
	2784	void
	2785	vm_page_clear_dirty(vm_page_t m, int base, int size)
	2786	{
	2787	m->dirty &= ~vm_page_bits(base, size);
	2788	if (base == 0 && size == PAGE_SIZE) {
	2789	/pmap_clear_modify(m);/
	2790	vm_page_flag_clear(m, PG_NOSYNC);
	2791	}
	2792	}
	2793
	2794	/*
	2795	* Make the page all-dirty.
	2796	*
	2797	* Also make sure the related object and vnode reflect the fact that the
	2798	* object may now contain a dirty page.
	2799	*
	2800	* Page must be busied?
	2801	* No other requirements.
	2802	*/
	2803	void
	2804	vm_page_dirty(vm_page_t m)
	2805	{
	2806	#ifdef INVARIANTS
	2807	int pqtype = m->queue - m->pc;
	2808	#endif
	2809	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
	2810	("vm_page_dirty: page in free/cache queue!"));
	2811	if (m->dirty != VM_PAGE_BITS_ALL) {
	2812	m->dirty = VM_PAGE_BITS_ALL;
	2813	if (m->object)
	2814	vm_object_set_writeable_dirty(m->object);
	2815	}
	2816	}
	2817
	2818	/*
	2819	* Invalidates DEV_BSIZE'd chunks within a page. Both the
	2820	* valid and dirty bits for the effected areas are cleared.
	2821	*
	2822	* Page must be busied?
	2823	* Does not block.
	2824	* No other requirements.
	2825	*/
	2826	void
	2827	vm_page_set_invalid(vm_page_t m, int base, int size)
	2828	{
	2829	int bits;
	2830
	2831	bits = vm_page_bits(base, size);
	2832	m->valid &= ~bits;
	2833	m->dirty &= ~bits;
	2834	m->object->generation++;
	2835	}
	2836
	2837	/*
	2838	* The kernel assumes that the invalid portions of a page contain
	2839	* garbage, but such pages can be mapped into memory by user code.
	2840	* When this occurs, we must zero out the non-valid portions of the
	2841	* page so user code sees what it expects.
	2842	*
	2843	* Pages are most often semi-valid when the end of a file is mapped
	2844	* into memory and the file's size is not page aligned.
	2845	*
	2846	* Page must be busied?
	2847	* No other requirements.
	2848	*/
	2849	void
	2850	vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
	2851	{
	2852	int b;
	2853	int i;
	2854
	2855	/*
	2856	* Scan the valid bits looking for invalid sections that
	2857	* must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the
	2858	* valid bit may be set ) have already been zerod by
	2859	* vm_page_set_validclean().
	2860	*/
	2861	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
	2862	if (i == (PAGE_SIZE / DEV_BSIZE) \|\|
	2863	(m->valid & (1 << i))
	2864	) {
	2865	if (i > b) {
	2866	pmap_zero_page_area(
	2867	VM_PAGE_TO_PHYS(m),
	2868	b << DEV_BSHIFT,
	2869	(i - b) << DEV_BSHIFT
	2870	);
	2871	}
	2872	b = i + 1;
	2873	}
	2874	}
	2875
	2876	/*
	2877	* setvalid is TRUE when we can safely set the zero'd areas
	2878	* as being valid. We can do this if there are no cache consistency
	2879	* issues. e.g. it is ok to do with UFS, but not ok to do with NFS.
	2880	*/
	2881	if (setvalid)
	2882	m->valid = VM_PAGE_BITS_ALL;
	2883	}
	2884
	2885	/*
	2886	* Is a (partial) page valid? Note that the case where size == 0
	2887	* will return FALSE in the degenerate case where the page is entirely
	2888	* invalid, and TRUE otherwise.
	2889	*
	2890	* Does not block.
	2891	* No other requirements.
	2892	*/
	2893	int
	2894	vm_page_is_valid(vm_page_t m, int base, int size)
	2895	{
	2896	int bits = vm_page_bits(base, size);
	2897
	2898	if (m->valid && ((m->valid & bits) == bits))
	2899	return 1;
	2900	else
	2901	return 0;
	2902	}
	2903
	2904	/*
	2905	* update dirty bits from pmap/mmu. May not block.
	2906	*
	2907	* Caller must hold the page busy
	2908	*/
	2909	void
	2910	vm_page_test_dirty(vm_page_t m)
	2911	{
	2912	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
	2913	vm_page_dirty(m);
	2914	}
	2915	}
	2916
	2917	/*
	2918	* Register an action, associating it with its vm_page
	2919	*/
	2920	void
	2921	vm_page_register_action(vm_page_action_t action, vm_page_event_t event)
	2922	{
	2923	struct vm_page_action_list *list;
	2924	int hv;
	2925
	2926	hv = (int)((intptr_t)action->m >> 8) & VMACTION_HMASK;
	2927	list = &action_list[hv];
	2928
	2929	lwkt_gettoken(&vm_token);
	2930	vm_page_flag_set(action->m, PG_ACTIONLIST);
	2931	action->event = event;
	2932	LIST_INSERT_HEAD(list, action, entry);
	2933	lwkt_reltoken(&vm_token);
	2934	}
	2935
	2936	/*
	2937	* Unregister an action, disassociating it from its related vm_page
	2938	*/
	2939	void
	2940	vm_page_unregister_action(vm_page_action_t action)
	2941	{
	2942	struct vm_page_action_list *list;
	2943	int hv;
	2944
	2945	lwkt_gettoken(&vm_token);
	2946	if (action->event != VMEVENT_NONE) {
	2947	action->event = VMEVENT_NONE;
	2948	LIST_REMOVE(action, entry);
	2949
	2950	hv = (int)((intptr_t)action->m >> 8) & VMACTION_HMASK;
	2951	list = &action_list[hv];
	2952	if (LIST_EMPTY(list))
	2953	vm_page_flag_clear(action->m, PG_ACTIONLIST);
	2954	}
	2955	lwkt_reltoken(&vm_token);
	2956	}
	2957
	2958	/*
	2959	* Issue an event on a VM page. Corresponding action structures are
	2960	* removed from the page's list and called.
	2961	*
	2962	* If the vm_page has no more pending action events we clear its
	2963	* PG_ACTIONLIST flag.
	2964	*/
	2965	void
	2966	vm_page_event_internal(vm_page_t m, vm_page_event_t event)
	2967	{
	2968	struct vm_page_action_list *list;
	2969	struct vm_page_action *scan;
	2970	struct vm_page_action *next;
	2971	int hv;
	2972	int all;
	2973
	2974	hv = (int)((intptr_t)m >> 8) & VMACTION_HMASK;
	2975	list = &action_list[hv];
	2976	all = 1;
	2977
	2978	lwkt_gettoken(&vm_token);
	2979	LIST_FOREACH_MUTABLE(scan, list, entry, next) {
	2980	if (scan->m == m) {
	2981	if (scan->event == event) {
	2982	scan->event = VMEVENT_NONE;
	2983	LIST_REMOVE(scan, entry);
	2984	scan->func(m, scan);
	2985	/* XXX */
	2986	} else {
	2987	all = 0;
	2988	}
	2989	}
	2990	}
	2991	if (all)
	2992	vm_page_flag_clear(m, PG_ACTIONLIST);
	2993	lwkt_reltoken(&vm_token);
	2994	}
	2995
	2996	#include "opt_ddb.h"
	2997	#ifdef DDB
	2998	#include <sys/kernel.h>
	2999
	3000	#include <ddb/ddb.h>
	3001
	3002	DB_SHOW_COMMAND(page, vm_page_print_page_info)
	3003	{
	3004	db_printf("vmstats.v_free_count: %d\n", vmstats.v_free_count);
	3005	db_printf("vmstats.v_cache_count: %d\n", vmstats.v_cache_count);
	3006	db_printf("vmstats.v_inactive_count: %d\n", vmstats.v_inactive_count);
	3007	db_printf("vmstats.v_active_count: %d\n", vmstats.v_active_count);
	3008	db_printf("vmstats.v_wire_count: %d\n", vmstats.v_wire_count);
	3009	db_printf("vmstats.v_free_reserved: %d\n", vmstats.v_free_reserved);
	3010	db_printf("vmstats.v_free_min: %d\n", vmstats.v_free_min);
	3011	db_printf("vmstats.v_free_target: %d\n", vmstats.v_free_target);
	3012	db_printf("vmstats.v_cache_min: %d\n", vmstats.v_cache_min);
	3013	db_printf("vmstats.v_inactive_target: %d\n", vmstats.v_inactive_target);
	3014	}
	3015
	3016	DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
	3017	{
	3018	int i;
	3019	db_printf("PQ_FREE:");
	3020	for(i=0;i<PQ_L2_SIZE;i++) {
	3021	db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
	3022	}
	3023	db_printf("\n");
	3024
	3025	db_printf("PQ_CACHE:");
	3026	for(i=0;i<PQ_L2_SIZE;i++) {
	3027	db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
	3028	}
	3029	db_printf("\n");
	3030
	3031	db_printf("PQ_ACTIVE:");
	3032	for(i=0;i<PQ_L2_SIZE;i++) {
	3033	db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
	3034	}
	3035	db_printf("\n");
	3036
	3037	db_printf("PQ_INACTIVE:");
	3038	for(i=0;i<PQ_L2_SIZE;i++) {
	3039	db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
	3040	}
	3041	db_printf("\n");
	3042	}
	3043	#endif /* DDB */