This is a major revamping of the pageout and low-memory handling code.
[dragonfly.git] / sys / vm / vm_page.c
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
37 * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
38 * $DragonFly: src/sys/vm/vm_page.c,v 1.40 2008/08/25 17:01:42 dillon Exp $
39 */
40
41/*
42 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
43 * All rights reserved.
44 *
45 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
46 *
47 * Permission to use, copy, modify and distribute this software and
48 * its documentation is hereby granted, provided that both the copyright
49 * notice and this permission notice appear in all copies of the
50 * software, derivative works or modified versions, and any portions
51 * thereof, and that both notices appear in supporting documentation.
52 *
53 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
54 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
55 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
56 *
57 * Carnegie Mellon requests users of this software to return to
58 *
59 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
60 * School of Computer Science
61 * Carnegie Mellon University
62 * Pittsburgh PA 15213-3890
63 *
64 * any improvements or extensions that they make and grant Carnegie the
65 * rights to redistribute these changes.
66 */
67/*
68 * Resident memory management module. The module manipulates 'VM pages'.
69 * A VM page is the core building block for memory management.
70 */
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/malloc.h>
75#include <sys/proc.h>
76#include <sys/vmmeter.h>
77#include <sys/vnode.h>
78
79#include <vm/vm.h>
80#include <vm/vm_param.h>
81#include <sys/lock.h>
82#include <vm/vm_kern.h>
83#include <vm/pmap.h>
84#include <vm/vm_map.h>
85#include <vm/vm_object.h>
86#include <vm/vm_page.h>
87#include <vm/vm_pageout.h>
88#include <vm/vm_pager.h>
89#include <vm/vm_extern.h>
90#include <vm/vm_page2.h>
91
92static void vm_page_queue_init(void);
93static void vm_page_free_wakeup(void);
94static vm_page_t vm_page_select_cache(vm_object_t, vm_pindex_t);
95static vm_page_t _vm_page_list_find2(int basequeue, int index);
96
97struct vpgqueues vm_page_queues[PQ_COUNT]; /* Array of tailq lists */
98
99#define ASSERT_IN_CRIT_SECTION() KKASSERT(crit_test(curthread));
100
101RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
102 vm_pindex_t, pindex);
103
104static void
105vm_page_queue_init(void)
106{
107 int i;
108
109 for (i = 0; i < PQ_L2_SIZE; i++)
110 vm_page_queues[PQ_FREE+i].cnt = &vmstats.v_free_count;
111 for (i = 0; i < PQ_L2_SIZE; i++)
112 vm_page_queues[PQ_CACHE+i].cnt = &vmstats.v_cache_count;
113
114 vm_page_queues[PQ_INACTIVE].cnt = &vmstats.v_inactive_count;
115 vm_page_queues[PQ_ACTIVE].cnt = &vmstats.v_active_count;
116 vm_page_queues[PQ_HOLD].cnt = &vmstats.v_active_count;
117 /* PQ_NONE has no queue */
118
119 for (i = 0; i < PQ_COUNT; i++)
120 TAILQ_INIT(&vm_page_queues[i].pl);
121}
122
123/*
124 * note: place in initialized data section? Is this necessary?
125 */
126long first_page = 0;
127int vm_page_array_size = 0;
128int vm_page_zero_count = 0;
129vm_page_t vm_page_array = 0;
130
131/*
132 * (low level boot)
133 *
134 * Sets the page size, perhaps based upon the memory size.
135 * Must be called before any use of page-size dependent functions.
136 */
137void
138vm_set_page_size(void)
139{
140 if (vmstats.v_page_size == 0)
141 vmstats.v_page_size = PAGE_SIZE;
142 if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
143 panic("vm_set_page_size: page size not a power of two");
144}
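
/*
 * Added illustrative note (not part of the original file): the panic test
 * above relies on the fact that a power of two has exactly one bit set,
 * so clearing the lowest set bit via (x - 1) & x must yield zero.  A
 * minimal sketch of the same test:
 */
#if 0	/* illustrative sketch only, never compiled */
static __inline int
is_power_of_two(vm_size_t x)
{
	/* e.g. 4096 is 0x1000, 4095 is 0x0fff, and 0x0fff & 0x1000 == 0 */
	return (x != 0 && ((x - 1) & x) == 0);
}
#endif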
145
146/*
147 * (low level boot)
148 *
149 * Add a new page to the freelist for use by the system. New pages
150 * are added to both the head and tail of the associated free page
151 * queue in a bottom-up fashion, so both zero'd and non-zero'd page
152 * requests pull 'recent' adds (higher physical addresses) first.
153 *
154 * Must be called in a critical section.
155 */
156vm_page_t
157vm_add_new_page(vm_paddr_t pa)
158{
159 struct vpgqueues *vpq;
160 vm_page_t m;
161
162 ++vmstats.v_page_count;
163 ++vmstats.v_free_count;
164 m = PHYS_TO_VM_PAGE(pa);
165 m->phys_addr = pa;
166 m->flags = 0;
167 m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
168 m->queue = m->pc + PQ_FREE;
169 KKASSERT(m->dirty == 0);
170
171 vpq = &vm_page_queues[m->queue];
172 if (vpq->flipflop)
173 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
174 else
175 TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
176 vpq->flipflop = 1 - vpq->flipflop;
177
178 vm_page_queues[m->queue].lcnt++;
179 return (m);
180}
181
182/*
183 * (low level boot)
184 *
185 * Initializes the resident memory module.
186 *
187 * Allocates memory for the page cells, and for the object/offset-to-page
188 * hash table headers. Each page cell is initialized and placed on the
189 * free list.
190 *
191 * starta/enda represents the range of physical memory addresses available
192 * for use (skipping memory already used by the kernel), subject to
193 * phys_avail[].  Note that phys_avail[] already excludes memory
194 * in use by the kernel.
195 */
196vm_offset_t
197vm_page_startup(vm_offset_t vaddr)
198{
199 vm_offset_t mapped;
200 vm_size_t npages;
201 vm_paddr_t page_range;
202 vm_paddr_t new_end;
203 int i;
204 vm_paddr_t pa;
205 int nblocks;
206 vm_paddr_t last_pa;
207 vm_paddr_t end;
208 vm_paddr_t biggestone, biggestsize;
209 vm_paddr_t total;
210
211 total = 0;
212 biggestsize = 0;
213 biggestone = 0;
214 nblocks = 0;
215 vaddr = round_page(vaddr);
216
217 for (i = 0; phys_avail[i + 1]; i += 2) {
218 phys_avail[i] = round_page(phys_avail[i]);
219 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
220 }
221
222 for (i = 0; phys_avail[i + 1]; i += 2) {
223 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
224
225 if (size > biggestsize) {
226 biggestone = i;
227 biggestsize = size;
228 }
229 ++nblocks;
230 total += size;
231 }
232
233 end = phys_avail[biggestone+1];
234 end = trunc_page(end);
235
236 /*
237 * Initialize the queue headers for the free queue, the active queue
238 * and the inactive queue.
239 */
240
241 vm_page_queue_init();
242
243 /*
244 * Compute the number of pages of memory that will be available for
245 * use (taking into account the overhead of a page structure per
246 * page).
247 */
248 first_page = phys_avail[0] / PAGE_SIZE;
249 page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
250 npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
251
252 /*
253 * Initialize the mem entry structures now, and put them in the free
254 * queue.
255 */
256 vm_page_array = (vm_page_t) vaddr;
257 mapped = vaddr;
258
259 /*
260 * Validate these addresses.
261 */
262 new_end = trunc_page(end - page_range * sizeof(struct vm_page));
263 mapped = pmap_map(mapped, new_end, end,
264 VM_PROT_READ | VM_PROT_WRITE);
265
266 /*
267 * Clear all of the page structures
268 */
269 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
270 vm_page_array_size = page_range;
271
272 /*
273 * Construct the free queue(s) in ascending order (by physical
274 * address) so that the first 16MB of physical memory is allocated
275 * last rather than first. On large-memory machines, this avoids
276 * the exhaustion of low physical memory before isa_dmainit has run.
277 */
278 vmstats.v_page_count = 0;
279 vmstats.v_free_count = 0;
280 for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
281 pa = phys_avail[i];
282 if (i == biggestone)
283 last_pa = new_end;
284 else
285 last_pa = phys_avail[i + 1];
286 while (pa < last_pa && npages-- > 0) {
287 vm_add_new_page(pa);
288 pa += PAGE_SIZE;
289 }
290 }
291 return (mapped);
292}
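
/*
 * Worked example of the sizing above (added commentary, not part of the
 * original file): assume roughly 1 GiB of usable physical memory, 4 KiB
 * pages, and a struct vm_page of (say) 64 bytes.  page_range then comes
 * to about 262144 entries, the vm_page array consumes about 16 MiB of
 * the total, and npages works out to roughly 258000 pages that remain
 * available for general use.
 */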
293
294/*
295 * Scan comparison function for Red-Black tree scans. An inclusive
296 * (start,end) is expected. Other fields are not used.
297 */
298int
299rb_vm_page_scancmp(struct vm_page *p, void *data)
300{
301 struct rb_vm_page_scan_info *info = data;
302
303 if (p->pindex < info->start_pindex)
304 return(-1);
305 if (p->pindex > info->end_pindex)
306 return(1);
307 return(0);
308}
309
310int
311rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
312{
313 if (p1->pindex < p2->pindex)
314 return(-1);
315 if (p1->pindex > p2->pindex)
316 return(1);
317 return(0);
318}
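
/*
 * Added illustrative sketch (not part of the original file) showing how
 * the scan comparison function above is typically paired with the RB-tree
 * scan routine to visit every resident page in a pindex range.  The
 * helper names are hypothetical; vm_page_rb_tree_RB_SCAN is assumed to be
 * the scan entry point generated for this tree by the RB macros used in
 * the VM headers.
 */
#if 0	/* illustrative sketch only, never compiled */
static int
example_deactivate_callback(vm_page_t p, void *data __unused)
{
	/* a real callback would normally check busy/wired state first */
	vm_page_deactivate(p);
	return(0);
}

static void
example_deactivate_range(vm_object_t object, vm_pindex_t start,
			 vm_pindex_t end)
{
	struct rb_vm_page_scan_info info;

	info.start_pindex = start;	/* inclusive, as rb_vm_page_scancmp expects */
	info.end_pindex = end;
	crit_enter();
	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				example_deactivate_callback, &info);
	crit_exit();
}
#endif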
319
320/*
321 * The opposite of vm_page_hold(). A page can be freed while being held,
322 * which places it on the PQ_HOLD queue. We must call vm_page_free_toq()
323 * in this case to actually free it once the hold count drops to 0.
324 *
325 * This routine must be called at splvm().
326 */
327void
328vm_page_unhold(vm_page_t mem)
329{
330 --mem->hold_count;
331 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
332 if (mem->hold_count == 0 && mem->queue == PQ_HOLD) {
333 vm_page_busy(mem);
334 vm_page_free_toq(mem);
335 }
336}
337
338/*
339 * Inserts the given mem entry into the object and object list.
340 *
341 * The pagetables are not updated but will presumably fault the page
342 * in if necessary, or if a kernel page the caller will at some point
343 * enter the page into the kernel's pmap. We are not allowed to block
344 * here so we *can't* do this anyway.
345 *
346 * This routine may not block.
347 * This routine must be called with a critical section held.
348 */
349void
350vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
351{
352 ASSERT_IN_CRIT_SECTION();
353 if (m->object != NULL)
354 panic("vm_page_insert: already inserted");
355
356 /*
357 * Record the object/offset pair in this page
358 */
359 m->object = object;
360 m->pindex = pindex;
361
362 /*
363 * Insert it into the object.
364 */
365 vm_page_rb_tree_RB_INSERT(&object->rb_memq, m);
366 object->generation++;
367
368 /*
369 * show that the object has one more resident page.
370 */
371 object->resident_page_count++;
372
373 /*
374 * Since we are inserting a new and possibly dirty page,
375 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
376 */
377 if ((m->valid & m->dirty) || (m->flags & PG_WRITEABLE))
378 vm_object_set_writeable_dirty(object);
379}
380
381/*
382 * Removes the given vm_page_t from the global (object,index) hash table
383 * and from the object's memq.
384 *
385 * The underlying pmap entry (if any) is NOT removed here.
386 * This routine may not block.
387 *
388 * The page must be BUSY and will remain BUSY on return. No spl needs to be
389 * held on call to this routine.
390 *
391 * note: FreeBSD side effect was to unbusy the page on return. We leave
392 * it busy.
393 */
394void
395vm_page_remove(vm_page_t m)
396{
397 vm_object_t object;
398
399 crit_enter();
400 if (m->object == NULL) {
401 crit_exit();
402 return;
403 }
404
405 if ((m->flags & PG_BUSY) == 0)
406 panic("vm_page_remove: page not busy");
407
408 object = m->object;
409
410 /*
411 * Remove the page from the object and update the object.
412 */
413 vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
414 object->resident_page_count--;
415 object->generation++;
416 m->object = NULL;
417
418 crit_exit();
419}
420
421/*
422 * Locate and return the page at (object, pindex), or NULL if the
423 * page could not be found.
424 *
425 * This routine will operate properly without spl protection, but
426 * the returned page could be in flux if it is busy. Because an
427 * interrupt can race a caller's busy check (unbusying and freeing the
428 * page we return before the caller is able to check the busy bit),
429 * the caller should generally call this routine with a critical
430 * section held.
431 *
432 * Callers may call this routine without spl protection if they know
433 * 'for sure' that the page will not be ripped out from under them
434 * by an interrupt.
435 */
436vm_page_t
437vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
438{
439 vm_page_t m;
440
441 /*
442 * Search the hash table for this object/offset pair
443 */
444 crit_enter();
445 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
446 crit_exit();
447 KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
448 return(m);
449}
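
/*
 * Added illustrative sketch (not part of the original file) of the
 * lookup-then-busy pattern described above: the lookup and the busy
 * check must both occur inside the critical section or an interrupt
 * could free the page out from under the caller.  The helper name is
 * hypothetical.
 */
#if 0	/* illustrative sketch only, never compiled */
static vm_page_t
example_lookup_and_busy(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	crit_enter();
	m = vm_page_lookup(object, pindex);
	if (m && (m->busy || (m->flags & PG_BUSY)))
		m = NULL;		/* page is in flux, caller must retry */
	else if (m)
		vm_page_busy(m);	/* page is ours until vm_page_wakeup() */
	crit_exit();
	return(m);
}
#endif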
450
451/*
452 * vm_page_rename()
453 *
454 * Move the given memory entry from its current object to the specified
455 * target object/offset.
456 *
457 * The object must be locked.
458 * This routine may not block.
459 *
460 * Note: This routine will raise itself to splvm(), the caller need not.
461 *
462 * Note: Swap associated with the page must be invalidated by the move. We
463 * have to do this for several reasons: (1) we aren't freeing the
464 * page, (2) we are dirtying the page, (3) the VM system is probably
465 * moving the page from object A to B, and will then later move
466 * the backing store from A to B and we can't have a conflict.
467 *
468 * Note: We *always* dirty the page. It is necessary both for the
469 * fact that we moved it, and because we may be invalidating
470 * swap. If the page is on the cache, we have to deactivate it
471 * or vm_page_dirty() will panic. Dirty pages are not allowed
472 * on the cache.
473 */
474void
475vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
476{
477 crit_enter();
478 vm_page_remove(m);
479 vm_page_insert(m, new_object, new_pindex);
480 if (m->queue - m->pc == PQ_CACHE)
481 vm_page_deactivate(m);
482 vm_page_dirty(m);
483 vm_page_wakeup(m);
484 crit_exit();
485}
486
487/*
488 * vm_page_unqueue() without any wakeup. This routine is used when a page
489 * is being moved between queues or otherwise is to remain busied by the
490 * caller.
491 *
492 * This routine must be called at splhigh().
493 * This routine may not block.
494 */
495void
496vm_page_unqueue_nowakeup(vm_page_t m)
497{
498 int queue = m->queue;
499 struct vpgqueues *pq;
500
501 if (queue != PQ_NONE) {
502 pq = &vm_page_queues[queue];
503 m->queue = PQ_NONE;
504 TAILQ_REMOVE(&pq->pl, m, pageq);
505 (*pq->cnt)--;
506 pq->lcnt--;
507 }
508}
509
510/*
511 * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedaemon
512 * if necessary.
513 *
514 * This routine must be called at splhigh().
515 * This routine may not block.
516 */
517void
518vm_page_unqueue(vm_page_t m)
519{
520 int queue = m->queue;
521 struct vpgqueues *pq;
522
523 if (queue != PQ_NONE) {
524 m->queue = PQ_NONE;
525 pq = &vm_page_queues[queue];
526 TAILQ_REMOVE(&pq->pl, m, pageq);
527 (*pq->cnt)--;
528 pq->lcnt--;
529 if ((queue - m->pc) == PQ_CACHE || (queue - m->pc) == PQ_FREE)
530 pagedaemon_wakeup();
531 }
532}
533
534/*
535 * vm_page_list_find()
536 *
537 * Find a page on the specified queue with color optimization.
538 *
539 * The page coloring optimization attempts to locate a page that does
540 * not overload other nearby pages in the object in the cpu's L1 or L2
541 * caches. We need this optimization because cpu caches tend to be
542 * physical caches, while object spaces tend to be virtual.
543 *
544 * This routine must be called at splvm().
545 * This routine may not block.
546 *
547 * Note that this routine is carefully inlined. A non-inlined version
548 * is available for outside callers but the only critical path is
549 * from within this source file.
550 */
551static __inline
552vm_page_t
553_vm_page_list_find(int basequeue, int index, boolean_t prefer_zero)
554{
555 vm_page_t m;
556
557 if (prefer_zero)
558 m = TAILQ_LAST(&vm_page_queues[basequeue+index].pl, pglist);
559 else
560 m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
561 if (m == NULL)
562 m = _vm_page_list_find2(basequeue, index);
563 return(m);
564}
565
566static vm_page_t
567_vm_page_list_find2(int basequeue, int index)
568{
569 int i;
570 vm_page_t m = NULL;
571 struct vpgqueues *pq;
572
573 pq = &vm_page_queues[basequeue];
574
575 /*
576 * Note that for the first loop, index+i and index-i wind up at the
577 * same place. Even though this is not totally optimal, we've already
578 * blown it by missing the cache case so we do not care.
579 */
580
581 for(i = PQ_L2_SIZE / 2; i > 0; --i) {
582 if ((m = TAILQ_FIRST(&pq[(index + i) & PQ_L2_MASK].pl)) != NULL)
583 break;
584
585 if ((m = TAILQ_FIRST(&pq[(index - i) & PQ_L2_MASK].pl)) != NULL)
586 break;
587 }
588 return(m);
589}
590
591vm_page_t
592vm_page_list_find(int basequeue, int index, boolean_t prefer_zero)
593{
594 return(_vm_page_list_find(basequeue, index, prefer_zero));
595}
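
/*
 * Added illustrative sketch (not part of the original file) of how a
 * caller picks the color index: the object's pg_color is added to the
 * pindex and masked into the PQ_L2 range, exactly as vm_page_select_free()
 * does below.  The helper name is hypothetical.
 */
#if 0	/* illustrative sketch only, never compiled */
static vm_page_t
example_colored_free_lookup(vm_object_t object, vm_pindex_t pindex)
{
	/* caller is expected to be in a critical section */
	int index = (pindex + object->pg_color) & PQ_L2_MASK;

	return (vm_page_list_find(PQ_FREE, index, FALSE));
}
#endif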
596
597/*
598 * Find a page on the cache queue with color optimization.  Pages that
599 * are found but are not usable (busy, held, or wired) are deactivated.
600 * This keeps us from using potentially busy cached pages.
601 *
602 * This routine must be called with a critical section held.
603 * This routine may not block.
604 */
605vm_page_t
606vm_page_select_cache(vm_object_t object, vm_pindex_t pindex)
607{
608 vm_page_t m;
609
610 while (TRUE) {
611 m = _vm_page_list_find(
612 PQ_CACHE,
613 (pindex + object->pg_color) & PQ_L2_MASK,
614 FALSE
615 );
616 if (m && ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
617 m->hold_count || m->wire_count)) {
618 vm_page_deactivate(m);
619 continue;
620 }
621 return m;
622 }
623 /* not reached */
624}
625
626/*
627 * Find a free or zero page, with specified preference. We attempt to
628 * inline the nominal case and fall back to _vm_page_select_free()
629 * otherwise.
630 *
631 * This routine must be called with a critical section held.
632 * This routine may not block.
633 */
634static __inline vm_page_t
635vm_page_select_free(vm_object_t object, vm_pindex_t pindex, boolean_t prefer_zero)
636{
637 vm_page_t m;
638
639 m = _vm_page_list_find(
640 PQ_FREE,
641 (pindex + object->pg_color) & PQ_L2_MASK,
642 prefer_zero
643 );
644 return(m);
645}
646
647/*
648 * vm_page_alloc()
649 *
650 * Allocate and return a memory cell associated with this VM object/offset
651 * pair.
652 *
653 * page_req classes:
654 *
655 * VM_ALLOC_NORMAL allow use of cache pages, nominal free drain
656 * VM_ALLOC_SYSTEM greater free drain
657 * VM_ALLOC_INTERRUPT allow free list to be completely drained
658 * VM_ALLOC_ZERO advisory request for pre-zero'd page
659 *
660 * The object must be locked.
661 * This routine may not block.
662 * The returned page will be marked PG_BUSY
663 *
664 * Additional special handling is required when called from an interrupt
665 * (VM_ALLOC_INTERRUPT). We are not allowed to mess with the page cache
666 * in this case.
667 */
668vm_page_t
669vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
670{
671 vm_page_t m = NULL;
672
673 KKASSERT(object != NULL);
674 KASSERT(!vm_page_lookup(object, pindex),
675 ("vm_page_alloc: page already allocated"));
676 KKASSERT(page_req &
677 (VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
678
679 /*
680 * Certain system threads (pageout daemon, buf_daemons) are
681 * allowed to eat deeper into the free page list.
682 */
683 if (curthread->td_flags & TDF_SYSTHREAD)
684 page_req |= VM_ALLOC_SYSTEM;
685
686 crit_enter();
687loop:
688 if (vmstats.v_free_count > vmstats.v_free_reserved ||
689 ((page_req & VM_ALLOC_INTERRUPT) && vmstats.v_free_count > 0) ||
690 ((page_req & VM_ALLOC_SYSTEM) && vmstats.v_cache_count == 0 &&
691 vmstats.v_free_count > vmstats.v_interrupt_free_min)
692 ) {
693 /*
694 * The free queue has sufficient free pages to take one out.
695 */
696 if (page_req & VM_ALLOC_ZERO)
697 m = vm_page_select_free(object, pindex, TRUE);
698 else
699 m = vm_page_select_free(object, pindex, FALSE);
700 } else if (page_req & VM_ALLOC_NORMAL) {
701 /*
702 * Allocatable from the cache (non-interrupt only). On
703 * success, we must free the page and try again, thus
704 * ensuring that vmstats.v_*_free_min counters are replenished.
705 */
706#ifdef INVARIANTS
707 if (curthread->td_preempted) {
708 kprintf("vm_page_alloc(): warning, attempt to allocate"
709 " cache page from preempting interrupt\n");
710 m = NULL;
711 } else {
712 m = vm_page_select_cache(object, pindex);
713 }
714#else
715 m = vm_page_select_cache(object, pindex);
716#endif
717 /*
718 * On success move the page into the free queue and loop.
719 */
720 if (m != NULL) {
721 KASSERT(m->dirty == 0,
722 ("Found dirty cache page %p", m));
723 vm_page_busy(m);
724 vm_page_protect(m, VM_PROT_NONE);
725 vm_page_free(m);
726 goto loop;
727 }
728
729 /*
730 * On failure return NULL
731 */
732 crit_exit();
733#if defined(DIAGNOSTIC)
734 if (vmstats.v_cache_count > 0)
735 kprintf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", vmstats.v_cache_count);
736#endif
737 vm_pageout_deficit++;
738 pagedaemon_wakeup();
739 return (NULL);
740 } else {
741 /*
742 * No pages available, wakeup the pageout daemon and give up.
743 */
744 crit_exit();
745 vm_pageout_deficit++;
746 pagedaemon_wakeup();
747 return (NULL);
748 }
749
750 /*
751 * Good page found. The page has not yet been busied. We are in
752 * a critical section.
753 */
754 KASSERT(m != NULL, ("vm_page_alloc(): missing page on free queue\n"));
755 KASSERT(m->dirty == 0,
756 ("vm_page_alloc: free/cache page %p was dirty", m));
757
758 /*
759 * Remove from free queue
760 */
761 vm_page_unqueue_nowakeup(m);
762
763 /*
764 * Initialize structure. Only the PG_ZERO flag is inherited. Set
765 * the page PG_BUSY
766 */
767 if (m->flags & PG_ZERO) {
768 vm_page_zero_count--;
769 m->flags = PG_ZERO | PG_BUSY;
770 } else {
771 m->flags = PG_BUSY;
772 }
773 m->wire_count = 0;
774 m->hold_count = 0;
775 m->act_count = 0;
776 m->busy = 0;
777 m->valid = 0;
778
779 /*
780 * vm_page_insert() is safe prior to the crit_exit(). Note also that
781 * inserting a page here does not insert it into the pmap (which
782 * could cause us to block allocating memory). We cannot block
783 * anywhere.
784 */
785 vm_page_insert(m, object, pindex);
786
787 /*
788 * Don't wakeup too often - wakeup the pageout daemon when
789 * we would be nearly out of memory.
790 */
791 pagedaemon_wakeup();
792
793 crit_exit();
794
795 /*
796 * A PG_BUSY page is returned.
797 */
798 return (m);
799}
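
/*
 * Added illustrative caller-side sketch (not part of the original file):
 * the usual allocate-or-wait loop built from vm_page_alloc() and the
 * vm_wait() routine below.  vm_page_grab() further down implements a more
 * complete version of the same idea.  The helper name is hypothetical.
 */
#if 0	/* illustrative sketch only, never compiled */
static vm_page_t
example_alloc_or_wait(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL)) == NULL) {
		/*
		 * No page was available.  Sleep until the pageout daemon
		 * frees something up, then retry the allocation.
		 */
		vm_wait(0);
	}
	return(m);	/* the returned page is PG_BUSY */
}
#endif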
800
801/*
802 * Block until free pages are available for allocation, called in various
803 * places before memory allocations.
804 */
805void
806vm_wait(int timo)
807{
808 crit_enter();
809 if (curthread == pagethread) {
810 vm_pageout_pages_needed = 1;
811 tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
812 } else {
813 if (vm_pages_needed == 0) {
814 vm_pages_needed = 1;
815 wakeup(&vm_pages_needed);
816 }
817 tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
818 }
819 crit_exit();
820}
821
822/*
823 * Block until free pages are available for allocation
824 *
825 * Called only in vm_fault so that processes page faulting can be
826 * easily tracked.
827 */
828void
829vm_waitpfault(void)
830{
831 crit_enter();
832 if (vm_pages_needed == 0) {
833 vm_pages_needed = 1;
834 wakeup(&vm_pages_needed);
835 }
836 tsleep(&vmstats.v_free_count, 0, "pfault", 0);
837 crit_exit();
838}
839
840/*
841 * Put the specified page on the active list (if appropriate). Ensure
842 * that act_count is at least ACT_INIT but do not otherwise mess with it.
843 *
844 * The page queues must be locked.
845 * This routine may not block.
846 */
847void
848vm_page_activate(vm_page_t m)
849{
850 crit_enter();
851 if (m->queue != PQ_ACTIVE) {
852 if ((m->queue - m->pc) == PQ_CACHE)
853 mycpu->gd_cnt.v_reactivated++;
854
855 vm_page_unqueue(m);
856
857 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
858 m->queue = PQ_ACTIVE;
859 vm_page_queues[PQ_ACTIVE].lcnt++;
860 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl,
861 m, pageq);
862 if (m->act_count < ACT_INIT)
863 m->act_count = ACT_INIT;
864 vmstats.v_active_count++;
865 }
866 } else {
867 if (m->act_count < ACT_INIT)
868 m->act_count = ACT_INIT;
869 }
870 crit_exit();
871}
872
873/*
874 * Helper routine for vm_page_free_toq() and vm_page_cache(). This
875 * routine is called when a page has been added to the cache or free
876 * queues.
877 *
878 * This routine may not block.
879 * This routine must be called at splvm()
880 */
881static __inline void
882vm_page_free_wakeup(void)
883{
884 /*
885 * If the pageout daemon needs pages, tell it that there are
886 * some free.
887 */
888 if (vm_pageout_pages_needed &&
889 vmstats.v_cache_count + vmstats.v_free_count >=
890 vmstats.v_pageout_free_min
891 ) {
892 wakeup(&vm_pageout_pages_needed);
893 vm_pageout_pages_needed = 0;
894 }
895
896 /*
897 * Wakeup processes that are waiting on memory if we hit a
898 * high water mark, and wakeup the scheduler process if we have
899 * lots of memory; it will swap processes back in.
900 */
901 if (vm_pages_needed && !vm_page_count_min(0)) {
902 vm_pages_needed = 0;
903 wakeup(&vmstats.v_free_count);
904 }
905}
906
907/*
908 * vm_page_free_toq:
909 *
910 * Returns the given page to the PQ_FREE list, disassociating it with
911 * any VM object.
912 *
913 * The vm_page must be PG_BUSY on entry. PG_BUSY will be released on
914 * return (the page will have been freed). No particular spl is required
915 * on entry.
916 *
917 * This routine may not block.
918 */
919void
920vm_page_free_toq(vm_page_t m)
921{
922 struct vpgqueues *pq;
923
924 crit_enter();
925 mycpu->gd_cnt.v_tfree++;
926
927 KKASSERT((m->flags & PG_MAPPED) == 0);
928
929 if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
930 kprintf(
931 "vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
932 (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
933 m->hold_count);
934 if ((m->queue - m->pc) == PQ_FREE)
935 panic("vm_page_free: freeing free page");
936 else
937 panic("vm_page_free: freeing busy page");
938 }
939
940 /*
941 * unqueue, then remove page. Note that we cannot destroy
942 * the page here because we do not want to call the pager's
943 * callback routine until after we've put the page on the
944 * appropriate free queue.
945 */
946 vm_page_unqueue_nowakeup(m);
947 vm_page_remove(m);
948
949 /*
950 * No further management of fictitious pages occurs beyond object
951 * and queue removal.
952 */
953 if ((m->flags & PG_FICTITIOUS) != 0) {
954 vm_page_wakeup(m);
955 crit_exit();
956 return;
957 }
958
959 m->valid = 0;
960 vm_page_undirty(m);
961
962 if (m->wire_count != 0) {
963 if (m->wire_count > 1) {
964 panic(
965 "vm_page_free: invalid wire count (%d), pindex: 0x%lx",
966 m->wire_count, (long)m->pindex);
967 }
968 panic("vm_page_free: freeing wired page");
969 }
970
971 /*
972 * Clear the UNMANAGED flag when freeing an unmanaged page.
973 */
974 if (m->flags & PG_UNMANAGED) {
975 m->flags &= ~PG_UNMANAGED;
976 }
977
978 if (m->hold_count != 0) {
979 m->flags &= ~PG_ZERO;
980 m->queue = PQ_HOLD;
981 } else {
982 m->queue = PQ_FREE + m->pc;
983 }
984 pq = &vm_page_queues[m->queue];
985 pq->lcnt++;
986 ++(*pq->cnt);
987
988 /*
989 * Put zero'd pages on the end ( where we look for zero'd pages
990 * first ) and non-zero'd pages at the head.
991 */
992 if (m->flags & PG_ZERO) {
993 TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
994 ++vm_page_zero_count;
995 } else {
996 TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
997 }
998 vm_page_wakeup(m);
999 vm_page_free_wakeup();
1000 crit_exit();
1001}
1002
1003/*
1004 * vm_page_unmanage()
1005 *
1006 * Prevent PV management from being done on the page. The page is
1007 * removed from the paging queues as if it were wired, and as a
1008 * consequence of no longer being managed the pageout daemon will not
1009 * touch it (since there is no way to locate the pte mappings for the
1010 * page). madvise() calls that mess with the pmap will also no longer
1011 * operate on the page.
1012 *
1013 * Beyond that the page is still reasonably 'normal'. Freeing the page
1014 * will clear the flag.
1015 *
1016 * This routine is used by OBJT_PHYS objects - objects using unswappable
1017 * physical memory as backing store rather than swap-backed memory and
1018 * will eventually be extended to support 4MB unmanaged physical
1019 * mappings.
1020 *
1021 * Must be called with a critical section held.
1022 */
1023void
1024vm_page_unmanage(vm_page_t m)
1025{
1026 ASSERT_IN_CRIT_SECTION();
1027 if ((m->flags & PG_UNMANAGED) == 0) {
1028 if (m->wire_count == 0)
1029 vm_page_unqueue(m);
1030 }
1031 vm_page_flag_set(m, PG_UNMANAGED);
1032}
1033
1034/*
1035 * Mark this page as wired down by yet another map, removing it from
1036 * paging queues as necessary.
1037 *
1038 * The page queues must be locked.
1039 * This routine may not block.
1040 */
1041void
1042vm_page_wire(vm_page_t m)
1043{
1044 /*
1045 * Only bump the wire statistics if the page is not already wired,
1046 * and only unqueue the page if it is on some queue (if it is unmanaged
1047 * it is already off the queues). Don't do anything with fictitious
1048 * pages because they are always wired.
1049 */
1050 crit_enter();
1051 if ((m->flags & PG_FICTITIOUS) == 0) {
1052 if (m->wire_count == 0) {
1053 if ((m->flags & PG_UNMANAGED) == 0)
1054 vm_page_unqueue(m);
1055 vmstats.v_wire_count++;
1056 }
1057 m->wire_count++;
1058 KASSERT(m->wire_count != 0,
1059 ("vm_page_wire: wire_count overflow m=%p", m));
1060 }
1061 crit_exit();
1062}
1063
1064/*
1065 * Release one wiring of this page, potentially enabling it to be paged again.
1066 *
1067 * Many pages placed on the inactive queue should actually go
1068 * into the cache, but it is difficult to figure out which. What
1069 * we do instead, if the inactive target is well met, is to put
1070 * clean pages at the head of the inactive queue instead of the tail.
1071 * This will cause them to be moved to the cache more quickly and
1072 * if not actively re-referenced, freed more quickly. If we just
1073 * stick these pages at the end of the inactive queue, heavy filesystem
1074 * meta-data accesses can cause an unnecessary paging load on memory bound
1075 * processes. This optimization causes one-time-use metadata to be
1076 * reused more quickly.
1077 *
1078 * BUT, if we are in a low-memory situation we have no choice but to
1079 * put clean pages on the cache queue.
1080 *
1081 * A number of routines use vm_page_unwire() to guarantee that the page
1082 * will go into either the inactive or active queues, and will NEVER
1083 * be placed in the cache - for example, just after dirtying a page.
1084 * Dirty pages in the cache are not allowed.
1085 *
1086 * The page queues must be locked.
1087 * This routine may not block.
1088 */
1089void
1090vm_page_unwire(vm_page_t m, int activate)
1091{
1092 crit_enter();
1093 if (m->flags & PG_FICTITIOUS) {
1094 /* do nothing */
1095 } else if (m->wire_count <= 0) {
1096 panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
1097 } else {
1098 if (--m->wire_count == 0) {
1099 --vmstats.v_wire_count;
1100 if (m->flags & PG_UNMANAGED) {
1101 ;
1102 } else if (activate) {
1103 TAILQ_INSERT_TAIL(
1104 &vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1105 m->queue = PQ_ACTIVE;
1106 vm_page_queues[PQ_ACTIVE].lcnt++;
1107 vmstats.v_active_count++;
1108 } else {
1109 vm_page_flag_clear(m, PG_WINATCFLS);
1110 TAILQ_INSERT_TAIL(
1111 &vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1112 m->queue = PQ_INACTIVE;
1113 vm_page_queues[PQ_INACTIVE].lcnt++;
1114 vmstats.v_inactive_count++;
1115 }
1116 }
1117 }
1118 crit_exit();
1119}
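
/*
 * Added illustrative sketch (not part of the original file) of the
 * typical wire/unwire pairing: wire the page while it is used outside
 * of normal paging (e.g. for I/O), then unwire it when done.  The helper
 * name is hypothetical.
 */
#if 0	/* illustrative sketch only, never compiled */
static void
example_wire_for_io(vm_page_t m)
{
	vm_page_wire(m);	/* removes the page from the paging queues */

	/* ... use the page for as long as necessary ... */

	vm_page_unwire(m, 1);	/* requeue on PQ_ACTIVE when the count hits 0 */
}
#endif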
1120
1121
1122/*
1123 * Move the specified page to the inactive queue. If the page has
1124 * any associated swap, the swap is deallocated.
1125 *
1126 * Normally athead is 0 resulting in LRU operation. athead is set
1127 * to 1 if we want this page to be 'as if it were placed in the cache',
1128 * except without unmapping it from the process address space.
1129 *
1130 * This routine may not block.
1131 */
1132static __inline void
1133_vm_page_deactivate(vm_page_t m, int athead)
1134{
1135 /*
1136 * Ignore if already inactive.
1137 */
1138 if (m->queue == PQ_INACTIVE)
1139 return;
1140
1141 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
1142 if ((m->queue - m->pc) == PQ_CACHE)
1143 mycpu->gd_cnt.v_reactivated++;
1144 vm_page_flag_clear(m, PG_WINATCFLS);
1145 vm_page_unqueue(m);
1146 if (athead)
1147 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1148 else
1149 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1150 m->queue = PQ_INACTIVE;
1151 vm_page_queues[PQ_INACTIVE].lcnt++;
1152 vmstats.v_inactive_count++;
1153 }
1154}
1155
1156void
1157vm_page_deactivate(vm_page_t m)
1158{
1159 crit_enter();
1160 _vm_page_deactivate(m, 0);
1161 crit_exit();
1162}
1163
1164/*
1165 * vm_page_try_to_cache:
1166 *
1167 * Returns 0 on failure, 1 on success
1168 */
1169int
1170vm_page_try_to_cache(vm_page_t m)
1171{
1172 crit_enter();
1173 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1174 (m->flags & (PG_BUSY|PG_UNMANAGED))) {
1175 crit_exit();
1176 return(0);
1177 }
1178 vm_page_test_dirty(m);
1179 if (m->dirty) {
1180 crit_exit();
1181 return(0);
1182 }
1183 vm_page_cache(m);
1184 crit_exit();
1185 return(1);
1186}
1187
1188/*
1189 * Attempt to free the page. If we cannot free it, we do nothing.
1190 * 1 is returned on success, 0 on failure.
1191 */
1192int
1193vm_page_try_to_free(vm_page_t m)
1194{
1195 crit_enter();
1196 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1197 (m->flags & (PG_BUSY|PG_UNMANAGED))) {
1198 crit_exit();
1199 return(0);
1200 }
1201 vm_page_test_dirty(m);
1202 if (m->dirty) {
1203 crit_exit();
1204 return(0);
1205 }
1206 vm_page_busy(m);
1207 vm_page_protect(m, VM_PROT_NONE);
1208 vm_page_free(m);
1209 crit_exit();
1210 return(1);
1211}
1212
1213/*
1214 * vm_page_cache
1215 *
1216 * Put the specified page onto the page cache queue (if appropriate).
1217 *
1218 * This routine may not block.
1219 */
1220void
1221vm_page_cache(vm_page_t m)
1222{
1223 ASSERT_IN_CRIT_SECTION();
1224
1225 if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
1226 m->wire_count || m->hold_count) {
1227 kprintf("vm_page_cache: attempting to cache busy/held page\n");
1228 return;
1229 }
1230
1231 /*
1232 * Already in the cache (and thus not mapped)
1233 */
1234 if ((m->queue - m->pc) == PQ_CACHE) {
1235 KKASSERT((m->flags & PG_MAPPED) == 0);
1236 return;
1237 }
1238
1239 /*
1240 * Caller is required to test m->dirty, but note that the act of
1241 * removing the page from its maps can cause it to become dirty
1242 * on an SMP system due to another cpu running in usermode.
1243 */
1244 if (m->dirty) {
1245 panic("vm_page_cache: caching a dirty page, pindex: %ld",
1246 (long)m->pindex);
1247 }
1248
1249 /*
1250 * Remove all pmaps and indicate that the page is not
1251 * writeable or mapped. Our vm_page_protect() call may
1252 * have blocked (especially w/ VM_PROT_NONE), so recheck
1253 * everything.
1254 */
1255 vm_page_busy(m);
1256 vm_page_protect(m, VM_PROT_NONE);
1257 vm_page_wakeup(m);
1258 if ((m->flags & (PG_BUSY|PG_UNMANAGED|PG_MAPPED)) || m->busy ||
1259 m->wire_count || m->hold_count) {
1260 /* do nothing */
1261 } else if (m->dirty) {
1262 vm_page_deactivate(m);
1263 } else {
1264 vm_page_unqueue_nowakeup(m);
1265 m->queue = PQ_CACHE + m->pc;
1266 vm_page_queues[m->queue].lcnt++;
1267 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1268 vmstats.v_cache_count++;
1269 vm_page_free_wakeup();
1270 }
1271}
1272
1273/*
1274 * vm_page_dontneed()
1275 *
1276 * Cache, deactivate, or do nothing as appropriate. This routine
1277 * is typically used by madvise() MADV_DONTNEED.
1278 *
1279 * Generally speaking we want to move the page into the cache so
1280 * it gets reused quickly. However, this can result in a silly syndrome
1281 * due to the page recycling too quickly. Small objects will not be
1282 * fully cached.  On the other hand, if we move the page to the inactive
1283 * queue we wind up with a problem whereby very large objects
1284 * unnecessarily blow away our inactive and cache queues.
1285 *
1286 * The solution is to move the pages based on a fixed weighting. We
1287 * either leave them alone, deactivate them, or move them to the cache,
1288 * where moving them to the cache has the highest weighting.
1289 * By forcing some pages into other queues we eventually force the
1290 * system to balance the queues, potentially recovering other unrelated
1291 * space from active. The idea is to not force this to happen too
1292 * often.
1293 */
1294void
1295vm_page_dontneed(vm_page_t m)
1296{
1297 static int dnweight;
1298 int dnw;
1299 int head;
1300
1301 dnw = ++dnweight;
1302
1303 /*
1304 * occasionally leave the page alone
1305 */
1306 crit_enter();
1307 if ((dnw & 0x01F0) == 0 ||
1308 m->queue == PQ_INACTIVE ||
1309 m->queue - m->pc == PQ_CACHE
1310 ) {
1311 if (m->act_count >= ACT_INIT)
1312 --m->act_count;
1313 crit_exit();
1314 return;
1315 }
1316
1317 if (m->dirty == 0)
1318 vm_page_test_dirty(m);
1319
1320 if (m->dirty || (dnw & 0x0070) == 0) {
1321 /*
1322 * Deactivate the page 3 times out of 32.
1323 */
1324 head = 0;
1325 } else {
1326 /*
1327 * Cache the page 28 times out of every 32. Note that
1328 * the page is deactivated instead of cached, but placed
1329 * at the head of the queue instead of the tail.
1330 */
1331 head = 1;
1332 }
1333 _vm_page_deactivate(m, head);
1334 crit_exit();
1335}
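
/*
 * Added worked check (not part of the original file) of the 1/3/28 out
 * of 32 weighting described above, for a counter that increments by one
 * on every call.  Dirty pages additionally always take the deactivation
 * path regardless of the counter.
 */
#if 0	/* illustrative sketch only, never compiled */
static void
example_dontneed_ratios(void)
{
	int leave = 0, deact = 0, cache = 0, dnw;

	for (dnw = 1; dnw <= 512; ++dnw) {
		if ((dnw & 0x01F0) == 0)
			++leave;	/* 16 of 512, i.e. 1 in 32: leave alone */
		else if ((dnw & 0x0070) == 0)
			++deact;	/* 48 of 512, i.e. 3 in 32: deactivate */
		else
			++cache;	/* 448 of 512, i.e. 28 in 32: cache */
	}
	/* leave == 16, deact == 48, cache == 448 */
}
#endif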
1336
1337/*
1338 * Grab a page, blocking if it is busy and allocating a page if necessary.
1339 * A busy page is returned or NULL.
1340 *
1341 * If VM_ALLOC_RETRY is specified VM_ALLOC_NORMAL must also be specified.
1342 * If VM_ALLOC_RETRY is not specified the routine may return NULL.
1343 *
1344 * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
1345 * always returned if we had blocked.
1346 * This routine will never return NULL if VM_ALLOC_RETRY is set.
1347 * This routine may not be called from an interrupt.
1348 * The returned page may not be entirely valid.
1349 *
1350 * This routine may be called from mainline code without spl protection and
1351 * be guaranteed a busied page associated with the object at the specified
1352 * index.
1353 */
1354vm_page_t
1355vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
1356{
1357 vm_page_t m;
1358 int generation;
1359
1360 KKASSERT(allocflags &
1361 (VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
1362 crit_enter();
1363retrylookup:
1364 if ((m = vm_page_lookup(object, pindex)) != NULL) {
1365 if (m->busy || (m->flags & PG_BUSY)) {
1366 generation = object->generation;
1367
1368 while ((object->generation == generation) &&
1369 (m->busy || (m->flags & PG_BUSY))) {
1370 vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
1371 tsleep(m, 0, "pgrbwt", 0);
1372 if ((allocflags & VM_ALLOC_RETRY) == 0) {
1373 m = NULL;
1374 goto done;
1375 }
1376 }
1377 goto retrylookup;
1378 } else {
1379 vm_page_busy(m);
1380 goto done;
1381 }
1382 }
1383 m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
1384 if (m == NULL) {
1385 vm_wait(0);
1386 if ((allocflags & VM_ALLOC_RETRY) == 0)
1387 goto done;
1388 goto retrylookup;
1389 }
1390done:
1391 crit_exit();
1392 return(m);
1393}
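
/*
 * Added illustrative sketch (not part of the original file) of a typical
 * vm_page_grab() caller: with VM_ALLOC_NORMAL|VM_ALLOC_RETRY a busied
 * page is always returned and the caller wakes it up when finished.  The
 * helper name is hypothetical.
 */
#if 0	/* illustrative sketch only, never compiled */
static void
example_grab_and_fill(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);

	/* the page is PG_BUSY but may not be entirely valid */
	if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)
		vm_page_zero_invalid(m, TRUE);	/* e.g. zero-fill the rest */

	vm_page_wakeup(m);			/* release PG_BUSY */
}
#endif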
1394
1395/*
1396 * Mapping function for valid bits or for dirty bits in
1397 * a page. May not block.
1398 *
1399 * Inputs are required to range within a page.
1400 */
1401__inline int
1402vm_page_bits(int base, int size)
1403{
1404 int first_bit;
1405 int last_bit;
1406
1407 KASSERT(
1408 base + size <= PAGE_SIZE,
1409 ("vm_page_bits: illegal base/size %d/%d", base, size)
1410 );
1411
1412 if (size == 0) /* handle degenerate case */
1413 return(0);
1414
1415 first_bit = base >> DEV_BSHIFT;
1416 last_bit = (base + size - 1) >> DEV_BSHIFT;
1417
1418 return ((2 << last_bit) - (1 << first_bit));
1419}
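
/*
 * Worked examples of the mapping above (added commentary, not part of
 * the original file), assuming PAGE_SIZE 4096 and DEV_BSIZE 512
 * (DEV_BSHIFT == 9):
 *
 *	vm_page_bits(0, 4096)	-> first_bit 0, last_bit 7 -> 0xff (all chunks)
 *	vm_page_bits(0, 512)	-> first_bit 0, last_bit 0 -> 0x01
 *	vm_page_bits(512, 1024)	-> first_bit 1, last_bit 2 -> 0x06
 *	vm_page_bits(100, 200)	-> first_bit 0, last_bit 0 -> 0x01 (partial chunk)
 */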
1420
1421/*
1422 * Sets portions of a page valid and clean. The arguments are expected
1423 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
1424 * of any partial chunks touched by the range. The invalid portion of
1425 * such chunks will be zero'd.
1426 *
1427 * This routine may not block.
1428 *
1429 * (base + size) must be less than or equal to PAGE_SIZE.
1430 */
1431void
1432vm_page_set_validclean(vm_page_t m, int base, int size)
1433{
1434 int pagebits;
1435 int frag;
1436 int endoff;
1437
1438 if (size == 0) /* handle degenerate case */
1439 return;
1440
1441 /*
1442 * If the base is not DEV_BSIZE aligned and the valid
1443 * bit is clear, we have to zero out a portion of the
1444 * first block.
1445 */
1446
1447 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
1448 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
1449 ) {
1450 pmap_zero_page_area(
1451 VM_PAGE_TO_PHYS(m),
1452 frag,
1453 base - frag
1454 );
1455 }
1456
1457 /*
1458 * If the ending offset is not DEV_BSIZE aligned and the
1459 * valid bit is clear, we have to zero out a portion of
1460 * the last block.
1461 */
1462
1463 endoff = base + size;
1464
1465 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
1466 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
1467 ) {
1468 pmap_zero_page_area(
1469 VM_PAGE_TO_PHYS(m),
1470 endoff,
1471 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
1472 );
1473 }
1474
1475 /*
1476 * Set valid, clear dirty bits. If validating the entire
1477 * page we can safely clear the pmap modify bit. We also
1478 * use this opportunity to clear the PG_NOSYNC flag. If a process
1479 * takes a write fault on a MAP_NOSYNC memory area the flag will
1480 * be set again.
1481 *
1482 * We set valid bits inclusive of any overlap, but we can only
1483 * clear dirty bits for DEV_BSIZE chunks that are fully within
1484 * the range.
1485 */
1486
1487 pagebits = vm_page_bits(base, size);
1488 m->valid |= pagebits;
1489#if 0 /* NOT YET */
1490 if ((frag = base & (DEV_BSIZE - 1)) != 0) {
1491 frag = DEV_BSIZE - frag;
1492 base += frag;
1493 size -= frag;
1494 if (size < 0)
1495 size = 0;
1496 }
1497 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
1498#endif
1499 m->dirty &= ~pagebits;
1500 if (base == 0 && size == PAGE_SIZE) {
1501 pmap_clear_modify(m);
1502 vm_page_flag_clear(m, PG_NOSYNC);
1503 }
1504}
1505
1506void
1507vm_page_clear_dirty(vm_page_t m, int base, int size)
1508{
1509 m->dirty &= ~vm_page_bits(base, size);
1510}
1511
1512/*
1513 * Make the page all-dirty.
1514 *
1515 * Also make sure the related object and vnode reflect the fact that the
1516 * object may now contain a dirty page.
1517 */
1518void
1519vm_page_dirty(vm_page_t m)
1520{
1521#ifdef INVARIANTS
1522 int pqtype = m->queue - m->pc;
1523#endif
1524 KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
1525 ("vm_page_dirty: page in free/cache queue!"));
1526 if (m->dirty != VM_PAGE_BITS_ALL) {
1527 m->dirty = VM_PAGE_BITS_ALL;
1528 if (m->object)
1529 vm_object_set_writeable_dirty(m->object);
1530 }
1531}
1532
1533/*
1534 * Invalidates DEV_BSIZE'd chunks within a page. Both the
1535 * valid and dirty bits for the affected areas are cleared.
1536 *
1537 * May not block.
1538 */
1539void
1540vm_page_set_invalid(vm_page_t m, int base, int size)
1541{
1542 int bits;
1543
1544 bits = vm_page_bits(base, size);
1545 m->valid &= ~bits;
1546 m->dirty &= ~bits;
1547 m->object->generation++;
1548}
1549
1550/*
1551 * The kernel assumes that the invalid portions of a page contain
1552 * garbage, but such pages can be mapped into memory by user code.
1553 * When this occurs, we must zero out the non-valid portions of the
1554 * page so user code sees what it expects.
1555 *
1556 * Pages are most often semi-valid when the end of a file is mapped
1557 * into memory and the file's size is not page aligned.
1558 */
1559void
1560vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
1561{
1562 int b;
1563 int i;
1564
1565 /*
1566 * Scan the valid bits looking for invalid sections that
1567 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas ( where the
1568 * valid bit may be set ) have already been zeroed by
1569 * vm_page_set_validclean().
1570 */
1571 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
1572 if (i == (PAGE_SIZE / DEV_BSIZE) ||
1573 (m->valid & (1 << i))
1574 ) {
1575 if (i > b) {
1576 pmap_zero_page_area(
1577 VM_PAGE_TO_PHYS(m),
1578 b << DEV_BSHIFT,
1579 (i - b) << DEV_BSHIFT
1580 );
1581 }
1582 b = i + 1;
1583 }
1584 }
1585
1586 /*
1587 * setvalid is TRUE when we can safely set the zero'd areas
1588 * as being valid. We can do this if there are no cache consistency
1589 * issues. e.g. it is ok to do with UFS, but not ok to do with NFS.
1590 */
1591 if (setvalid)
1592 m->valid = VM_PAGE_BITS_ALL;
1593}
1594
1595/*
1596 * Is a (partial) page valid?  Note that in the degenerate case where
1597 * size == 0 this routine returns FALSE if the page is entirely invalid,
1598 * and TRUE otherwise.
1599 *
1600 * May not block.
1601 */
1602int
1603vm_page_is_valid(vm_page_t m, int base, int size)
1604{
1605 int bits = vm_page_bits(base, size);
1606
1607 if (m->valid && ((m->valid & bits) == bits))
1608 return 1;
1609 else
1610 return 0;
1611}
1612
1613/*
1614 * Update dirty bits from the pmap/mmu.  May not block.
1615 */
1616void
1617vm_page_test_dirty(vm_page_t m)
1618{
1619 if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
1620 vm_page_dirty(m);
1621 }
1622}
1623
1624/*
1625 * Issue an event on a VM page. Corresponding action structures are
1626 * removed from the page's list and called.
1627 */
1628void
1629vm_page_event_internal(vm_page_t m, vm_page_event_t event)
1630{
1631 struct vm_page_action *scan, *next;
1632
1633 LIST_FOREACH_MUTABLE(scan, &m->action_list, entry, next) {
1634 if (scan->event == event) {
1635 scan->event = VMEVENT_NONE;
1636 LIST_REMOVE(scan, entry);
1637 scan->func(m, scan);
1638 }
1639 }
1640}
1641
1642#include "opt_ddb.h"
1643#ifdef DDB
1644#include <sys/kernel.h>
1645
1646#include <ddb/ddb.h>
1647
1648DB_SHOW_COMMAND(page, vm_page_print_page_info)
1649{
1650 db_printf("vmstats.v_free_count: %d\n", vmstats.v_free_count);
1651 db_printf("vmstats.v_cache_count: %d\n", vmstats.v_cache_count);
1652 db_printf("vmstats.v_inactive_count: %d\n", vmstats.v_inactive_count);
1653 db_printf("vmstats.v_active_count: %d\n", vmstats.v_active_count);
1654 db_printf("vmstats.v_wire_count: %d\n", vmstats.v_wire_count);
1655 db_printf("vmstats.v_free_reserved: %d\n", vmstats.v_free_reserved);
1656 db_printf("vmstats.v_free_min: %d\n", vmstats.v_free_min);
1657 db_printf("vmstats.v_free_target: %d\n", vmstats.v_free_target);
1658 db_printf("vmstats.v_cache_min: %d\n", vmstats.v_cache_min);
1659 db_printf("vmstats.v_inactive_target: %d\n", vmstats.v_inactive_target);
1660}
1661
1662DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
1663{
1664 int i;
1665 db_printf("PQ_FREE:");
1666 for(i=0;i<PQ_L2_SIZE;i++) {
1667 db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
1668 }
1669 db_printf("\n");
1670
1671 db_printf("PQ_CACHE:");
1672 for(i=0;i<PQ_L2_SIZE;i++) {
1673 db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
1674 }
1675 db_printf("\n");
1676
1677 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
1678 vm_page_queues[PQ_ACTIVE].lcnt,
1679 vm_page_queues[PQ_INACTIVE].lcnt);
1680}
1681#endif /* DDB */