/*
 * (MPSAFE)
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_fault.c	8.4 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_fault.c,v 1.108.2.8 2002/02/26 05:49:27 silby Exp $
 * $DragonFly: src/sys/vm/vm_fault.c,v 1.47 2008/07/01 02:02:56 dillon Exp $
 */

/*
 * Page fault handling module.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/vkernel.h>
#include <sys/lock.h>
#include <sys/sysctl.h>

#include <cpu/lwbuf.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <vm/vm_page2.h>

struct faultstate {
        vm_page_t m;                    /* page being worked on */
        vm_object_t object;             /* object currently being searched */
        vm_pindex_t pindex;             /* page index within object */
        vm_prot_t prot;                 /* protection for the pmap entry */
        vm_page_t first_m;              /* placeholder page in first_object */
        vm_object_t first_object;       /* top-level object from the lookup */
        vm_prot_t first_prot;           /* protection from the map lookup */
        vm_map_t map;
        vm_map_entry_t entry;
        int lookup_still_valid;         /* map is still read-locked */
        int didlimit;                   /* rate-limit already applied once */
        int hardfault;                  /* I/O was required */
        int fault_flags;
        int map_generation;             /* map timestamp to detect races */
        boolean_t wired;
        struct vnode *vp;               /* locked vnode, if any */
};

static int vm_fast_fault = 1;
SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0, "");
static int debug_cluster = 0;
SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");

static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t);
static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int);
#if 0
static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *);
#endif
static int vm_fault_ratelimit(struct vmspace *);
static void vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry,
                        int prot);

/*
 * The caller must hold vm_token.
 */
static __inline void
release_page(struct faultstate *fs)
{
        vm_page_deactivate(fs->m);
        vm_page_wakeup(fs->m);
        fs->m = NULL;
}

/*
 * The caller must hold vm_token.
 */
static __inline void
unlock_map(struct faultstate *fs)
{
        if (fs->lookup_still_valid && fs->map) {
                vm_map_lookup_done(fs->map, fs->entry, 0);
                fs->lookup_still_valid = FALSE;
        }
}

/*
 * Clean up after a successful call to vm_fault_object() so another call
 * to vm_fault_object() can be made.
 *
 * The caller must hold vm_token.
 */
static void
_cleanup_successful_fault(struct faultstate *fs, int relock)
{
        if (fs->object != fs->first_object) {
                vm_page_free(fs->first_m);
                vm_object_pip_wakeup(fs->object);
                fs->first_m = NULL;
        }
        fs->object = fs->first_object;
        if (relock && fs->lookup_still_valid == FALSE) {
                if (fs->map)
                        vm_map_lock_read(fs->map);
                fs->lookup_still_valid = TRUE;
        }
}

/*
 * The caller must hold vm_token.
 */
static void
_unlock_things(struct faultstate *fs, int dealloc)
{
        vm_object_pip_wakeup(fs->first_object);
        _cleanup_successful_fault(fs, 0);
        if (dealloc) {
                vm_object_deallocate(fs->first_object);
                fs->first_object = NULL;
        }
        unlock_map(fs);
        if (fs->vp != NULL) {
                vput(fs->vp);
                fs->vp = NULL;
        }
}

#define unlock_things(fs) _unlock_things(fs, 0)
#define unlock_and_deallocate(fs) _unlock_things(fs, 1)
#define cleanup_successful_fault(fs) _cleanup_successful_fault(fs, 1)

/*
 * TRYPAGER
 *
 * Determine if the pager for the current object *might* contain the page.
 *
 * We only need to try the pager if this is not a default object (default
 * objects are zero-fill and have no real pager), and if we are not taking
 * a wiring fault or if the FS entry is wired.
 */
#define TRYPAGER(fs)    \
                (fs->object->type != OBJT_DEFAULT && \
                (((fs->fault_flags & VM_FAULT_WIRE_MASK) == 0) || fs->wired))
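
/*
 * For example, a fresh anonymous mapping is backed by an OBJT_DEFAULT
 * object, so TRYPAGER() is false and vm_fault_object() falls through to
 * the zero-fill path instead of issuing pager I/O, while a file mapping
 * is backed by an OBJT_VNODE object and consults the vnode pager first.
 */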

/*
 * vm_fault:
 *
 * Handle a page fault occurring at the given address, requiring the given
 * permissions, in the map specified.  If successful, the page is inserted
 * into the associated physical map.
 *
 * NOTE: The given address should be truncated to the proper page address.
 *
 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
 * a standard error specifying why the fault is fatal is returned.
 *
 * The map in question must be referenced, and remains so.
 * The caller may hold no locks.
 * No other requirements.
 */
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
{
        int result;
        vm_pindex_t first_pindex;
        struct faultstate fs;
        int growstack;

        mycpu->gd_cnt.v_vm_faults++;

        fs.didlimit = 0;
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        growstack = 1;

RetryFault:
        /*
         * Find the vm_map_entry representing the backing store and resolve
         * the top level object and page index.  This may have the side
         * effect of executing a copy-on-write on the map entry and/or
         * creating a shadow object, but will not COW any actual VM pages.
         *
         * On success fs.map is left read-locked and various other fields
         * are initialized but not otherwise referenced or locked.
         *
         * NOTE!  vm_map_lookup will try to upgrade the fault_type to
         * VM_FAULT_WRITE if the map entry is a virtual page table and also
         * writable, so we can set the 'A'ccessed bit in the virtual page
         * table entry.
         */
        fs.map = map;
        result = vm_map_lookup(&fs.map, vaddr, fault_type,
                               &fs.entry, &fs.first_object,
                               &first_pindex, &fs.first_prot, &fs.wired);

        /*
         * If the lookup failed or the map protections are incompatible,
         * the fault generally fails.  However, if the caller is trying
         * to do a user wiring we have more work to do.
         */
        if (result != KERN_SUCCESS) {
                if (result != KERN_PROTECTION_FAILURE ||
                    (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
                {
                        if (result == KERN_INVALID_ADDRESS && growstack &&
                            map != &kernel_map && curproc != NULL) {
                                result = vm_map_growstack(curproc, vaddr);
                                if (result != KERN_SUCCESS)
                                        return (KERN_FAILURE);
                                growstack = 0;
                                goto RetryFault;
                        }
                        return (result);
                }

                /*
                 * If we are user-wiring a r/w segment, and it is COW, then
                 * we need to do the COW operation.  Note that we don't
                 * currently COW RO sections, because it is NOT desirable
                 * to COW .text.  We simply keep .text from ever being COW'ed
                 * and take the heat that one cannot debug wired .text sections.
                 */
                result = vm_map_lookup(&fs.map, vaddr,
                                       VM_PROT_READ|VM_PROT_WRITE|
                                        VM_PROT_OVERRIDE_WRITE,
                                       &fs.entry, &fs.first_object,
                                       &first_pindex, &fs.first_prot,
                                       &fs.wired);
                if (result != KERN_SUCCESS)
                        return result;

                /*
                 * If we don't COW now, on a user wire, the user will never
                 * be able to write to the mapping.  If we don't make this
                 * restriction, the bookkeeping would be nearly impossible.
                 */
                if ((fs.entry->protection & VM_PROT_WRITE) == 0)
                        fs.entry->max_protection &= ~VM_PROT_WRITE;
        }

        /*
         * fs.map is read-locked
         *
         * Misc checks.  Save the map generation number to detect races.
         */
        fs.map_generation = fs.map->timestamp;

        if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
                panic("vm_fault: fault on nofault entry, addr: %lx",
                    (u_long)vaddr);
        }

        /*
         * A system map entry may return a NULL object.  No object means
         * no pager means an unrecoverable kernel fault.
         */
        if (fs.first_object == NULL) {
                panic("vm_fault: unrecoverable fault at %p in entry %p",
                        (void *)vaddr, fs.entry);
        }

        /*
         * Make a reference to this object to prevent its disposal while we
         * are messing with it.  Once we have the reference, the map is free
         * to be diddled.  Since objects reference their shadows (and copies),
         * they will stay around as well.
         *
         * Bump the paging-in-progress count to prevent size changes (e.g.
         * truncation operations) during I/O.  This must be done after
         * obtaining the vnode lock in order to avoid possible deadlocks.
         *
         * The vm_token is needed to manipulate the vm_object.
         */
        lwkt_gettoken(&vm_token);
        vm_object_reference(fs.first_object);
        fs.vp = vnode_pager_lock(fs.first_object);
        vm_object_pip_add(fs.first_object, 1);
        lwkt_reltoken(&vm_token);

        fs.lookup_still_valid = TRUE;
        fs.first_m = NULL;
        fs.object = fs.first_object;    /* so unlock_and_deallocate works */

        /*
         * If the entry is wired we cannot change the page protection.
         */
        if (fs.wired)
                fault_type = fs.first_prot;

        /*
         * The page we want is at (first_object, first_pindex), but if the
         * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
         * page table to figure out the actual pindex.
         *
         * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
         * ONLY
         */
        if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
                result = vm_fault_vpagetable(&fs, &first_pindex,
                                             fs.entry->aux.master_pde,
                                             fault_type);
                if (result == KERN_TRY_AGAIN)
                        goto RetryFault;
                if (result != KERN_SUCCESS)
                        return (result);
        }

        /*
         * Now we have the actual (object, pindex), fault in the page.  If
         * vm_fault_object() fails it will unlock and deallocate the FS
         * data.   If it succeeds everything remains locked and fs->object
         * will have an additional PIP count if it is not equal to
         * fs->first_object.
         *
         * vm_fault_object will set fs->prot for the pmap operation.  It is
         * allowed to set VM_PROT_WRITE if fault_type == VM_PROT_READ if the
         * page can be safely written.  However, it will force a read-only
         * mapping for a read fault if the memory is managed by a virtual
         * page table.
         */
        result = vm_fault_object(&fs, first_pindex, fault_type);

        if (result == KERN_TRY_AGAIN)
                goto RetryFault;
        if (result != KERN_SUCCESS)
                return (result);

        /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
         * will contain a busied page.
         *
         * Enter the page into the pmap and do pmap-related adjustments.
         */
        pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);

        /*
         * Burst in a few more pages if possible.  The fs.map should still
         * be locked.
         */
        if (fault_flags & VM_FAULT_BURST) {
                if ((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 &&
                    fs.wired == 0) {
                        vm_prefault(fs.map->pmap, vaddr, fs.entry, fs.prot);
                }
        }
        unlock_things(&fs);

        vm_page_flag_clear(fs.m, PG_ZERO);
        vm_page_flag_set(fs.m, PG_REFERENCED);

        /*
         * If the page is not wired down, then put it where the pageout daemon
         * can find it.
         *
         * We do not really need to get vm_token here, but since all the
         * vm_*() calls have to, doing it once here improves efficiency.
         */
        lwkt_gettoken(&vm_token);
        if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
                if (fs.wired)
                        vm_page_wire(fs.m);
                else
                        vm_page_unwire(fs.m, 1);
        } else {
                vm_page_activate(fs.m);
        }

        if (curthread->td_lwp) {
                if (fs.hardfault) {
                        curthread->td_lwp->lwp_ru.ru_majflt++;
                } else {
                        curthread->td_lwp->lwp_ru.ru_minflt++;
                }
        }

        /*
         * Unlock everything, and return
         */
        vm_page_wakeup(fs.m);
        vm_object_deallocate(fs.first_object);
        lwkt_reltoken(&vm_token);

        return (KERN_SUCCESS);
}
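
/*
 * Illustrative (not actual) caller sketch: the machine-dependent trap
 * code typically resolves a user-mode page fault along these lines, with
 * the fault address and access type taken from the trap frame:
 *
 *	rv = vm_fault(&lp->lwp_vmspace->vm_map, trunc_page(fault_addr),
 *		      write_fault ? VM_PROT_WRITE : VM_PROT_READ,
 *		      VM_FAULT_NORMAL);
 *	if (rv != KERN_SUCCESS)
 *		(deliver SIGSEGV/SIGBUS to the faulting lwp)
 *
 * fault_addr and write_fault are placeholders; the exact trap-frame
 * fields are machine-dependent.
 */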

/*
 * Fault in the specified virtual address in the current process map,
 * returning a held VM page or NULL.  See vm_fault_page() for more
 * information.
 *
 * No requirements.
 */
vm_page_t
vm_fault_page_quick(vm_offset_t va, vm_prot_t fault_type, int *errorp)
{
        struct lwp *lp = curthread->td_lwp;
        vm_page_t m;

        m = vm_fault_page(&lp->lwp_vmspace->vm_map, va,
                          fault_type, VM_FAULT_NORMAL, errorp);
        return(m);
}

/*
 * Fault in the specified virtual address in the specified map, doing all
 * necessary manipulation of the object store and all necessary I/O.  Return
 * a held VM page or NULL, and set *errorp.  The related pmap is not
 * updated.
 *
 * The returned page will be properly dirtied if VM_PROT_WRITE was specified,
 * and marked PG_REFERENCED as well.
 *
 * If the page cannot be faulted writable and VM_PROT_WRITE was specified, an
 * error will be returned.
 *
 * No requirements.
 */
vm_page_t
vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
              int fault_flags, int *errorp)
{
        vm_pindex_t first_pindex;
        struct faultstate fs;
        int result;
        vm_prot_t orig_fault_type = fault_type;

        mycpu->gd_cnt.v_vm_faults++;

        fs.didlimit = 0;
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);

RetryFault:
        /*
         * Find the vm_map_entry representing the backing store and resolve
         * the top level object and page index.  This may have the side
         * effect of executing a copy-on-write on the map entry and/or
         * creating a shadow object, but will not COW any actual VM pages.
         *
         * On success fs.map is left read-locked and various other fields
         * are initialized but not otherwise referenced or locked.
         *
         * NOTE!  vm_map_lookup will upgrade the fault_type to VM_FAULT_WRITE
         * if the map entry is a virtual page table and also writable,
         * so we can set the 'A'ccessed bit in the virtual page table entry.
         */
        fs.map = map;
        result = vm_map_lookup(&fs.map, vaddr, fault_type,
                               &fs.entry, &fs.first_object,
                               &first_pindex, &fs.first_prot, &fs.wired);

        if (result != KERN_SUCCESS) {
                *errorp = result;
                return (NULL);
        }

        /*
         * fs.map is read-locked
         *
         * Misc checks.  Save the map generation number to detect races.
         */
        fs.map_generation = fs.map->timestamp;

        if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
                panic("vm_fault: fault on nofault entry, addr: %lx",
                    (u_long)vaddr);
        }

        /*
         * A system map entry may return a NULL object.  No object means
         * no pager means an unrecoverable kernel fault.
         */
        if (fs.first_object == NULL) {
                panic("vm_fault: unrecoverable fault at %p in entry %p",
                        (void *)vaddr, fs.entry);
        }

        /*
         * Make a reference to this object to prevent its disposal while we
         * are messing with it.  Once we have the reference, the map is free
         * to be diddled.  Since objects reference their shadows (and copies),
         * they will stay around as well.
         *
         * Bump the paging-in-progress count to prevent size changes (e.g.
         * truncation operations) during I/O.  This must be done after
         * obtaining the vnode lock in order to avoid possible deadlocks.
         *
         * The vm_token is needed to manipulate the vm_object.
         */
        lwkt_gettoken(&vm_token);
        vm_object_reference(fs.first_object);
        fs.vp = vnode_pager_lock(fs.first_object);
        vm_object_pip_add(fs.first_object, 1);
        lwkt_reltoken(&vm_token);

        fs.lookup_still_valid = TRUE;
        fs.first_m = NULL;
        fs.object = fs.first_object;    /* so unlock_and_deallocate works */

        /*
         * If the entry is wired we cannot change the page protection.
         */
        if (fs.wired)
                fault_type = fs.first_prot;

        /*
         * The page we want is at (first_object, first_pindex), but if the
         * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
         * page table to figure out the actual pindex.
         *
         * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
         * ONLY
         */
        if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
                result = vm_fault_vpagetable(&fs, &first_pindex,
                                             fs.entry->aux.master_pde,
                                             fault_type);
                if (result == KERN_TRY_AGAIN)
                        goto RetryFault;
                if (result != KERN_SUCCESS) {
                        *errorp = result;
                        return (NULL);
                }
        }

        /*
         * Now we have the actual (object, pindex), fault in the page.  If
         * vm_fault_object() fails it will unlock and deallocate the FS
         * data.   If it succeeds everything remains locked and fs->object
         * will have an additional PIP count if it is not equal to
         * fs->first_object.
         */
        result = vm_fault_object(&fs, first_pindex, fault_type);

        if (result == KERN_TRY_AGAIN)
                goto RetryFault;
        if (result != KERN_SUCCESS) {
                *errorp = result;
                return(NULL);
        }

        if ((orig_fault_type & VM_PROT_WRITE) &&
            (fs.prot & VM_PROT_WRITE) == 0) {
                *errorp = KERN_PROTECTION_FAILURE;
                unlock_and_deallocate(&fs);
                return(NULL);
        }

        /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
         * will contain a busied page.
         */
        unlock_things(&fs);

        /*
         * Return a held page.  We are not doing any pmap manipulation so do
         * not set PG_MAPPED.  However, adjust the page flags according to
         * the fault type because the caller may not use a managed pmapping
         * (so we don't want to lose the fact that the page will be dirtied
         * if a write fault was specified).
         */
        lwkt_gettoken(&vm_token);
        vm_page_hold(fs.m);
        vm_page_flag_clear(fs.m, PG_ZERO);
        if (fault_type & VM_PROT_WRITE)
                vm_page_dirty(fs.m);

        /*
         * Update the pmap.  We really only have to do this if a COW
         * occurred to replace the read-only page with the new page.  For
         * now just do it unconditionally. XXX
         */
        pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);
        vm_page_flag_set(fs.m, PG_REFERENCED);

        /*
         * Unbusy the page by activating it.  It remains held and will not
         * be reclaimed.
         */
        vm_page_activate(fs.m);

        if (curthread->td_lwp) {
                if (fs.hardfault) {
                        curthread->td_lwp->lwp_ru.ru_majflt++;
                } else {
                        curthread->td_lwp->lwp_ru.ru_minflt++;
                }
        }

        /*
         * Unlock everything, and return the held page.
         */
        vm_page_wakeup(fs.m);
        vm_object_deallocate(fs.first_object);
        lwkt_reltoken(&vm_token);

        *errorp = 0;
        return(fs.m);
}
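
/*
 * Callers of vm_fault_page*() receive the page with its hold count
 * bumped but not busied; when finished with it they are expected to drop
 * the hold (normally via vm_page_unhold()) so the page becomes
 * reclaimable again.
 */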

/*
 * Fault in the specified (object,offset), dirty the returned page as
 * needed.  If the requested fault_type cannot be satisfied, NULL is
 * returned and an error is set in *errorp.
 *
 * A held (but not busied) page is returned.
 *
 * No requirements.
 */
vm_page_t
vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
                     vm_prot_t fault_type, int fault_flags, int *errorp)
{
        int result;
        vm_pindex_t first_pindex;
        struct faultstate fs;
        struct vm_map_entry entry;

        bzero(&entry, sizeof(entry));
        entry.object.vm_object = object;
        entry.maptype = VM_MAPTYPE_NORMAL;
        entry.protection = entry.max_protection = fault_type;

        fs.didlimit = 0;
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        fs.map = NULL;
        KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);

RetryFault:

        fs.first_object = object;
        first_pindex = OFF_TO_IDX(offset);
        fs.entry = &entry;
        fs.first_prot = fault_type;
        fs.wired = 0;
        /*fs.map_generation = 0; unused */

        /*
         * Make a reference to this object to prevent its disposal while we
         * are messing with it.  Once we have the reference, the map is free
         * to be diddled.  Since objects reference their shadows (and copies),
         * they will stay around as well.
         *
         * Bump the paging-in-progress count to prevent size changes (e.g.
         * truncation operations) during I/O.  This must be done after
         * obtaining the vnode lock in order to avoid possible deadlocks.
         */
        lwkt_gettoken(&vm_token);
        vm_object_reference(fs.first_object);
        fs.vp = vnode_pager_lock(fs.first_object);
        vm_object_pip_add(fs.first_object, 1);
        lwkt_reltoken(&vm_token);

        fs.lookup_still_valid = TRUE;
        fs.first_m = NULL;
        fs.object = fs.first_object;    /* so unlock_and_deallocate works */

#if 0
        /* XXX future - ability to operate on VM object using vpagetable */
        if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
                result = vm_fault_vpagetable(&fs, &first_pindex,
                                             fs.entry->aux.master_pde,
                                             fault_type);
                if (result == KERN_TRY_AGAIN)
                        goto RetryFault;
                if (result != KERN_SUCCESS) {
                        *errorp = result;
                        return (NULL);
                }
        }
#endif

        /*
         * Now we have the actual (object, pindex), fault in the page.  If
         * vm_fault_object() fails it will unlock and deallocate the FS
         * data.   If it succeeds everything remains locked and fs->object
         * will have an additional PIP count if it is not equal to
         * fs->first_object.
         */
        result = vm_fault_object(&fs, first_pindex, fault_type);

        if (result == KERN_TRY_AGAIN)
                goto RetryFault;
        if (result != KERN_SUCCESS) {
                *errorp = result;
                return(NULL);
        }

        if ((fault_type & VM_PROT_WRITE) && (fs.prot & VM_PROT_WRITE) == 0) {
                *errorp = KERN_PROTECTION_FAILURE;
                unlock_and_deallocate(&fs);
                return(NULL);
        }

        /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
         * will contain a busied page.
         */
        unlock_things(&fs);

        /*
         * Return a held page.  We are not doing any pmap manipulation so do
         * not set PG_MAPPED.  However, adjust the page flags according to
         * the fault type because the caller may not use a managed pmapping
         * (so we don't want to lose the fact that the page will be dirtied
         * if a write fault was specified).
         */
        lwkt_gettoken(&vm_token);
        vm_page_hold(fs.m);
        vm_page_flag_clear(fs.m, PG_ZERO);
        if (fault_type & VM_PROT_WRITE)
                vm_page_dirty(fs.m);

        /*
         * Indicate that the page was accessed.
         */
        vm_page_flag_set(fs.m, PG_REFERENCED);

        /*
         * Unbusy the page by activating it.  It remains held and will not
         * be reclaimed.
         */
        vm_page_activate(fs.m);

        if (curthread->td_lwp) {
                if (fs.hardfault) {
                        mycpu->gd_cnt.v_vm_faults++;
                        curthread->td_lwp->lwp_ru.ru_majflt++;
                } else {
                        curthread->td_lwp->lwp_ru.ru_minflt++;
                }
        }

        /*
         * Unlock everything, and return the held page.
         */
        vm_page_wakeup(fs.m);
        vm_object_deallocate(fs.first_object);
        lwkt_reltoken(&vm_token);

        *errorp = 0;
        return(fs.m);
}

/*
 * Translate the virtual page number (first_pindex) that is relative
 * to the address space into a logical page number that is relative to the
 * backing object.  Use the virtual page table pointed to by (vpte).
 *
 * This implements an N-level page table.  Any level can terminate the
 * scan by setting VPTE_PS.   A linear mapping is accomplished by setting
 * VPTE_PS in the master page directory entry set via mcontrol(MADV_SETMAP).
 *
 * No requirements (vm_token need not be held).
 */
static
int
vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                    vpte_t vpte, int fault_type)
{
        struct lwbuf *lwb;
        int vshift = VPTE_FRAME_END - PAGE_SHIFT; /* index bits remaining */
        int result = KERN_SUCCESS;
        vpte_t *ptep;

        for (;;) {
                /*
                 * We cannot proceed if the vpte is not valid, not readable
                 * for a read fault, or not writable for a write fault.
                 */
                if ((vpte & VPTE_V) == 0) {
                        unlock_and_deallocate(fs);
                        return (KERN_FAILURE);
                }
                if ((fault_type & VM_PROT_READ) && (vpte & VPTE_R) == 0) {
                        unlock_and_deallocate(fs);
                        return (KERN_FAILURE);
                }
                if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_W) == 0) {
                        unlock_and_deallocate(fs);
                        return (KERN_FAILURE);
                }
                if ((vpte & VPTE_PS) || vshift == 0)
                        break;
                KKASSERT(vshift >= VPTE_PAGE_BITS);

                /*
                 * Get the page table page.  Nominally we only read the page
                 * table, but since we are actively setting VPTE_M and VPTE_A,
                 * tell vm_fault_object() that we are writing it.
                 *
                 * There is currently no real need to optimize this.
                 */
                result = vm_fault_object(fs, (vpte & VPTE_FRAME) >> PAGE_SHIFT,
                                         VM_PROT_READ|VM_PROT_WRITE);
                if (result != KERN_SUCCESS)
                        return (result);

                /*
                 * Process the returned fs.m and look up the page table
                 * entry in the page table page.
                 */
                vshift -= VPTE_PAGE_BITS;
                lwb = lwbuf_alloc(fs->m);
                ptep = ((vpte_t *)lwbuf_kva(lwb) +
                        ((*pindex >> vshift) & VPTE_PAGE_MASK));
                vpte = *ptep;

                /*
                 * Page table write-back.  If the vpte is valid for the
                 * requested operation, do a write-back to the page table.
                 *
                 * XXX VPTE_M is not set properly for page directory pages.
                 * It doesn't get set in the page directory if the page table
                 * is modified during a read access.
                 */
                if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) &&
                    (vpte & VPTE_W)) {
                        if ((vpte & (VPTE_M|VPTE_A)) != (VPTE_M|VPTE_A)) {
                                atomic_set_long(ptep, VPTE_M | VPTE_A);
                                vm_page_dirty(fs->m);
                        }
                }
                if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V) &&
                    (vpte & VPTE_R)) {
                        if ((vpte & VPTE_A) == 0) {
                                atomic_set_long(ptep, VPTE_A);
                                vm_page_dirty(fs->m);
                        }
                }
                lwbuf_free(lwb);
                vm_page_flag_set(fs->m, PG_REFERENCED);
                vm_page_activate(fs->m);
                vm_page_wakeup(fs->m);
                cleanup_successful_fault(fs);
        }
        /*
         * Combine remaining address bits with the vpte.
         */
        /* JG how many bits from each? */
        *pindex = ((vpte & VPTE_FRAME) >> PAGE_SHIFT) +
                  (*pindex & ((1L << vshift) - 1));
        return (KERN_SUCCESS);
}
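
/*
 * Illustrative walk-through of the translation loop above, assuming
 * 64-bit vptes in 4K page-table pages (VPTE_PAGE_BITS == 9, i.e. 512
 * entries per page) and 18 index bits remaining:
 *
 *	iteration 1: vshift 18 -> 9, vpte = table[(*pindex >> 9) & 0x1ff]
 *	iteration 2: vshift  9 -> 0, vpte = table[*pindex & 0x1ff]
 *
 * after which the loop breaks and *pindex becomes exactly the frame page
 * number of the final vpte.  If a level instead terminates early with
 * VPTE_PS at vshift == 9, the final computation maps a linear 512-page
 * run: *pindex = (vpte frame pfn) + (*pindex & 0x1ff).
 */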

/*
 * This is the core of the vm_fault code.
 *
 * Do all operations required to fault-in (fs.first_object, pindex).  Run
 * through the shadow chain as necessary and do required COW or virtual
 * copy operations.  The caller has already fully resolved the vm_map_entry
 * and, if appropriate, has created a copy-on-write layer.  All we need to
 * do is iterate the object chain.
 *
 * On failure (fs) is unlocked and deallocated and the caller may return or
 * retry depending on the failure code.  On success (fs) is NOT unlocked or
 * deallocated, fs.m will contain a resolved, busied page, and fs.object
 * will have an additional PIP count if it is not equal to fs.first_object.
 *
 * No requirements.
 */
static
int
vm_fault_object(struct faultstate *fs,
                vm_pindex_t first_pindex, vm_prot_t fault_type)
{
        vm_object_t next_object;
        vm_pindex_t pindex;

        fs->prot = fs->first_prot;
        fs->object = fs->first_object;
        pindex = first_pindex;

        /*
         * If a read fault occurs we try to make the page writable if
         * possible.  There are three cases where we cannot make the
         * page mapping writable:
         *
         * (1) The mapping is read-only or the VM object is read-only,
         *     fs->prot above will simply not have VM_PROT_WRITE set.
         *
         * (2) If the mapping is a virtual page table we need to be able
         *     to detect writes so we can set VPTE_M in the virtual page
         *     table.
         *
         * (3) If the VM page is read-only or copy-on-write, upgrading would
         *     just result in an unnecessary COW fault.
         *
         * VM_PROT_VPAGED is set if faulting via a virtual page table and
         * causes adjustments to the 'M'odify bit to also turn off write
         * access to force a re-fault.
         */
        if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) {
                if ((fault_type & VM_PROT_WRITE) == 0)
                        fs->prot &= ~VM_PROT_WRITE;
        }

        lwkt_gettoken(&vm_token);

        for (;;) {
                /*
                 * If the object is dead, we stop here
                 */
                if (fs->object->flags & OBJ_DEAD) {
                        unlock_and_deallocate(fs);
                        lwkt_reltoken(&vm_token);
                        return (KERN_PROTECTION_FAILURE);
                }

                /*
                 * See if the page is resident.  spl protection is required
                 * to avoid an interrupt unbusy/free race against our
                 * lookup.  We must hold the protection through a page
                 * allocation or busy.
                 */
                crit_enter();
                fs->m = vm_page_lookup(fs->object, pindex);
                if (fs->m != NULL) {
                        int queue;
                        /*
                         * Wait/Retry if the page is busy.  We have to do this
                         * if the page is busy via either PG_BUSY or
                         * vm_page_t->busy because the vm_pager may be using
                         * vm_page_t->busy for pageouts ( and even pageins if
                         * it is the vnode pager ), and we could end up trying
                         * to pagein and pageout the same page simultaneously.
                         *
                         * We can theoretically allow the busy case on a read
                         * fault if the page is marked valid, but since such
                         * pages are typically already pmap'd, putting that
                         * special case in might be more effort than it is
                         * worth.  We cannot under any circumstances mess
                         * around with a vm_page_t->busy page except, perhaps,
                         * to pmap it.
                         */
                        if ((fs->m->flags & PG_BUSY) || fs->m->busy) {
                                unlock_things(fs);
                                vm_page_sleep_busy(fs->m, TRUE, "vmpfw");
                                mycpu->gd_cnt.v_intrans++;
                                vm_object_deallocate(fs->first_object);
                                fs->first_object = NULL;
                                lwkt_reltoken(&vm_token);
                                crit_exit();
                                return (KERN_TRY_AGAIN);
                        }

                        /*
                         * If reactivating a page from PQ_CACHE we may have
                         * to rate-limit.
                         */
                        queue = fs->m->queue;
                        vm_page_unqueue_nowakeup(fs->m);

                        if ((queue - fs->m->pc) == PQ_CACHE &&
                            vm_page_count_severe()) {
                                vm_page_activate(fs->m);
                                unlock_and_deallocate(fs);
                                vm_waitpfault();
                                lwkt_reltoken(&vm_token);
                                crit_exit();
                                return (KERN_TRY_AGAIN);
                        }

                        /*
                         * Mark the page busy for other processes, and the
                         * pagedaemon.  If it still isn't completely valid
                         * (readable), or if a read-ahead-mark is set on
                         * the VM page, jump to readrest, else we found the
                         * page and can return.
                         *
                         * We can release the spl once we have marked the
                         * page busy.
                         */
                        vm_page_busy(fs->m);
                        crit_exit();

                        if (fs->m->object != &kernel_object) {
                                if ((fs->m->valid & VM_PAGE_BITS_ALL) !=
                                    VM_PAGE_BITS_ALL) {
                                        goto readrest;
                                }
                                if (fs->m->flags & PG_RAM) {
                                        if (debug_cluster)
                                                kprintf("R");
                                        vm_page_flag_clear(fs->m, PG_RAM);
                                        goto readrest;
                                }
                        }
                        break; /* break to PAGE HAS BEEN FOUND */
                }

                /*
                 * The page is not resident.  If this is the search
                 * termination or the pager might contain the page,
                 * allocate a new page.
                 *
                 * NOTE: We are still in a critical section.
                 */
                if (TRYPAGER(fs) || fs->object == fs->first_object) {
                        /*
                         * If the page is beyond the object size we fail
                         */
                        if (pindex >= fs->object->size) {
                                lwkt_reltoken(&vm_token);
                                crit_exit();
                                unlock_and_deallocate(fs);
                                return (KERN_PROTECTION_FAILURE);
                        }

                        /*
                         * Ratelimit.
                         */
                        if (fs->didlimit == 0 && curproc != NULL) {
                                int limticks;

                                limticks = vm_fault_ratelimit(curproc->p_vmspace);
                                if (limticks) {
                                        lwkt_reltoken(&vm_token);
                                        crit_exit();
                                        unlock_and_deallocate(fs);
                                        tsleep(curproc, 0, "vmrate", limticks);
                                        fs->didlimit = 1;
                                        return (KERN_TRY_AGAIN);
                                }
                        }

                        /*
                         * Allocate a new page for this object/offset pair.
                         */
                        fs->m = NULL;
                        if (!vm_page_count_severe()) {
                                fs->m = vm_page_alloc(fs->object, pindex,
                                    (fs->vp || fs->object->backing_object) ?
                                        VM_ALLOC_NORMAL :
                                        VM_ALLOC_NORMAL | VM_ALLOC_ZERO);
                        }
                        if (fs->m == NULL) {
                                lwkt_reltoken(&vm_token);
                                crit_exit();
                                unlock_and_deallocate(fs);
                                vm_waitpfault();
                                return (KERN_TRY_AGAIN);
                        }
                }
                crit_exit();

readrest:
                /*
                 * We have found an invalid or partially valid page, a
                 * page with a read-ahead mark which might be partially or
                 * fully valid (and maybe dirty too), or we have allocated
                 * a new page.
                 *
                 * Attempt to fault-in the page if there is a chance that the
                 * pager has it, and potentially fault in additional pages
                 * at the same time.
                 *
                 * We are NOT in splvm here and if TRYPAGER is true then
                 * fs.m will be non-NULL and will be PG_BUSY for us.
                 */
                if (TRYPAGER(fs)) {
                        int rv;
                        int seqaccess;
                        u_char behavior = vm_map_entry_behavior(fs->entry);

                        if (behavior == MAP_ENTRY_BEHAV_RANDOM)
                                seqaccess = 0;
                        else
                                seqaccess = -1;

                        /*
                         * If sequential access is detected then attempt
                         * to deactivate/cache pages behind the scan to
                         * prevent resource hogging.
                         *
                         * Use of PG_RAM to detect sequential access
                         * also simulates multi-zone sequential access
                         * detection for free.
                         *
                         * NOTE: Partially valid dirty pages cannot be
                         *	 deactivated without causing NFS piecemeal
                         *	 writes to barf.
                         */
                        if ((fs->first_object->type != OBJT_DEVICE) &&
                            (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL ||
                                (behavior != MAP_ENTRY_BEHAV_RANDOM &&
                                 (fs->m->flags & PG_RAM)))
                        ) {
                                vm_pindex_t scan_pindex;
                                int scan_count = 16;

                                if (first_pindex < 16) {
                                        scan_pindex = 0;
                                        scan_count = 0;
                                } else {
                                        scan_pindex = first_pindex - 16;
                                        if (scan_pindex < 16)
                                                scan_count = scan_pindex;
                                        else
                                                scan_count = 16;
                                }

                                crit_enter();
                                while (scan_count) {
                                        vm_page_t mt;

                                        mt = vm_page_lookup(fs->first_object,
                                                            scan_pindex);
                                        if (mt == NULL ||
                                            (mt->valid != VM_PAGE_BITS_ALL)) {
                                                break;
                                        }
                                        if (mt->busy ||
                                            (mt->flags & (PG_BUSY |
                                                          PG_FICTITIOUS |
                                                          PG_UNMANAGED)) ||
                                            mt->hold_count ||
                                            mt->wire_count)  {
                                                goto skip;
                                        }
                                        if (mt->dirty == 0)
                                                vm_page_test_dirty(mt);
                                        if (mt->dirty) {
                                                vm_page_busy(mt);
                                                vm_page_protect(mt,
                                                                VM_PROT_NONE);
                                                vm_page_deactivate(mt);
                                                vm_page_wakeup(mt);
                                        } else {
                                                vm_page_cache(mt);
                                        }
skip:
                                        --scan_count;
                                        --scan_pindex;
                                }
                                crit_exit();

                                seqaccess = 1;
                        }

                        /*
                         * Avoid deadlocking against the map when doing I/O.
                         * fs.object and the page is PG_BUSY'd.
                         */
                        unlock_map(fs);

                        /*
                         * Acquire the page data.  We still hold a ref on
                         * fs.object and the page has been PG_BUSY'd.
                         *
                         * The pager may replace the page (for example, in
                         * order to enter a fictitious page into the
                         * object).  If it does so it is responsible for
                         * cleaning up the passed page and properly setting
                         * the new page PG_BUSY.
                         *
                         * If we got here through a PG_RAM read-ahead
                         * mark the page may be partially dirty and thus
                         * not freeable.  Don't bother checking to see
                         * if the pager has the page because we can't free
                         * it anyway.  We have to depend on the get_page
                         * operation filling in any gaps whether there is
                         * backing store or not.
                         */
                        rv = vm_pager_get_page(fs->object, &fs->m, seqaccess);

                        if (rv == VM_PAGER_OK) {
                                /*
                                 * Relookup in case pager changed page. Pager
                                 * is responsible for disposition of old page
                                 * if moved.
                                 *
                                 * XXX other code segments do relookups too.
                                 * It's a bad abstraction that needs to be
                                 * fixed/removed.
                                 */
                                fs->m = vm_page_lookup(fs->object, pindex);
                                if (fs->m == NULL) {
                                        lwkt_reltoken(&vm_token);
                                        unlock_and_deallocate(fs);
                                        return (KERN_TRY_AGAIN);
                                }

                                ++fs->hardfault;
                                break; /* break to PAGE HAS BEEN FOUND */
                        }

                        /*
                         * Remove the bogus page (which does not exist at this
                         * object/offset); before doing so, we must get back
                         * our object lock to preserve our invariant.
                         *
                         * Also wake up any other process that may want to bring
                         * in this page.
                         *
                         * If this is the top-level object, we must leave the
                         * busy page to prevent another process from rushing
                         * past us, and inserting the page in that object at
                         * the same time that we are.
                         */
                        if (rv == VM_PAGER_ERROR) {
                                if (curproc) {
                                        kprintf("vm_fault: pager read error, "
                                                "pid %d (%s)\n",
                                                curproc->p_pid,
                                                curproc->p_comm);
                                } else {
                                        kprintf("vm_fault: pager read error, "
                                                "thread %p (%s)\n",
                                                curthread,
                                                curthread->td_comm);
                                }
                        }

                        /*
                         * Data outside the range of the pager or an I/O error
                         *
                         * The page may have been wired during the pagein,
                         * e.g. by the buffer cache, and cannot simply be
                         * freed.  Call vnode_pager_freepage() to deal with it.
                         */
                        /*
                         * XXX - the check for kernel_map is a kludge to work
                         * around having the machine panic on a kernel space
                         * fault w/ I/O error.
                         */
                        if (((fs->map != &kernel_map) &&
                            (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) {
                                vnode_pager_freepage(fs->m);
                                lwkt_reltoken(&vm_token);
                                fs->m = NULL;
                                unlock_and_deallocate(fs);
                                if (rv == VM_PAGER_ERROR)
                                        return (KERN_FAILURE);
                                else
                                        return (KERN_PROTECTION_FAILURE);
                                /* NOT REACHED */
                        }
                        if (fs->object != fs->first_object) {
                                vnode_pager_freepage(fs->m);
                                fs->m = NULL;
                                /*
                                 * XXX - we cannot just fall out at this
                                 * point, m has been freed and is invalid!
                                 */
                        }
                }

                /*
                 * We get here if the object has a default pager (or unwiring)
                 * or the pager doesn't have the page.
                 */
                if (fs->object == fs->first_object)
                        fs->first_m = fs->m;

                /*
                 * Move on to the next object.  Lock the next object before
                 * unlocking the current one.
                 */
                pindex += OFF_TO_IDX(fs->object->backing_object_offset);
                next_object = fs->object->backing_object;
                if (next_object == NULL) {
                        /*
                         * If there's no object left, fill the page in the top
                         * object with zeros.
                         */
                        if (fs->object != fs->first_object) {
                                vm_object_pip_wakeup(fs->object);

                                fs->object = fs->first_object;
                                pindex = first_pindex;
                                fs->m = fs->first_m;
                        }
                        fs->first_m = NULL;

                        /*
                         * Zero the page if necessary and mark it valid.
                         */
                        if ((fs->m->flags & PG_ZERO) == 0) {
                                vm_page_zero_fill(fs->m);
                        } else {
                                mycpu->gd_cnt.v_ozfod++;
                        }
                        mycpu->gd_cnt.v_zfod++;
                        fs->m->valid = VM_PAGE_BITS_ALL;
                        break;  /* break to PAGE HAS BEEN FOUND */
                }
                if (fs->object != fs->first_object) {
                        vm_object_pip_wakeup(fs->object);
                }
                KASSERT(fs->object != next_object,
                        ("object loop %p", next_object));
                fs->object = next_object;
                vm_object_pip_add(fs->object, 1);
        }

        /*
         * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
         * is held.]
         *
         * vm_token is still held
         *
         * If the page is being written, but isn't already owned by the
         * top-level object, we have to copy it into a new page owned by the
         * top-level object.
         */
        KASSERT((fs->m->flags & PG_BUSY) != 0,
                ("vm_fault: not busy after main loop"));

        if (fs->object != fs->first_object) {
                /*
                 * We only really need to copy if we want to write it.
                 */
                if (fault_type & VM_PROT_WRITE) {
                        /*
                         * This allows pages to be virtually copied from a
                         * backing_object into the first_object, where the
                         * backing object has no other refs to it, and cannot
                         * gain any more refs.  Instead of a bcopy, we just
                         * move the page from the backing object to the
                         * first object.  Note that we must mark the page
                         * dirty in the first object so that it will go out
                         * to swap when needed.
                         */
                        if (
                                /*
                                 * Map, if present, has not changed
                                 */
                                (fs->map == NULL ||
                                fs->map_generation == fs->map->timestamp) &&
                                /*
                                 * Only one shadow object
                                 */
                                (fs->object->shadow_count == 1) &&
                                /*
                                 * No COW refs, except us
                                 */
                                (fs->object->ref_count == 1) &&
                                /*
                                 * No one else can look this object up
                                 */
                                (fs->object->handle == NULL) &&
                                /*
                                 * No other ways to look the object up
                                 */
                                ((fs->object->type == OBJT_DEFAULT) ||
                                 (fs->object->type == OBJT_SWAP)) &&
                                /*
                                 * We don't chase down the shadow chain
                                 */
                                (fs->object == fs->first_object->backing_object) &&

                                /*
                                 * grab the lock if we need to
                                 */
                                (fs->lookup_still_valid ||
                                 fs->map == NULL ||
                                 lockmgr(&fs->map->lock, LK_EXCLUSIVE|LK_NOWAIT) == 0)
                            ) {

                                fs->lookup_still_valid = 1;
                                /*
                                 * get rid of the unnecessary page
                                 */
                                vm_page_protect(fs->first_m, VM_PROT_NONE);
                                vm_page_free(fs->first_m);
                                fs->first_m = NULL;

                                /*
                                 * grab the page and put it into the
                                 * process's object.  The page is
                                 * automatically made dirty.
                                 */
                                vm_page_rename(fs->m, fs->first_object, first_pindex);
                                fs->first_m = fs->m;
                                vm_page_busy(fs->first_m);
                                fs->m = NULL;
                                mycpu->gd_cnt.v_cow_optim++;
                        } else {
                                /*
                                 * Oh, well, let's copy it.
                                 */
                                vm_page_copy(fs->m, fs->first_m);
                                vm_page_event(fs->m, VMEVENT_COW);
                        }

                        if (fs->m) {
                                /*
                                 * We no longer need the old page or object.
                                 */
                                release_page(fs);
                        }

                        /*
                         * fs->object != fs->first_object due to above
                         * conditional
                         */
                        vm_object_pip_wakeup(fs->object);

                        /*
                         * Only use the new page below...
                         */

                        mycpu->gd_cnt.v_cow_faults++;
                        fs->m = fs->first_m;
                        fs->object = fs->first_object;
                        pindex = first_pindex;
                } else {
                        /*
                         * If it wasn't a write fault avoid having to copy
                         * the page by mapping it read-only.
                         */
                        fs->prot &= ~VM_PROT_WRITE;
                }
        }

        /*
         * We may have had to unlock a map to do I/O.  If we did then
         * lookup_still_valid will be FALSE.  If the map generation count
         * also changed then all sorts of things could have happened while
         * we were doing the I/O and we need to retry.
         */

        if (!fs->lookup_still_valid &&
            fs->map != NULL &&
            (fs->map->timestamp != fs->map_generation)) {
                release_page(fs);
                lwkt_reltoken(&vm_token);
                unlock_and_deallocate(fs);
                return (KERN_TRY_AGAIN);
        }

        /*
         * If the fault is a write, we know that this page is being
         * written NOW so dirty it explicitly to save on pmap_is_modified()
         * calls later.
         *
         * If this is a NOSYNC mmap we do not want to set PG_NOSYNC
         * if the page is already dirty to prevent data written with
         * the expectation of being synced from not being synced.
         * Likewise if this entry does not request NOSYNC then make
         * sure the page isn't marked NOSYNC.  Applications sharing
         * data should use the same flags to avoid ping ponging.
         *
         * Also tell the backing pager, if any, that it should remove
         * any swap backing since the page is now dirty.
         */
        if (fs->prot & VM_PROT_WRITE) {
                vm_object_set_writeable_dirty(fs->m->object);
                if (fs->entry->eflags & MAP_ENTRY_NOSYNC) {
                        if (fs->m->dirty == 0)
                                vm_page_flag_set(fs->m, PG_NOSYNC);
                } else {
                        vm_page_flag_clear(fs->m, PG_NOSYNC);
                }
                if (fs->fault_flags & VM_FAULT_DIRTY) {
                        crit_enter();
                        vm_page_dirty(fs->m);
                        swap_pager_unswapped(fs->m);
                        crit_exit();
                }
        }

        lwkt_reltoken(&vm_token);

        /*
         * Page had better still be busy.  We are still locked up and
         * fs->object will have another PIP reference if it is not equal
         * to fs->first_object.
         */
        KASSERT(fs->m->flags & PG_BUSY,
                ("vm_fault: page %p not busy!", fs->m));

        /*
         * Sanity check: page must be completely valid or it is not fit to
         * map into user space.  vm_pager_get_pages() ensures this.
         */
        if (fs->m->valid != VM_PAGE_BITS_ALL) {
                vm_page_zero_invalid(fs->m, TRUE);
                kprintf("Warning: page %p partially invalid on fault\n", fs->m);
        }

        return (KERN_SUCCESS);
}
| 1552 | |
| 1553 | /* |
| 1554 | * Wire down a range of virtual addresses in a map. The entry in question |
| 1555 | * should be marked in-transition and the map must be locked. We must |
| 1556 | * release the map temporarily while faulting-in the page to avoid a |
| 1557 | * deadlock. Note that the entry may be clipped while we are blocked but |
| 1558 | * will never be freed. |
| 1559 | * |
| 1560 | * No requirements. |
| 1561 | */ |
| 1562 | int |
| 1563 | vm_fault_wire(vm_map_t map, vm_map_entry_t entry, boolean_t user_wire) |
| 1564 | { |
| 1565 | boolean_t fictitious; |
| 1566 | vm_offset_t start; |
| 1567 | vm_offset_t end; |
| 1568 | vm_offset_t va; |
| 1569 | vm_paddr_t pa; |
| 1570 | pmap_t pmap; |
| 1571 | int rv; |
| 1572 | |
| 1573 | pmap = vm_map_pmap(map); |
| 1574 | start = entry->start; |
| 1575 | end = entry->end; |
| 1576 | fictitious = entry->object.vm_object && |
| 1577 | (entry->object.vm_object->type == OBJT_DEVICE); |
| 1578 | |
| 1579 | lwkt_gettoken(&vm_token); |
| 1580 | vm_map_unlock(map); |
| 1581 | map->timestamp++; |
| 1582 | |
| 1583 | /* |
| 1584 | * We simulate a fault to get the page and enter it in the physical |
| 1585 | * map. |
| 1586 | */ |
| 1587 | for (va = start; va < end; va += PAGE_SIZE) { |
| 1588 | if (user_wire) { |
| 1589 | rv = vm_fault(map, va, VM_PROT_READ, |
| 1590 | VM_FAULT_USER_WIRE); |
| 1591 | } else { |
| 1592 | rv = vm_fault(map, va, VM_PROT_READ|VM_PROT_WRITE, |
| 1593 | VM_FAULT_CHANGE_WIRING); |
| 1594 | } |
| 1595 | if (rv) { |
| 1596 | while (va > start) { |
| 1597 | va -= PAGE_SIZE; |
| 1598 | if ((pa = pmap_extract(pmap, va)) == 0) |
| 1599 | continue; |
| 1600 | pmap_change_wiring(pmap, va, FALSE); |
| 1601 | if (!fictitious) |
| 1602 | vm_page_unwire(PHYS_TO_VM_PAGE(pa), 1); |
| 1603 | } |
| 1604 | vm_map_lock(map); |
| 1605 | lwkt_reltoken(&vm_token); |
| 1606 | return (rv); |
| 1607 | } |
| 1608 | } |
| 1609 | vm_map_lock(map); |
| 1610 | lwkt_reltoken(&vm_token); |
| 1611 | return (KERN_SUCCESS); |
| 1612 | } |
| 1613 | |
| 1614 | /* |
| 1615 | * Unwire a range of virtual addresses in a map. The map should be |
| 1616 | * locked. |
| 1617 | */ |
| 1618 | void |
| 1619 | vm_fault_unwire(vm_map_t map, vm_map_entry_t entry) |
| 1620 | { |
| 1621 | boolean_t fictitious; |
| 1622 | vm_offset_t start; |
| 1623 | vm_offset_t end; |
| 1624 | vm_offset_t va; |
| 1625 | vm_paddr_t pa; |
| 1626 | pmap_t pmap; |
| 1627 | |
| 1628 | pmap = vm_map_pmap(map); |
| 1629 | start = entry->start; |
| 1630 | end = entry->end; |
| 1631 | fictitious = entry->object.vm_object && |
| 1632 | (entry->object.vm_object->type == OBJT_DEVICE); |
| 1633 | |
| 1634 | /* |
| 1635 | * Since the pages are wired down, we must be able to get their |
| 1636 | * mappings from the physical map system. |
| 1637 | */ |
| 1638 | lwkt_gettoken(&vm_token); |
| 1639 | for (va = start; va < end; va += PAGE_SIZE) { |
| 1640 | pa = pmap_extract(pmap, va); |
| 1641 | if (pa != 0) { |
| 1642 | pmap_change_wiring(pmap, va, FALSE); |
| 1643 | if (!fictitious) |
| 1644 | vm_page_unwire(PHYS_TO_VM_PAGE(pa), 1); |
| 1645 | } |
| 1646 | } |
| 1647 | lwkt_reltoken(&vm_token); |
| 1648 | } |
| 1649 | |
| 1650 | /* |
| 1651 | * Reduce the rate at which memory is allocated to a process based |
| 1652 | * on the perceived load on the VM system. As the load increases |
| 1653 | * the allocation burst rate goes down and the delay increases. |
| 1654 | * |
| 1655 | * Rate limiting does not apply when faulting active or inactive |
| 1656 | * pages. When faulting 'cache' pages, rate limiting only applies |
| 1657 | * if the system currently has a severe page deficit. |
| 1658 | * |
| 1659 | * XXX vm_pagesupply should be increased when a page is freed. |
| 1660 | * |
| 1661 | * We sleep up to 1/10 of a second. |
| 1662 | */ |
| 1663 | static int |
| 1664 | vm_fault_ratelimit(struct vmspace *vmspace) |
| 1665 | { |
| 1666 | if (vm_load_enable == 0) |
| 1667 | return(0); |
| 1668 | if (vmspace->vm_pagesupply > 0) { |
| 1669 | --vmspace->vm_pagesupply; /* SMP race ok */ |
| 1670 | return(0); |
| 1671 | } |
| 1672 | #ifdef INVARIANTS |
| 1673 | if (vm_load_debug) { |
| 1674 | kprintf("load %-4d give %d pgs, wait %d, pid %-5d (%s)\n", |
| 1675 | vm_load, |
		    (1000 - vm_load) / 10, vm_load * hz / 10000,
| 1677 | curproc->p_pid, curproc->p_comm); |
| 1678 | } |
| 1679 | #endif |
| 1680 | vmspace->vm_pagesupply = (1000 - vm_load) / 10; |
| 1681 | return(vm_load * hz / 10000); |
| 1682 | } |
| 1683 | |
| 1684 | /* |
| 1685 | * Copy all of the pages from a wired-down map entry to another. |
| 1686 | * |
| 1687 | * The source and destination maps must be locked for write. |
| 1688 | * The source map entry must be wired down (or be a sharing map |
| 1689 | * entry corresponding to a main map entry that is wired down). |
| 1690 | * |
| 1691 | * No other requirements. |
| 1692 | */ |
| 1693 | void |
| 1694 | vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, |
| 1695 | vm_map_entry_t dst_entry, vm_map_entry_t src_entry) |
| 1696 | { |
| 1697 | vm_object_t dst_object; |
| 1698 | vm_object_t src_object; |
| 1699 | vm_ooffset_t dst_offset; |
| 1700 | vm_ooffset_t src_offset; |
| 1701 | vm_prot_t prot; |
| 1702 | vm_offset_t vaddr; |
| 1703 | vm_page_t dst_m; |
| 1704 | vm_page_t src_m; |
| 1705 | |
| 1706 | #ifdef lint |
| 1707 | src_map++; |
| 1708 | #endif /* lint */ |
| 1709 | |
| 1710 | src_object = src_entry->object.vm_object; |
| 1711 | src_offset = src_entry->offset; |
| 1712 | |
| 1713 | /* |
| 1714 | * Create the top-level object for the destination entry. (Doesn't |
| 1715 | * actually shadow anything - we copy the pages directly.) |
| 1716 | */ |
| 1717 | vm_map_entry_allocate_object(dst_entry); |
| 1718 | dst_object = dst_entry->object.vm_object; |
| 1719 | |
| 1720 | prot = dst_entry->max_protection; |
| 1721 | |
| 1722 | /* |
| 1723 | * Loop through all of the pages in the entry's range, copying each |
| 1724 | * one from the source object (it should be there) to the destination |
| 1725 | * object. |
| 1726 | */ |
| 1727 | for (vaddr = dst_entry->start, dst_offset = 0; |
| 1728 | vaddr < dst_entry->end; |
| 1729 | vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { |
| 1730 | |
| 1731 | /* |
| 1732 | * Allocate a page in the destination object |
| 1733 | */ |
| 1734 | do { |
| 1735 | dst_m = vm_page_alloc(dst_object, |
| 1736 | OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL); |
| 1737 | if (dst_m == NULL) { |
| 1738 | vm_wait(0); |
| 1739 | } |
| 1740 | } while (dst_m == NULL); |
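
		/*
		 * (The do/while above is the canonical must-succeed
		 * allocation pattern: vm_wait(0) sleeps until the
		 * pageout daemon makes free pages available, then the
		 * allocation is retried.)
		 */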
| 1741 | |
| 1742 | /* |
| 1743 | * Find the page in the source object, and copy it in. |
| 1744 | * (Because the source is wired down, the page will be in |
| 1745 | * memory.) |
| 1746 | */ |
| 1747 | src_m = vm_page_lookup(src_object, |
| 1748 | OFF_TO_IDX(dst_offset + src_offset)); |
| 1749 | if (src_m == NULL) |
| 1750 | panic("vm_fault_copy_wired: page missing"); |
| 1751 | |
| 1752 | vm_page_copy(src_m, dst_m); |
| 1753 | vm_page_event(src_m, VMEVENT_COW); |
| 1754 | |
| 1755 | /* |
| 1756 | * Enter it in the pmap... |
| 1757 | */ |
| 1758 | |
| 1759 | vm_page_flag_clear(dst_m, PG_ZERO); |
| 1760 | pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE); |
| 1761 | |
| 1762 | /* |
| 1763 | * Mark it no longer busy, and put it on the active list. |
| 1764 | */ |
| 1765 | vm_page_activate(dst_m); |
| 1766 | vm_page_wakeup(dst_m); |
| 1767 | } |
| 1768 | } |
| 1769 | |
| 1770 | #if 0 |
| 1771 | |
| 1772 | /* |
 * This routine checks around the requested page for other pages that
 * might be faulted in as well.  It brackets the requested page with
 * the viable read-behind and read-ahead pages and returns the set to
 * be paged in.
| 1776 | * |
| 1777 | * Inputs: |
| 1778 | * m, rbehind, rahead |
| 1779 | * |
| 1780 | * Outputs: |
| 1781 | * marray (array of vm_page_t), reqpage (index of requested page) |
| 1782 | * |
| 1783 | * Return value: |
| 1784 | * number of pages in marray |
| 1785 | */ |
| 1786 | static int |
| 1787 | vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead, |
| 1788 | vm_page_t *marray, int *reqpage) |
| 1789 | { |
| 1790 | int i,j; |
| 1791 | vm_object_t object; |
| 1792 | vm_pindex_t pindex, startpindex, endpindex, tpindex; |
| 1793 | vm_page_t rtm; |
| 1794 | int cbehind, cahead; |
| 1795 | |
| 1796 | object = m->object; |
| 1797 | pindex = m->pindex; |
| 1798 | |
| 1799 | /* |
	 * We do not fault-ahead for the device pager.
| 1801 | */ |
| 1802 | if (object->type == OBJT_DEVICE) { |
| 1803 | *reqpage = 0; |
| 1804 | marray[0] = m; |
| 1805 | return 1; |
| 1806 | } |
| 1807 | |
| 1808 | /* |
| 1809 | * if the requested page is not available, then give up now |
| 1810 | */ |
| 1811 | if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { |
| 1812 | *reqpage = 0; /* not used by caller, fix compiler warn */ |
| 1813 | return 0; |
| 1814 | } |
| 1815 | |
| 1816 | if ((cbehind == 0) && (cahead == 0)) { |
| 1817 | *reqpage = 0; |
| 1818 | marray[0] = m; |
| 1819 | return 1; |
| 1820 | } |
| 1821 | |
| 1822 | if (rahead > cahead) { |
| 1823 | rahead = cahead; |
| 1824 | } |
| 1825 | |
| 1826 | if (rbehind > cbehind) { |
| 1827 | rbehind = cbehind; |
| 1828 | } |
| 1829 | |
| 1830 | /* |
| 1831 | * Do not do any readahead if we have insufficient free memory. |
| 1832 | * |
	 * XXX this code was broken while it was disabled and still
	 * exhibits instability with this conditional fixed, so take
	 * the shortcut for now.
| 1835 | */ |
| 1836 | if (burst_fault == 0 || vm_page_count_severe()) { |
| 1837 | marray[0] = m; |
| 1838 | *reqpage = 0; |
| 1839 | return 1; |
| 1840 | } |
| 1841 | |
| 1842 | /* |
	 * Scan backward to delimit the read-behind window, stopping at
	 * the first page already in memory.
| 1844 | * |
| 1845 | * Assume that if the page is not found an interrupt will not |
| 1846 | * create it. Theoretically interrupts can only remove (busy) |
| 1847 | * pages, not create new associations. |
| 1848 | */ |
| 1849 | if (pindex > 0) { |
| 1850 | if (rbehind > pindex) { |
| 1851 | rbehind = pindex; |
| 1852 | startpindex = 0; |
| 1853 | } else { |
| 1854 | startpindex = pindex - rbehind; |
| 1855 | } |
| 1856 | |
| 1857 | crit_enter(); |
| 1858 | lwkt_gettoken(&vm_token); |
| 1859 | for (tpindex = pindex; tpindex > startpindex; --tpindex) { |
| 1860 | if (vm_page_lookup(object, tpindex - 1)) |
| 1861 | break; |
| 1862 | } |
| 1863 | |
| 1864 | i = 0; |
| 1865 | while (tpindex < pindex) { |
| 1866 | rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM); |
| 1867 | if (rtm == NULL) { |
| 1868 | lwkt_reltoken(&vm_token); |
| 1869 | crit_exit(); |
| 1870 | for (j = 0; j < i; j++) { |
| 1871 | vm_page_free(marray[j]); |
| 1872 | } |
| 1873 | marray[0] = m; |
| 1874 | *reqpage = 0; |
| 1875 | return 1; |
| 1876 | } |
| 1877 | marray[i] = rtm; |
| 1878 | ++i; |
| 1879 | ++tpindex; |
| 1880 | } |
| 1881 | lwkt_reltoken(&vm_token); |
| 1882 | crit_exit(); |
| 1883 | } else { |
| 1884 | i = 0; |
| 1885 | } |
| 1886 | |
| 1887 | /* |
| 1888 | * Assign requested page |
| 1889 | */ |
| 1890 | marray[i] = m; |
| 1891 | *reqpage = i; |
| 1892 | ++i; |
| 1893 | |
| 1894 | /* |
	 * Scan forward for the read-ahead pages.
| 1896 | */ |
| 1897 | tpindex = pindex + 1; |
| 1898 | endpindex = tpindex + rahead; |
| 1899 | if (endpindex > object->size) |
| 1900 | endpindex = object->size; |
| 1901 | |
| 1902 | crit_enter(); |
| 1903 | lwkt_gettoken(&vm_token); |
| 1904 | while (tpindex < endpindex) { |
| 1905 | if (vm_page_lookup(object, tpindex)) |
| 1906 | break; |
| 1907 | rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM); |
| 1908 | if (rtm == NULL) |
| 1909 | break; |
| 1910 | marray[i] = rtm; |
| 1911 | ++i; |
| 1912 | ++tpindex; |
| 1913 | } |
| 1914 | lwkt_reltoken(&vm_token); |
| 1915 | crit_exit(); |
| 1916 | |
| 1917 | return (i); |
| 1918 | } |
| 1919 | |
| 1920 | #endif |
| 1921 | |
| 1922 | /* |
| 1923 | * vm_prefault() provides a quick way of clustering pagefaults into a |
 * process's address space.  It is a "cousin" of pmap_object_init_pt,
| 1925 | * except it runs at page fault time instead of mmap time. |
| 1926 | * |
| 1927 | * This code used to be per-platform pmap_prefault(). It is now |
| 1928 | * machine-independent and enhanced to also pre-fault zero-fill pages |
| 1929 | * (see vm.fast_fault) as well as make them writable, which greatly |
| 1930 | * reduces the number of page faults programs incur. |
| 1931 | * |
| 1932 | * Application performance when pre-faulting zero-fill pages is heavily |
| 1933 | * dependent on the application. Very tiny applications like /bin/echo |
| 1934 | * lose a little performance while applications of any appreciable size |
| 1935 | * gain performance. Prefaulting multiple pages also reduces SMP |
| 1936 | * congestion and can improve SMP performance significantly. |
| 1937 | * |
| 1938 | * NOTE! prot may allow writing but this only applies to the top level |
| 1939 | * object. If we wind up mapping a page extracted from a backing |
| 1940 | * object we have to make sure it is read-only. |
| 1941 | * |
| 1942 | * NOTE! The caller has already handled any COW operations on the |
| 1943 | * vm_map_entry via the normal fault code. Do NOT call this |
| 1944 | * shortcut unless the normal fault code has run on this entry. |
| 1945 | * |
| 1946 | * No other requirements. |
| 1947 | */ |
| 1948 | #define PFBAK 4 |
| 1949 | #define PFFOR 4 |
| 1950 | #define PAGEORDER_SIZE (PFBAK+PFFOR) |
| 1951 | |
| 1952 | static int vm_prefault_pageorder[] = { |
| 1953 | -PAGE_SIZE, PAGE_SIZE, |
| 1954 | -2 * PAGE_SIZE, 2 * PAGE_SIZE, |
| 1955 | -3 * PAGE_SIZE, 3 * PAGE_SIZE, |
| 1956 | -4 * PAGE_SIZE, 4 * PAGE_SIZE |
| 1957 | }; |
| 1958 | |
| 1959 | static void |
| 1960 | vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot) |
| 1961 | { |
| 1962 | struct lwp *lp; |
| 1963 | vm_page_t m; |
| 1964 | vm_offset_t starta; |
| 1965 | vm_offset_t addr; |
| 1966 | vm_pindex_t index; |
| 1967 | vm_pindex_t pindex; |
| 1968 | vm_object_t object; |
| 1969 | int pprot; |
| 1970 | int i; |
| 1971 | |
| 1972 | /* |
| 1973 | * We do not currently prefault mappings that use virtual page |
| 1974 | * tables. We do not prefault foreign pmaps. |
| 1975 | */ |
| 1976 | if (entry->maptype == VM_MAPTYPE_VPAGETABLE) |
| 1977 | return; |
| 1978 | lp = curthread->td_lwp; |
| 1979 | if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) |
| 1980 | return; |
| 1981 | |
| 1982 | object = entry->object.vm_object; |
| 1983 | |
| 1984 | starta = addra - PFBAK * PAGE_SIZE; |
| 1985 | if (starta < entry->start) |
| 1986 | starta = entry->start; |
| 1987 | else if (starta > addra) |
| 1988 | starta = 0; |
| 1989 | |
| 1990 | /* |
	 * Critical section protection is required to maintain the
	 * page/object association; interrupts can free pages and remove
	 * them from their objects.
| 1994 | */ |
| 1995 | crit_enter(); |
| 1996 | lwkt_gettoken(&vm_token); |
| 1997 | for (i = 0; i < PAGEORDER_SIZE; i++) { |
| 1998 | vm_object_t lobject; |
| 1999 | int allocated = 0; |
| 2000 | |
| 2001 | addr = addra + vm_prefault_pageorder[i]; |
| 2002 | if (addr > addra + (PFFOR * PAGE_SIZE)) |
| 2003 | addr = 0; |
| 2004 | |
| 2005 | if (addr < starta || addr >= entry->end) |
| 2006 | continue; |
| 2007 | |
| 2008 | if (pmap_prefault_ok(pmap, addr) == 0) |
| 2009 | continue; |
| 2010 | |
| 2011 | /* |
| 2012 | * Follow the VM object chain to obtain the page to be mapped |
| 2013 | * into the pmap. |
| 2014 | * |
| 2015 | * If we reach the terminal object without finding a page |
| 2016 | * and we determine it would be advantageous, then allocate |
| 2017 | * a zero-fill page for the base object. The base object |
| 2018 | * is guaranteed to be OBJT_DEFAULT for this case. |
| 2019 | * |
		 * To avoid having to check the pager via *haspage*() we
		 * stop if any non-default object is encountered, e.g. a
		 * vnode or swap object stops the loop.
| 2023 | */ |
| 2024 | index = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; |
| 2025 | lobject = object; |
| 2026 | pindex = index; |
| 2027 | pprot = prot; |
| 2028 | |
| 2029 | while ((m = vm_page_lookup(lobject, pindex)) == NULL) { |
| 2030 | if (lobject->type != OBJT_DEFAULT) |
| 2031 | break; |
| 2032 | if (lobject->backing_object == NULL) { |
| 2033 | if (vm_fast_fault == 0) |
| 2034 | break; |
| 2035 | if (vm_prefault_pageorder[i] < 0 || |
| 2036 | (prot & VM_PROT_WRITE) == 0 || |
| 2037 | vm_page_count_min(0)) { |
| 2038 | break; |
| 2039 | } |
| 2040 | /* note: allocate from base object */ |
				m = vm_page_alloc(object, index,
						  VM_ALLOC_NORMAL |
						  VM_ALLOC_ZERO);
				if (m == NULL)	/* out of free pages */
					break;

| 2044 | if ((m->flags & PG_ZERO) == 0) { |
| 2045 | vm_page_zero_fill(m); |
| 2046 | } else { |
| 2047 | vm_page_flag_clear(m, PG_ZERO); |
| 2048 | mycpu->gd_cnt.v_ozfod++; |
| 2049 | } |
| 2050 | mycpu->gd_cnt.v_zfod++; |
| 2051 | m->valid = VM_PAGE_BITS_ALL; |
| 2052 | allocated = 1; |
| 2053 | pprot = prot; |
| 2054 | /* lobject = object .. not needed */ |
| 2055 | break; |
| 2056 | } |
| 2057 | if (lobject->backing_object_offset & PAGE_MASK) |
| 2058 | break; |
| 2059 | pindex += lobject->backing_object_offset >> PAGE_SHIFT; |
| 2060 | lobject = lobject->backing_object; |
| 2061 | pprot &= ~VM_PROT_WRITE; |
| 2062 | } |
| 2063 | /* |
| 2064 | * NOTE: lobject now invalid (if we did a zero-fill we didn't |
| 2065 | * bother assigning lobject = object). |
| 2066 | * |
		 * Give up if the page is not available.
| 2068 | */ |
| 2069 | if (m == NULL) |
| 2070 | break; |
| 2071 | |
| 2072 | /* |
| 2073 | * Do not conditionalize on PG_RAM. If pages are present in |
| 2074 | * the VM system we assume optimal caching. If caching is |
| 2075 | * not optimal the I/O gravy train will be restarted when we |
| 2076 | * hit an unavailable page. We do not want to try to restart |
| 2077 | * the gravy train now because we really don't know how much |
| 2078 | * of the object has been cached. The cost for restarting |
| 2079 | * the gravy train should be low (since accesses will likely |
| 2080 | * be I/O bound anyway). |
| 2081 | * |
| 2082 | * The object must be marked dirty if we are mapping a |
| 2083 | * writable page. |
| 2084 | */ |
| 2085 | if (pprot & VM_PROT_WRITE) |
| 2086 | vm_object_set_writeable_dirty(m->object); |
| 2087 | |
| 2088 | /* |
| 2089 | * Enter the page into the pmap if appropriate. If we had |
| 2090 | * allocated the page we have to place it on a queue. If not |
| 2091 | * we just have to make sure it isn't on the cache queue |
| 2092 | * (pages on the cache queue are not allowed to be mapped). |
| 2093 | */ |
| 2094 | if (allocated) { |
| 2095 | pmap_enter(pmap, addr, m, pprot, 0); |
| 2096 | vm_page_deactivate(m); |
| 2097 | vm_page_wakeup(m); |
| 2098 | } else if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && |
| 2099 | (m->busy == 0) && |
| 2100 | (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { |
| 2101 | |
| 2102 | if ((m->queue - m->pc) == PQ_CACHE) { |
| 2103 | vm_page_deactivate(m); |
| 2104 | } |
| 2105 | vm_page_busy(m); |
| 2106 | pmap_enter(pmap, addr, m, pprot, 0); |
| 2107 | vm_page_wakeup(m); |
| 2108 | } |
| 2109 | } |
| 2110 | lwkt_reltoken(&vm_token); |
| 2111 | crit_exit(); |
| 2112 | } |