gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1991, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	*
	5	* This code is derived from software contributed to Berkeley by
	6	* The Mach Operating System project at Carnegie-Mellon University.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in the
	15	* documentation and/or other materials provided with the distribution.
	16	* 3. All advertising materials mentioning features or use of this software
	17	* must display the following acknowledgement:
	18	* This product includes software developed by the University of
	19	* California, Berkeley and its contributors.
	20	* 4. Neither the name of the University nor the names of its contributors
	21	* may be used to endorse or promote products derived from this software
	22	* without specific prior written permission.
	23	*
	24	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	25	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	26	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	27	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	28	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	29	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	30	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	31	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	32	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	33	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	34	* SUCH DAMAGE.
	35	*
	36	* from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
	37	*
	38	*
	39	* Copyright (c) 1987, 1990 Carnegie-Mellon University.
	40	* All rights reserved.
	41	*
	42	* Authors: Avadis Tevanian, Jr., Michael Wayne Young
	43	*
	44	* Permission to use, copy, modify and distribute this software and
	45	* its documentation is hereby granted, provided that both the copyright
	46	* notice and this permission notice appear in all copies of the
	47	* software, derivative works or modified versions, and any portions
	48	* thereof, and that both notices appear in supporting documentation.
	49	*
	50	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	51	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
	52	* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	53	*
	54	* Carnegie Mellon requests users of this software to return to
	55	*
	56	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	57	* School of Computer Science
	58	* Carnegie Mellon University
	59	* Pittsburgh PA 15213-3890
	60	*
	61	* any improvements or extensions that they make and grant Carnegie the
	62	* rights to redistribute these changes.
	63	*
	64	* $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
	65	* $DragonFly: src/sys/vm/vm_map.c,v 1.29 2004/07/21 01:25:18 dillon Exp $
	66	*/
	67
	68	/*
	69	* Virtual memory mapping module.
	70	*/
	71
	72	#include <sys/param.h>
	73	#include <sys/systm.h>
	74	#include <sys/proc.h>
	75	#include <sys/lock.h>
	76	#include <sys/vmmeter.h>
	77	#include <sys/mman.h>
	78	#include <sys/vnode.h>
	79	#include <sys/resourcevar.h>
	80	#include <sys/shm.h>
	81
	82	#include <vm/vm.h>
	83	#include <vm/vm_param.h>
	84	#include <vm/pmap.h>
	85	#include <vm/vm_map.h>
	86	#include <vm/vm_page.h>
	87	#include <vm/vm_object.h>
	88	#include <vm/vm_pager.h>
	89	#include <vm/vm_kern.h>
	90	#include <vm/vm_extern.h>
	91	#include <vm/swap_pager.h>
	92	#include <vm/vm_zone.h>
	93
	94	#include <sys/thread2.h>
	95
	96	/*
	97	* Virtual memory maps provide for the mapping, protection,
	98	* and sharing of virtual memory objects. In addition,
	99	* this module provides for an efficient virtual copy of
	100	* memory from one map to another.
	101	*
	102	* Synchronization is required prior to most operations.
	103	*
	104	* Maps consist of an ordered doubly-linked list of simple
	105	* entries; a single hint is used to speed up lookups.
	106	*
	107	* Since portions of maps are specified by start/end addresses,
	108	* which may not align with existing map entries, all
	109	* routines merely "clip" entries to these start/end values.
	110	* [That is, an entry is split into two, bordering at a
	111	* start or end value.] Note that these clippings may not
	112	* always be necessary (as the two resulting entries are then
	113	* not changed); however, the clipping is done for convenience.
	114	*
	115	* As mentioned above, virtual copy operations are performed
	116	* by copying VM object references from one map to
	117	* another, and then marking both regions as copy-on-write.
	118	*/
	119
	120	/*
	121	* vm_map_startup:
	122	*
	123	* Initialize the vm_map module. Must be called before
	124	* any other vm_map routines.
	125	*
	126	* Map and entry structures are allocated from the general
	127	* purpose memory pool with some exceptions:
	128	*
	129	* - The kernel map and kmem submap are allocated statically.
	130	* - Kernel map entries are allocated out of a static pool.
	131	*
	132	* These restrictions are necessary since malloc() uses the
	133	* maps and requires map entries.
	134	*/
	135
	136	static struct vm_zone mapentzone_store, mapzone_store;
	137	static vm_zone_t mapentzone, mapzone, vmspace_zone;
	138	static struct vm_object mapentobj, mapobj;
	139
	140	static struct vm_map_entry map_entry_init[MAX_MAPENT];
	141	static struct vm_map map_init[MAX_KMAP];
	142
	143	static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
	144	static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
	145	static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
	146	static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
	147	static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
	148	static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
	149	static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
	150	vm_map_entry_t);
	151	static void vm_map_split (vm_map_entry_t);
	152	static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags);
	153
	154	void
	155	vm_map_startup(void)
	156	{
	157	mapzone = &mapzone_store;
	158	zbootinit(mapzone, "MAP", sizeof (struct vm_map),
	159	map_init, MAX_KMAP);
	160	mapentzone = &mapentzone_store;
	161	zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
	162	map_entry_init, MAX_MAPENT);
	163	}
	164
	165	/*
	166	* Allocate a vmspace structure, including a vm_map and pmap,
	167	* and initialize those structures. The refcnt is set to 1.
	168	* The remaining fields must be initialized by the caller.
	169	*/
	170	struct vmspace *
	171	vmspace_alloc(vm_offset_t min, vm_offset_t max)
	172	{
	173	struct vmspace *vm;
	174
	175	vm = zalloc(vmspace_zone);
	176	vm_map_init(&vm->vm_map, min, max);
	177	pmap_pinit(vmspace_pmap(vm));
	178	vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */
	179	vm->vm_refcnt = 1;
	180	vm->vm_shm = NULL;
	181	vm->vm_exitingcnt = 0;
	182	return (vm);
	183	}
	184
	185	void
	186	vm_init2(void)
	187	{
	188	zinitna(mapentzone, &mapentobj, NULL, 0, 0, ZONE_USE_RESERVE, 1);
	189	zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1);
	190	vmspace_zone = zinit("VMSPACE", sizeof (struct vmspace), 0, 0, 3);
	191	pmap_init2();
	192	vm_object_init2();
	193	}
	194
	195	static __inline void
	196	vmspace_dofree(struct vmspace *vm)
	197	{
	198	int count;
	199
	200	/*
	201	* Make sure any SysV shm is freed, it might not have in
	202	* exit1()
	203	*/
	204	shmexit(vm);
	205
	206	KKASSERT(vm->vm_upcalls == NULL);
	207
	208	/*
	209	* Lock the map, to wait out all other references to it.
	210	* Delete all of the mappings and pages they hold, then call
	211	* the pmap module to reclaim anything left.
	212	*/
	213	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	214	vm_map_lock(&vm->vm_map);
	215	vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
	216	vm->vm_map.max_offset, &count);
	217	vm_map_unlock(&vm->vm_map);
	218	vm_map_entry_release(count);
	219
	220	pmap_release(vmspace_pmap(vm));
	221	zfree(vmspace_zone, vm);
	222	}
	223
	224	void
	225	vmspace_free(struct vmspace *vm)
	226	{
	227	if (vm->vm_refcnt == 0)
	228	panic("vmspace_free: attempt to free already freed vmspace");
	229
	230	if (--vm->vm_refcnt == 0 && vm->vm_exitingcnt == 0)
	231	vmspace_dofree(vm);
	232	}
	233
	234	void
	235	vmspace_exitfree(struct proc *p)
	236	{
	237	struct vmspace *vm;
	238
	239	vm = p->p_vmspace;
	240	p->p_vmspace = NULL;
	241
	242	/*
	243	* cleanup by parent process wait()ing on exiting child. vm_refcnt
	244	* may not be 0 (e.g. fork() and child exits without exec()ing).
	245	* exitingcnt may increment above 0 and drop back down to zero
	246	* several times while vm_refcnt is held non-zero. vm_refcnt
	247	* may also increment above 0 and drop back down to zero several
	248	* times while vm_exitingcnt is held non-zero.
	249	*
	250	* The last wait on the exiting child's vmspace will clean up
	251	* the remainder of the vmspace.
	252	*/
	253	if (--vm->vm_exitingcnt == 0 && vm->vm_refcnt == 0)
	254	vmspace_dofree(vm);
	255	}
	256
	257	/*
	258	* vmspace_swap_count() - count the approximate swap useage in pages for a
	259	* vmspace.
	260	*
	261	* Swap useage is determined by taking the proportional swap used by
	262	* VM objects backing the VM map. To make up for fractional losses,
	263	* if the VM object has any swap use at all the associated map entries
	264	* count for at least 1 swap page.
	265	*/
	266	int
	267	vmspace_swap_count(struct vmspace *vmspace)
	268	{
	269	vm_map_t map = &vmspace->vm_map;
	270	vm_map_entry_t cur;
	271	int count = 0;
	272
	273	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
	274	vm_object_t object;
	275
	276	if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
	277	(object = cur->object.vm_object) != NULL &&
	278	object->type == OBJT_SWAP
	279	) {
	280	int n = (cur->end - cur->start) / PAGE_SIZE;
	281
	282	if (object->un_pager.swp.swp_bcount) {
	283	count += object->un_pager.swp.swp_bcount *
	284	SWAP_META_PAGES * n / object->size + 1;
	285	}
	286	}
	287	}
	288	return(count);
	289	}
	290
	291
	292	/*
	293	* vm_map_create:
	294	*
	295	* Creates and returns a new empty VM map with
	296	* the given physical map structure, and having
	297	* the given lower and upper address bounds.
	298	*/
	299	vm_map_t
	300	vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
	301	{
	302	vm_map_t result;
	303
	304	result = zalloc(mapzone);
	305	vm_map_init(result, min, max);
	306	result->pmap = pmap;
	307	return (result);
	308	}
	309
	310	/*
	311	* Initialize an existing vm_map structure
	312	* such as that in the vmspace structure.
	313	* The pmap is set elsewhere.
	314	*/
	315	void
	316	vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max)
	317	{
	318	map->header.next = map->header.prev = &map->header;
	319	map->nentries = 0;
	320	map->size = 0;
	321	map->system_map = 0;
	322	map->infork = 0;
	323	map->min_offset = min;
	324	map->max_offset = max;
	325	map->first_free = &map->header;
	326	map->hint = &map->header;
	327	map->timestamp = 0;
	328	lockinit(&map->lock, 0, "thrd_sleep", 0, LK_NOPAUSE);
	329	}
	330
	331	/*
	332	* vm_map_entry_cpu_init:
	333	*
	334	* Set an initial negative count so the first attempt to reserve
	335	* space preloads a bunch of vm_map_entry's for this cpu. This
	336	* routine is called in early boot so we cannot just call
	337	* vm_map_entry_reserve().
	338	*
	339	* May be called for a gd other then mycpu.
	340	*/
	341	void
	342	vm_map_entry_reserve_cpu_init(globaldata_t gd)
	343	{
	344	gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2;
	345	}
	346
	347	/*
	348	* vm_map_entry_reserve:
	349	*
	350	* Reserves vm_map_entry structures so code later on can manipulate
	351	* map_entry structures within a locked map without blocking trying
	352	* to allocate a new vm_map_entry.
	353	*/
	354	int
	355	vm_map_entry_reserve(int count)
	356	{
	357	struct globaldata *gd = mycpu;
	358	vm_map_entry_t entry;
	359
	360	crit_enter();
	361	gd->gd_vme_avail -= count;
	362
	363	/*
	364	* Make sure we have enough structures in gd_vme_base to handle
	365	* the reservation request.
	366	*/
	367	while (gd->gd_vme_avail < 0) {
	368	entry = zalloc(mapentzone);
	369	entry->next = gd->gd_vme_base;
	370	gd->gd_vme_base = entry;
	371	++gd->gd_vme_avail;
	372	}
	373	crit_exit();
	374	return(count);
	375	}
	376
	377	/*
	378	* vm_map_entry_release:
	379	*
	380	* Releases previously reserved vm_map_entry structures that were not
	381	* used. If we have too much junk in our per-cpu cache clean some of
	382	* it out.
	383	*/
	384	void
	385	vm_map_entry_release(int count)
	386	{
	387	struct globaldata *gd = mycpu;
	388	vm_map_entry_t entry;
	389
	390	crit_enter();
	391	gd->gd_vme_avail += count;
	392	while (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
	393	entry = gd->gd_vme_base;
	394	KKASSERT(entry != NULL);
	395	gd->gd_vme_base = entry->next;
	396	--gd->gd_vme_avail;
	397	crit_exit();
	398	zfree(mapentzone, entry);
	399	crit_enter();
	400	}
	401	crit_exit();
	402	}
	403
	404	/*
	405	* vm_map_entry_kreserve:
	406	*
	407	* Reserve map entry structures for use in kernel_map or (if it exists)
	408	* kmem_map. These entries have ALREADY been reserved on a per-cpu
	409	* basis when the map was inited. This function is used by zalloc()
	410	* to avoid a recursion when zalloc() itself needs to allocate additional
	411	* kernel memory.
	412	*
	413	* This function should only be used when the caller intends to later
	414	* call vm_map_entry_reserve() to 'normalize' the reserve cache.
	415	*/
	416	int
	417	vm_map_entry_kreserve(int count)
	418	{
	419	struct globaldata *gd = mycpu;
	420
	421	crit_enter();
	422	gd->gd_vme_kdeficit += count;
	423	crit_exit();
	424	KKASSERT(gd->gd_vme_base != NULL);
	425	return(count);
	426	}
	427
	428	/*
	429	* vm_map_entry_krelease:
	430	*
	431	* Release previously reserved map entries for kernel_map or kmem_map
	432	* use. This routine determines how many entries were actually used and
	433	* replentishes the kernel reserve supply from vme_avail.
	434	*
	435	* If there is insufficient supply vme_avail will go negative, which is
	436	* ok. We cannot safely call zalloc in this function without getting
	437	* into a recursion deadlock. zalloc() will call vm_map_entry_reserve()
	438	* to regenerate the lost entries.
	439	*/
	440	void
	441	vm_map_entry_krelease(int count)
	442	{
	443	struct globaldata *gd = mycpu;
	444
	445	crit_enter();
	446	gd->gd_vme_kdeficit -= count;
	447	gd->gd_vme_avail -= gd->gd_vme_kdeficit; /* can go negative */
	448	gd->gd_vme_kdeficit = 0;
	449	crit_exit();
	450	}
	451
	452	/*
	453	* vm_map_entry_create: [ internal use only ]
	454	*
	455	* Allocates a VM map entry for insertion. No entry fields are filled
	456	* in.
	457	*
	458	* This routine may be called from an interrupt thread but not a FAST
	459	* interrupt. This routine may recurse the map lock.
	460	*/
	461	static vm_map_entry_t
	462	vm_map_entry_create(vm_map_t map, int *countp)
	463	{
	464	struct globaldata *gd = mycpu;
	465	vm_map_entry_t entry;
	466
	467	KKASSERT(*countp > 0);
	468	--*countp;
	469	crit_enter();
	470	entry = gd->gd_vme_base;
	471	KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
	472	gd->gd_vme_base = entry->next;
	473	crit_exit();
	474	return(entry);
	475	}
	476
	477	/*
	478	* vm_map_entry_dispose: [ internal use only ]
	479	*
	480	* Dispose of a vm_map_entry that is no longer being referenced. This
	481	* function may be called from an interrupt.
	482	*/
	483	static void
	484	vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
	485	{
	486	struct globaldata *gd = mycpu;
	487
	488	++*countp;
	489	crit_enter();
	490	entry->next = gd->gd_vme_base;
	491	gd->gd_vme_base = entry;
	492	crit_exit();
	493	}
	494
	495
	496	/*
	497	* vm_map_entry_{un,}link:
	498	*
	499	* Insert/remove entries from maps.
	500	*/
	501	static __inline void
	502	vm_map_entry_link(vm_map_t map,
	503	vm_map_entry_t after_where,
	504	vm_map_entry_t entry)
	505	{
	506	map->nentries++;
	507	entry->prev = after_where;
	508	entry->next = after_where->next;
	509	entry->next->prev = entry;
	510	after_where->next = entry;
	511	}
	512
	513	static __inline void
	514	vm_map_entry_unlink(vm_map_t map,
	515	vm_map_entry_t entry)
	516	{
	517	vm_map_entry_t prev;
	518	vm_map_entry_t next;
	519
	520	if (entry->eflags & MAP_ENTRY_IN_TRANSITION)
	521	panic("vm_map_entry_unlink: attempt to mess with locked entry! %p", entry);
	522	prev = entry->prev;
	523	next = entry->next;
	524	next->prev = prev;
	525	prev->next = next;
	526	map->nentries--;
	527	}
	528
	529	/*
	530	* SAVE_HINT:
	531	*
	532	* Saves the specified entry as the hint for
	533	* future lookups.
	534	*/
	535	#define SAVE_HINT(map,value) \
	536	(map)->hint = (value);
	537
	538	/*
	539	* vm_map_lookup_entry: [ internal use only ]
	540	*
	541	* Finds the map entry containing (or
	542	* immediately preceding) the specified address
	543	* in the given map; the entry is returned
	544	* in the "entry" parameter. The boolean
	545	* result indicates whether the address is
	546	* actually contained in the map.
	547	*/
	548	boolean_t
	549	vm_map_lookup_entry(vm_map_t map, vm_offset_t address,
	550	vm_map_entry_t entry / OUT */)
	551	{
	552	vm_map_entry_t cur;
	553	vm_map_entry_t last;
	554
	555	/*
	556	* Start looking either from the head of the list, or from the hint.
	557	*/
	558
	559	cur = map->hint;
	560
	561	if (cur == &map->header)
	562	cur = cur->next;
	563
	564	if (address >= cur->start) {
	565	/*
	566	* Go from hint to end of list.
	567	*
	568	* But first, make a quick check to see if we are already looking
	569	* at the entry we want (which is usually the case). Note also
	570	* that we don't need to save the hint here... it is the same
	571	* hint (unless we are at the header, in which case the hint
	572	* didn't buy us anything anyway).
	573	*/
	574	last = &map->header;
	575	if ((cur != last) && (cur->end > address)) {
	576	*entry = cur;
	577	return (TRUE);
	578	}
	579	} else {
	580	/*
	581	* Go from start to hint, inclusively
	582	*/
	583	last = cur->next;
	584	cur = map->header.next;
	585	}
	586
	587	/*
	588	* Search linearly
	589	*/
	590
	591	while (cur != last) {
	592	if (cur->end > address) {
	593	if (address >= cur->start) {
	594	/*
	595	* Save this lookup for future hints, and
	596	* return
	597	*/
	598
	599	*entry = cur;
	600	SAVE_HINT(map, cur);
	601	return (TRUE);
	602	}
	603	break;
	604	}
	605	cur = cur->next;
	606	}
	607	*entry = cur->prev;
	608	SAVE_HINT(map, *entry);
	609	return (FALSE);
	610	}
	611
	612	/*
	613	* vm_map_insert:
	614	*
	615	* Inserts the given whole VM object into the target
	616	* map at the specified address range. The object's
	617	* size should match that of the address range.
	618	*
	619	* Requires that the map be locked, and leaves it so. Requires that
	620	* sufficient vm_map_entry structures have been reserved and tracks
	621	* the use via countp.
	622	*
	623	* If object is non-NULL, ref count must be bumped by caller
	624	* prior to making call to account for the new entry.
	625	*/
	626	int
	627	vm_map_insert(vm_map_t map, int *countp,
	628	vm_object_t object, vm_ooffset_t offset,
	629	vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
	630	int cow)
	631	{
	632	vm_map_entry_t new_entry;
	633	vm_map_entry_t prev_entry;
	634	vm_map_entry_t temp_entry;
	635	vm_eflags_t protoeflags;
	636
	637	/*
	638	* Check that the start and end points are not bogus.
	639	*/
	640
	641	if ((start < map->min_offset) \|\| (end > map->max_offset) \|\|
	642	(start >= end))
	643	return (KERN_INVALID_ADDRESS);
	644
	645	/*
	646	* Find the entry prior to the proposed starting address; if it's part
	647	* of an existing entry, this range is bogus.
	648	*/
	649
	650	if (vm_map_lookup_entry(map, start, &temp_entry))
	651	return (KERN_NO_SPACE);
	652
	653	prev_entry = temp_entry;
	654
	655	/*
	656	* Assert that the next entry doesn't overlap the end point.
	657	*/
	658
	659	if ((prev_entry->next != &map->header) &&
	660	(prev_entry->next->start < end))
	661	return (KERN_NO_SPACE);
	662
	663	protoeflags = 0;
	664
	665	if (cow & MAP_COPY_ON_WRITE)
	666	protoeflags \|= MAP_ENTRY_COW\|MAP_ENTRY_NEEDS_COPY;
	667
	668	if (cow & MAP_NOFAULT) {
	669	protoeflags \|= MAP_ENTRY_NOFAULT;
	670
	671	KASSERT(object == NULL,
	672	("vm_map_insert: paradoxical MAP_NOFAULT request"));
	673	}
	674	if (cow & MAP_DISABLE_SYNCER)
	675	protoeflags \|= MAP_ENTRY_NOSYNC;
	676	if (cow & MAP_DISABLE_COREDUMP)
	677	protoeflags \|= MAP_ENTRY_NOCOREDUMP;
	678
	679	if (object) {
	680	/*
	681	* When object is non-NULL, it could be shared with another
	682	* process. We have to set or clear OBJ_ONEMAPPING
	683	* appropriately.
	684	*/
	685	if ((object->ref_count > 1) \|\| (object->shadow_count != 0)) {
	686	vm_object_clear_flag(object, OBJ_ONEMAPPING);
	687	}
	688	}
	689	else if ((prev_entry != &map->header) &&
	690	(prev_entry->eflags == protoeflags) &&
	691	(prev_entry->end == start) &&
	692	(prev_entry->wired_count == 0) &&
	693	((prev_entry->object.vm_object == NULL) \|\|
	694	vm_object_coalesce(prev_entry->object.vm_object,
	695	OFF_TO_IDX(prev_entry->offset),
	696	(vm_size_t)(prev_entry->end - prev_entry->start),
	697	(vm_size_t)(end - prev_entry->end)))) {
	698	/*
	699	* We were able to extend the object. Determine if we
	700	* can extend the previous map entry to include the
	701	* new range as well.
	702	*/
	703	if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
	704	(prev_entry->protection == prot) &&
	705	(prev_entry->max_protection == max)) {
	706	map->size += (end - prev_entry->end);
	707	prev_entry->end = end;
	708	vm_map_simplify_entry(map, prev_entry, countp);
	709	return (KERN_SUCCESS);
	710	}
	711
	712	/*
	713	* If we can extend the object but cannot extend the
	714	* map entry, we have to create a new map entry. We
	715	* must bump the ref count on the extended object to
	716	* account for it. object may be NULL.
	717	*/
	718	object = prev_entry->object.vm_object;
	719	offset = prev_entry->offset +
	720	(prev_entry->end - prev_entry->start);
	721	vm_object_reference(object);
	722	}
	723
	724	/*
	725	* NOTE: if conditionals fail, object can be NULL here. This occurs
	726	* in things like the buffer map where we manage kva but do not manage
	727	* backing objects.
	728	*/
	729
	730	/*
	731	* Create a new entry
	732	*/
	733
	734	new_entry = vm_map_entry_create(map, countp);
	735	new_entry->start = start;
	736	new_entry->end = end;
	737
	738	new_entry->eflags = protoeflags;
	739	new_entry->object.vm_object = object;
	740	new_entry->offset = offset;
	741	new_entry->avail_ssize = 0;
	742
	743	new_entry->inheritance = VM_INHERIT_DEFAULT;
	744	new_entry->protection = prot;
	745	new_entry->max_protection = max;
	746	new_entry->wired_count = 0;
	747
	748	/*
	749	* Insert the new entry into the list
	750	*/
	751
	752	vm_map_entry_link(map, prev_entry, new_entry);
	753	map->size += new_entry->end - new_entry->start;
	754
	755	/*
	756	* Update the free space hint
	757	*/
	758	if ((map->first_free == prev_entry) &&
	759	(prev_entry->end >= new_entry->start)) {
	760	map->first_free = new_entry;
	761	}
	762
	763	#if 0
	764	/*
	765	* Temporarily removed to avoid MAP_STACK panic, due to
	766	* MAP_STACK being a huge hack. Will be added back in
	767	* when MAP_STACK (and the user stack mapping) is fixed.
	768	*/
	769	/*
	770	* It may be possible to simplify the entry
	771	*/
	772	vm_map_simplify_entry(map, new_entry, countp);
	773	#endif
	774
	775	if (cow & (MAP_PREFAULT\|MAP_PREFAULT_PARTIAL)) {
	776	pmap_object_init_pt(map->pmap, start, prot,
	777	object, OFF_TO_IDX(offset), end - start,
	778	cow & MAP_PREFAULT_PARTIAL);
	779	}
	780
	781	return (KERN_SUCCESS);
	782	}
	783
	784	/*
	785	* Find sufficient space for `length' bytes in the given map, starting at
	786	* `start'. The map must be locked. Returns 0 on success, 1 on no space.
	787	*
	788	* This function will returned an arbitrarily aligned pointer. If no
	789	* particular alignment is required you should pass align as 1. Note that
	790	* the map may return PAGE_SIZE aligned pointers if all the lengths used in
	791	* the map are a multiple of PAGE_SIZE, even if you pass a smaller align
	792	* argument.
	793	*
	794	* 'align' should be a power of 2 but is not required to be.
	795	*/
	796	int
	797	vm_map_findspace(
	798	vm_map_t map,
	799	vm_offset_t start,
	800	vm_size_t length,
	801	vm_offset_t align,
	802	vm_offset_t *addr)
	803	{
	804	vm_map_entry_t entry, next;
	805	vm_offset_t end;
	806	vm_offset_t align_mask;
	807
	808	if (start < map->min_offset)
	809	start = map->min_offset;
	810	if (start > map->max_offset)
	811	return (1);
	812
	813	/*
	814	* If the alignment is not a power of 2 we will have to use
	815	* a mod/division, set align_mask to a special value.
	816	*/
	817	if ((align \| (align - 1)) + 1 != (align << 1))
	818	align_mask = (vm_offset_t)-1;
	819	else
	820	align_mask = align - 1;
	821
	822	retry:
	823	/*
	824	* Look for the first possible address; if there's already something
	825	* at this address, we have to start after it.
	826	*/
	827	if (start == map->min_offset) {
	828	if ((entry = map->first_free) != &map->header)
	829	start = entry->end;
	830	} else {
	831	vm_map_entry_t tmp;
	832
	833	if (vm_map_lookup_entry(map, start, &tmp))
	834	start = tmp->end;
	835	entry = tmp;
	836	}
	837
	838	/*
	839	* Look through the rest of the map, trying to fit a new region in the
	840	* gap between existing regions, or after the very last region.
	841	*/
	842	for (;; start = (entry = next)->end) {
	843	/*
	844	* Adjust the proposed start by the requested alignment,
	845	* be sure that we didn't wrap the address.
	846	*/
	847	if (align_mask == (vm_offset_t)-1)
	848	end = ((start + align - 1) / align) * align;
	849	else
	850	end = (start + align_mask) & ~align_mask;
	851	if (end < start)
	852	return (1);
	853	start = end;
	854	/*
	855	* Find the end of the proposed new region. Be sure we didn't
	856	* go beyond the end of the map, or wrap around the address.
	857	* Then check to see if this is the last entry or if the
	858	* proposed end fits in the gap between this and the next
	859	* entry.
	860	*/
	861	end = start + length;
	862	if (end > map->max_offset \|\| end < start)
	863	return (1);
	864	next = entry->next;
	865	if (next == &map->header \|\| next->start >= end)
	866	break;
	867	}
	868	SAVE_HINT(map, entry);
	869	if (map == kernel_map) {
	870	vm_offset_t ksize;
	871	if ((ksize = round_page(start + length)) > kernel_vm_end) {
	872	pmap_growkernel(ksize);
	873	goto retry;
	874	}
	875	}
	876	*addr = start;
	877	return (0);
	878	}
	879
	880	/*
	881	* vm_map_find finds an unallocated region in the target address
	882	* map with the given length. The search is defined to be
	883	* first-fit from the specified address; the region found is
	884	* returned in the same parameter.
	885	*
	886	* If object is non-NULL, ref count must be bumped by caller
	887	* prior to making call to account for the new entry.
	888	*/
	889	int
	890	vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
	891	vm_offset_t addr, / IN/OUT */
	892	vm_size_t length, boolean_t find_space, vm_prot_t prot,
	893	vm_prot_t max, int cow)
	894	{
	895	vm_offset_t start;
	896	int result;
	897	int count;
	898
	899	start = *addr;
	900
	901	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	902	vm_map_lock(map);
	903	if (find_space) {
	904	if (vm_map_findspace(map, start, length, 1, addr)) {
	905	vm_map_unlock(map);
	906	vm_map_entry_release(count);
	907	return (KERN_NO_SPACE);
	908	}
	909	start = *addr;
	910	}
	911	result = vm_map_insert(map, &count, object, offset,
	912	start, start + length, prot, max, cow);
	913	vm_map_unlock(map);
	914	vm_map_entry_release(count);
	915
	916	return (result);
	917	}
	918
	919	/*
	920	* vm_map_simplify_entry:
	921	*
	922	* Simplify the given map entry by merging with either neighbor. This
	923	* routine also has the ability to merge with both neighbors.
	924	*
	925	* The map must be locked.
	926	*
	927	* This routine guarentees that the passed entry remains valid (though
	928	* possibly extended). When merging, this routine may delete one or
	929	* both neighbors. No action is taken on entries which have their
	930	* in-transition flag set.
	931	*/
	932	void
	933	vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
	934	{
	935	vm_map_entry_t next, prev;
	936	vm_size_t prevsize, esize;
	937
	938	if (entry->eflags & (MAP_ENTRY_IN_TRANSITION \| MAP_ENTRY_IS_SUB_MAP)) {
	939	++mycpu->gd_cnt.v_intrans_coll;
	940	return;
	941	}
	942
	943	prev = entry->prev;
	944	if (prev != &map->header) {
	945	prevsize = prev->end - prev->start;
	946	if ( (prev->end == entry->start) &&
	947	(prev->object.vm_object == entry->object.vm_object) &&
	948	(!prev->object.vm_object \|\|
	949	(prev->offset + prevsize == entry->offset)) &&
	950	(prev->eflags == entry->eflags) &&
	951	(prev->protection == entry->protection) &&
	952	(prev->max_protection == entry->max_protection) &&
	953	(prev->inheritance == entry->inheritance) &&
	954	(prev->wired_count == entry->wired_count)) {
	955	if (map->first_free == prev)
	956	map->first_free = entry;
	957	if (map->hint == prev)
	958	map->hint = entry;
	959	vm_map_entry_unlink(map, prev);
	960	entry->start = prev->start;
	961	entry->offset = prev->offset;
	962	if (prev->object.vm_object)
	963	vm_object_deallocate(prev->object.vm_object);
	964	vm_map_entry_dispose(map, prev, countp);
	965	}
	966	}
	967
	968	next = entry->next;
	969	if (next != &map->header) {
	970	esize = entry->end - entry->start;
	971	if ((entry->end == next->start) &&
	972	(next->object.vm_object == entry->object.vm_object) &&
	973	(!entry->object.vm_object \|\|
	974	(entry->offset + esize == next->offset)) &&
	975	(next->eflags == entry->eflags) &&
	976	(next->protection == entry->protection) &&
	977	(next->max_protection == entry->max_protection) &&
	978	(next->inheritance == entry->inheritance) &&
	979	(next->wired_count == entry->wired_count)) {
	980	if (map->first_free == next)
	981	map->first_free = entry;
	982	if (map->hint == next)
	983	map->hint = entry;
	984	vm_map_entry_unlink(map, next);
	985	entry->end = next->end;
	986	if (next->object.vm_object)
	987	vm_object_deallocate(next->object.vm_object);
	988	vm_map_entry_dispose(map, next, countp);
	989	}
	990	}
	991	}
	992	/*
	993	* vm_map_clip_start: [ internal use only ]
	994	*
	995	* Asserts that the given entry begins at or after
	996	* the specified address; if necessary,
	997	* it splits the entry into two.
	998	*/
	999	#define vm_map_clip_start(map, entry, startaddr, countp) \
	1000	{ \
	1001	if (startaddr > entry->start) \
	1002	_vm_map_clip_start(map, entry, startaddr, countp); \
	1003	}
	1004
	1005	/*
	1006	* This routine is called only when it is known that
	1007	* the entry must be split.
	1008	*/
	1009	static void
	1010	_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start, int *countp)
	1011	{
	1012	vm_map_entry_t new_entry;
	1013
	1014	/*
	1015	* Split off the front portion -- note that we must insert the new
	1016	* entry BEFORE this one, so that this entry has the specified
	1017	* starting address.
	1018	*/
	1019
	1020	vm_map_simplify_entry(map, entry, countp);
	1021
	1022	/*
	1023	* If there is no object backing this entry, we might as well create
	1024	* one now. If we defer it, an object can get created after the map
	1025	* is clipped, and individual objects will be created for the split-up
	1026	* map. This is a bit of a hack, but is also about the best place to
	1027	* put this improvement.
	1028	*/
	1029
	1030	if (entry->object.vm_object == NULL && !map->system_map) {
	1031	vm_object_t object;
	1032	object = vm_object_allocate(OBJT_DEFAULT,
	1033	atop(entry->end - entry->start));
	1034	entry->object.vm_object = object;
	1035	entry->offset = 0;
	1036	}
	1037
	1038	new_entry = vm_map_entry_create(map, countp);
	1039	new_entry = entry;
	1040
	1041	new_entry->end = start;
	1042	entry->offset += (start - entry->start);
	1043	entry->start = start;
	1044
	1045	vm_map_entry_link(map, entry->prev, new_entry);
	1046
	1047	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
	1048	vm_object_reference(new_entry->object.vm_object);
	1049	}
	1050	}
	1051
	/*
	 * vm_map_clip_end:	[ internal use only ]
	 *
	 *	Ensures that the given entry ends at or before the specified
	 *	address.  When 'endaddr' lies inside the entry (endaddr <
	 *	entry->end) the entry is split in two by _vm_map_clip_end();
	 *	otherwise nothing is done.
	 *
	 *	The body is wrapped in do { } while (0) so the macro behaves as
	 *	one statement (safe in an unbraced if/else) and every argument is
	 *	parenthesized.  'entry' and 'endaddr' may be evaluated more than
	 *	once, so do not pass expressions with side effects.
	 */
	#define vm_map_clip_end(map, entry, endaddr, countp) \
	do { \
		if ((endaddr) < (entry)->end) \
			_vm_map_clip_end((map), (entry), (endaddr), (countp)); \
	} while (0)
	1065
	1066	/*
	1067	* This routine is called only when it is known that
	1068	* the entry must be split.
	1069	*/
	1070	static void
	1071	_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end, int *countp)
	1072	{
	1073	vm_map_entry_t new_entry;
	1074
	1075	/*
	1076	* If there is no object backing this entry, we might as well create
	1077	* one now. If we defer it, an object can get created after the map
	1078	* is clipped, and individual objects will be created for the split-up
	1079	* map. This is a bit of a hack, but is also about the best place to
	1080	* put this improvement.
	1081	*/
	1082
	1083	if (entry->object.vm_object == NULL && !map->system_map) {
	1084	vm_object_t object;
	1085	object = vm_object_allocate(OBJT_DEFAULT,
	1086	atop(entry->end - entry->start));
	1087	entry->object.vm_object = object;
	1088	entry->offset = 0;
	1089	}
	1090
	1091	/*
	1092	* Create a new entry and insert it AFTER the specified entry
	1093	*/
	1094
	1095	new_entry = vm_map_entry_create(map, countp);
	1096	new_entry = entry;
	1097
	1098	new_entry->start = entry->end = end;
	1099	new_entry->offset += (end - entry->start);
	1100
	1101	vm_map_entry_link(map, entry, new_entry);
	1102
	1103	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
	1104	vm_object_reference(new_entry->object.vm_object);
	1105	}
	1106	}
	1107
	/*
	 * VM_MAP_RANGE_CHECK:	[ internal use only ]
	 *
	 *	Clamps the starting and ending addresses so that they fall
	 *	within the valid range of the map: 'start' is raised to
	 *	vm_map_min(map), 'end' is lowered to vm_map_max(map), and if
	 *	'start' still exceeds 'end' it is pulled back to 'end'.
	 *
	 *	'start' and 'end' must be modifiable lvalues; they are updated
	 *	in place.  The body is wrapped in do { } while (0) so the macro
	 *	behaves as one statement, and the arguments are parenthesized.
	 */
	#define VM_MAP_RANGE_CHECK(map, start, end) \
	do { \
		if ((start) < vm_map_min(map)) \
			(start) = vm_map_min(map); \
		if ((end) > vm_map_max(map)) \
			(end) = vm_map_max(map); \
		if ((start) > (end)) \
			(start) = (end); \
	} while (0)
	1123
	1124	/*
	1125	* vm_map_transition_wait: [ kernel use only ]
	1126	*
	1127	* Used to block when an in-transition collison occurs. The map
	1128	* is unlocked for the sleep and relocked before the return.
	1129	*/
	1130	static
	1131	void
	1132	vm_map_transition_wait(vm_map_t map)
	1133	{
	1134	vm_map_unlock(map);
	1135	tsleep(map, 0, "vment", 0);
	1136	vm_map_lock(map);
	1137	}
	1138
	/*
	 * CLIP_CHECK_BACK
	 * CLIP_CHECK_FWD
	 *
	 *	When we do blocking operations with the map lock held it is
	 *	possible that a clip might have occurred on our in-transit entry,
	 *	requiring an adjustment to the entry in our loop.  These macros
	 *	help the pageable and clip_range code deal with the case.  The
	 *	conditional costs virtually nothing if no clipping has occurred.
	 *
	 *	CLIP_CHECK_BACK walks 'entry' backwards (via ->prev) until it
	 *	reaches the fragment whose start equals 'save_start', asserting
	 *	that the map header is never reached.  'entry' must be a
	 *	modifiable lvalue and a variable named 'map' must be in scope at
	 *	the point of use.  Arguments are parenthesized so that compound
	 *	expressions may be passed for 'save_start'.
	 */

	#define CLIP_CHECK_BACK(entry, save_start) \
	do { \
		while ((entry)->start != (save_start)) { \
			(entry) = (entry)->prev; \
			KASSERT((entry) != &map->header, ("bad entry clip")); \
		} \
	} while (0)
	1157
	/*
	 * Walk 'entry' forwards (via ->next) until it reaches the fragment
	 * whose end equals 'save_end', asserting that the map header is never
	 * reached.  'entry' must be a modifiable lvalue and a variable named
	 * 'map' must be in scope at the point of use.  Arguments are
	 * parenthesized so that compound expressions may be passed.
	 */
	#define CLIP_CHECK_FWD(entry, save_end) \
	do { \
		while ((entry)->end != (save_end)) { \
			(entry) = (entry)->next; \
			KASSERT((entry) != &map->header, ("bad entry clip")); \
		} \
	} while (0)
	1165
	1166
	1167	/*
	1168	* vm_map_clip_range: [ kernel use only ]
	1169	*
	1170	* Clip the specified range and return the base entry. The
	1171	* range may cover several entries starting at the returned base
	1172	* and the first and last entry in the covering sequence will be
	1173	* properly clipped to the requested start and end address.
	1174	*
	1175	* If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
	1176	* flag.
	1177	*
	1178	* The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
	1179	* covered by the requested range.
	1180	*
	1181	* The map must be exclusively locked on entry and will remain locked
	1182	* on return. If no range exists or the range contains holes and you
	1183	* specified that no holes were allowed, NULL will be returned. This
	1184	* routine may temporarily unlock the map in order avoid a deadlock when
	1185	* sleeping.
	1186	*/
	1187	static
	1188	vm_map_entry_t
	1189	vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
	1190	int *countp, int flags)
	1191	{
	1192	vm_map_entry_t start_entry;
	1193	vm_map_entry_t entry;
	1194
	1195	/*
	1196	* Locate the entry and effect initial clipping. The in-transition
	1197	* case does not occur very often so do not try to optimize it.
	1198	*/
	1199	again:
	1200	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
	1201	return (NULL);
	1202	entry = start_entry;
	1203	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
	1204	entry->eflags \|= MAP_ENTRY_NEEDS_WAKEUP;
	1205	++mycpu->gd_cnt.v_intrans_coll;
	1206	++mycpu->gd_cnt.v_intrans_wait;
	1207	vm_map_transition_wait(map);
	1208	/*
	1209	* entry and/or start_entry may have been clipped while
	1210	* we slept, or may have gone away entirely. We have
	1211	* to restart from the lookup.
	1212	*/
	1213	goto again;
	1214	}
	1215	/*
	1216	* Since we hold an exclusive map lock we do not have to restart
	1217	* after clipping, even though clipping may block in zalloc.
	1218	*/
	1219	vm_map_clip_start(map, entry, start, countp);
	1220	vm_map_clip_end(map, entry, end, countp);
	1221	entry->eflags \|= MAP_ENTRY_IN_TRANSITION;
	1222
	1223	/*
	1224	* Scan entries covered by the range. When working on the next
	1225	* entry a restart need only re-loop on the current entry which
	1226	* we have already locked, since 'next' may have changed. Also,
	1227	* even though entry is safe, it may have been clipped so we
	1228	* have to iterate forwards through the clip after sleeping.
	1229	*/
	1230	while (entry->next != &map->header && entry->next->start < end) {
	1231	vm_map_entry_t next = entry->next;
	1232
	1233	if (flags & MAP_CLIP_NO_HOLES) {
	1234	if (next->start > entry->end) {
	1235	vm_map_unclip_range(map, start_entry,
	1236	start, entry->end, countp, flags);
	1237	return(NULL);
	1238	}
	1239	}
	1240
	1241	if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
	1242	vm_offset_t save_end = entry->end;
	1243	next->eflags \|= MAP_ENTRY_NEEDS_WAKEUP;
	1244	++mycpu->gd_cnt.v_intrans_coll;
	1245	++mycpu->gd_cnt.v_intrans_wait;
	1246	vm_map_transition_wait(map);
	1247
	1248	/*
	1249	* clips might have occured while we blocked.
	1250	*/
	1251	CLIP_CHECK_FWD(entry, save_end);
	1252	CLIP_CHECK_BACK(start_entry, start);
	1253	continue;
	1254	}
	1255	/*
	1256	* No restart necessary even though clip_end may block, we
	1257	* are holding the map lock.
	1258	*/
	1259	vm_map_clip_end(map, next, end, countp);
	1260	next->eflags \|= MAP_ENTRY_IN_TRANSITION;
	1261	entry = next;
	1262	}
	1263	if (flags & MAP_CLIP_NO_HOLES) {
	1264	if (entry->end != end) {
	1265	vm_map_unclip_range(map, start_entry,
	1266	start, entry->end, countp, flags);
	1267	return(NULL);
	1268	}
	1269	}
	1270	return(start_entry);
	1271	}
	1272
	1273	/*
	1274	* vm_map_unclip_range: [ kernel use only ]
	1275	*
	1276	* Undo the effect of vm_map_clip_range(). You should pass the same
	1277	* flags and the same range that you passed to vm_map_clip_range().
	1278	* This code will clear the in-transition flag on the entries and
	1279	* wake up anyone waiting. This code will also simplify the sequence
	1280	* and attempt to merge it with entries before and after the sequence.
	1281	*
	1282	* The map must be locked on entry and will remain locked on return.
	1283	*
	1284	* Note that you should also pass the start_entry returned by
	1285	* vm_map_clip_range(). However, if you block between the two calls
	1286	* with the map unlocked please be aware that the start_entry may
	1287	* have been clipped and you may need to scan it backwards to find
	1288	* the entry corresponding with the original start address. You are
	1289	* responsible for this, vm_map_unclip_range() expects the correct
	1290	* start_entry to be passed to it and will KASSERT otherwise.
	1291	*/
	1292	static
	1293	void
	1294	vm_map_unclip_range(
	1295	vm_map_t map,
	1296	vm_map_entry_t start_entry,
	1297	vm_offset_t start,
	1298	vm_offset_t end,
	1299	int *countp,
	1300	int flags)
	1301	{
	1302	vm_map_entry_t entry;
	1303
	1304	entry = start_entry;
	1305
	1306	KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
	1307	while (entry != &map->header && entry->start < end) {
	1308	KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("in-transition flag not set during unclip on: %p", entry));
	1309	KASSERT(entry->end <= end, ("unclip_range: tail wasn't clipped"));
	1310	entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
	1311	if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
	1312	entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
	1313	wakeup(map);
	1314	}
	1315	entry = entry->next;
	1316	}
	1317
	1318	/*
	1319	* Simplification does not block so there is no restart case.
	1320	*/
	1321	entry = start_entry;
	1322	while (entry != &map->header && entry->start < end) {
	1323	vm_map_simplify_entry(map, entry, countp);
	1324	entry = entry->next;
	1325	}
	1326	}
	1327
	1328	/*
	1329	* vm_map_submap: [ kernel use only ]
	1330	*
	1331	* Mark the given range as handled by a subordinate map.
	1332	*
	1333	* This range must have been created with vm_map_find,
	1334	* and no other operations may have been performed on this
	1335	* range prior to calling vm_map_submap.
	1336	*
	1337	* Only a limited number of operations can be performed
	1338	* within this rage after calling vm_map_submap:
	1339	* vm_fault
	1340	* [Don't try vm_map_copy!]
	1341	*
	1342	* To remove a submapping, one must first remove the
	1343	* range from the superior map, and then destroy the
	1344	* submap (if desired). [Better yet, don't try it.]
	1345	*/
	1346	int
	1347	vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
	1348	{
	1349	vm_map_entry_t entry;
	1350	int result = KERN_INVALID_ARGUMENT;
	1351	int count;
	1352
	1353	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1354	vm_map_lock(map);
	1355
	1356	VM_MAP_RANGE_CHECK(map, start, end);
	1357
	1358	if (vm_map_lookup_entry(map, start, &entry)) {
	1359	vm_map_clip_start(map, entry, start, &count);
	1360	} else {
	1361	entry = entry->next;
	1362	}
	1363
	1364	vm_map_clip_end(map, entry, end, &count);
	1365
	1366	if ((entry->start == start) && (entry->end == end) &&
	1367	((entry->eflags & MAP_ENTRY_COW) == 0) &&
	1368	(entry->object.vm_object == NULL)) {
	1369	entry->object.sub_map = submap;
	1370	entry->eflags \|= MAP_ENTRY_IS_SUB_MAP;
	1371	result = KERN_SUCCESS;
	1372	}
	1373	vm_map_unlock(map);
	1374	vm_map_entry_release(count);
	1375
	1376	return (result);
	1377	}
	1378
	1379	/*
	1380	* vm_map_protect:
	1381	*
	1382	* Sets the protection of the specified address
	1383	* region in the target map. If "set_max" is
	1384	* specified, the maximum protection is to be set;
	1385	* otherwise, only the current protection is affected.
	1386	*/
	1387	int
	1388	vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
	1389	vm_prot_t new_prot, boolean_t set_max)
	1390	{
	1391	vm_map_entry_t current;
	1392	vm_map_entry_t entry;
	1393	int count;
	1394
	1395	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1396	vm_map_lock(map);
	1397
	1398	VM_MAP_RANGE_CHECK(map, start, end);
	1399
	1400	if (vm_map_lookup_entry(map, start, &entry)) {
	1401	vm_map_clip_start(map, entry, start, &count);
	1402	} else {
	1403	entry = entry->next;
	1404	}
	1405
	1406	/*
	1407	* Make a first pass to check for protection violations.
	1408	*/
	1409
	1410	current = entry;
	1411	while ((current != &map->header) && (current->start < end)) {
	1412	if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
	1413	vm_map_unlock(map);
	1414	vm_map_entry_release(count);
	1415	return (KERN_INVALID_ARGUMENT);
	1416	}
	1417	if ((new_prot & current->max_protection) != new_prot) {
	1418	vm_map_unlock(map);
	1419	vm_map_entry_release(count);
	1420	return (KERN_PROTECTION_FAILURE);
	1421	}
	1422	current = current->next;
	1423	}
	1424
	1425	/*
	1426	* Go back and fix up protections. [Note that clipping is not
	1427	* necessary the second time.]
	1428	*/
	1429	current = entry;
	1430
	1431	while ((current != &map->header) && (current->start < end)) {
	1432	vm_prot_t old_prot;
	1433
	1434	vm_map_clip_end(map, current, end, &count);
	1435
	1436	old_prot = current->protection;
	1437	if (set_max)
	1438	current->protection =
	1439	(current->max_protection = new_prot) &
	1440	old_prot;
	1441	else
	1442	current->protection = new_prot;
	1443
	1444	/*
	1445	* Update physical map if necessary. Worry about copy-on-write
	1446	* here -- CHECK THIS XXX
	1447	*/
	1448
	1449	if (current->protection != old_prot) {
	1450	#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
	1451	VM_PROT_ALL)
	1452
	1453	pmap_protect(map->pmap, current->start,
	1454	current->end,
	1455	current->protection & MASK(current));
	1456	#undef MASK
	1457	}
	1458
	1459	vm_map_simplify_entry(map, current, &count);
	1460
	1461	current = current->next;
	1462	}
	1463
	1464	vm_map_unlock(map);
	1465	vm_map_entry_release(count);
	1466	return (KERN_SUCCESS);
	1467	}
	1468
	1469	/*
	1470	* vm_map_madvise:
	1471	*
	1472	* This routine traverses a processes map handling the madvise
	1473	* system call. Advisories are classified as either those effecting
	1474	* the vm_map_entry structure, or those effecting the underlying
	1475	* objects.
	1476	*/
	1477
	1478	int
	1479	vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end, int behav)
	1480	{
	1481	vm_map_entry_t current, entry;
	1482	int modify_map = 0;
	1483	int count;
	1484
	1485	/*
	1486	* Some madvise calls directly modify the vm_map_entry, in which case
	1487	* we need to use an exclusive lock on the map and we need to perform
	1488	* various clipping operations. Otherwise we only need a read-lock
	1489	* on the map.
	1490	*/
	1491
	1492	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1493
	1494	switch(behav) {
	1495	case MADV_NORMAL:
	1496	case MADV_SEQUENTIAL:
	1497	case MADV_RANDOM:
	1498	case MADV_NOSYNC:
	1499	case MADV_AUTOSYNC:
	1500	case MADV_NOCORE:
	1501	case MADV_CORE:
	1502	modify_map = 1;
	1503	vm_map_lock(map);
	1504	break;
	1505	case MADV_WILLNEED:
	1506	case MADV_DONTNEED:
	1507	case MADV_FREE:
	1508	vm_map_lock_read(map);
	1509	break;
	1510	default:
	1511	vm_map_entry_release(count);
	1512	return (KERN_INVALID_ARGUMENT);
	1513	}
	1514
	1515	/*
	1516	* Locate starting entry and clip if necessary.
	1517	*/
	1518
	1519	VM_MAP_RANGE_CHECK(map, start, end);
	1520
	1521	if (vm_map_lookup_entry(map, start, &entry)) {
	1522	if (modify_map)
	1523	vm_map_clip_start(map, entry, start, &count);
	1524	} else {
	1525	entry = entry->next;
	1526	}
	1527
	1528	if (modify_map) {
	1529	/*
	1530	* madvise behaviors that are implemented in the vm_map_entry.
	1531	*
	1532	* We clip the vm_map_entry so that behavioral changes are
	1533	* limited to the specified address range.
	1534	*/
	1535	for (current = entry;
	1536	(current != &map->header) && (current->start < end);
	1537	current = current->next
	1538	) {
	1539	if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
	1540	continue;
	1541
	1542	vm_map_clip_end(map, current, end, &count);
	1543
	1544	switch (behav) {
	1545	case MADV_NORMAL:
	1546	vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
	1547	break;
	1548	case MADV_SEQUENTIAL:
	1549	vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
	1550	break;
	1551	case MADV_RANDOM:
	1552	vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
	1553	break;
	1554	case MADV_NOSYNC:
	1555	current->eflags \|= MAP_ENTRY_NOSYNC;
	1556	break;
	1557	case MADV_AUTOSYNC:
	1558	current->eflags &= ~MAP_ENTRY_NOSYNC;
	1559	break;
	1560	case MADV_NOCORE:
	1561	current->eflags \|= MAP_ENTRY_NOCOREDUMP;
	1562	break;
	1563	case MADV_CORE:
	1564	current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
	1565	break;
	1566	default:
	1567	break;
	1568	}
	1569	vm_map_simplify_entry(map, current, &count);
	1570	}
	1571	vm_map_unlock(map);
	1572	} else {
	1573	vm_pindex_t pindex;
	1574	int count;
	1575
	1576	/*
	1577	* madvise behaviors that are implemented in the underlying
	1578	* vm_object.
	1579	*
	1580	* Since we don't clip the vm_map_entry, we have to clip
	1581	* the vm_object pindex and count.
	1582	*/
	1583	for (current = entry;
	1584	(current != &map->header) && (current->start < end);
	1585	current = current->next
	1586	) {
	1587	vm_offset_t useStart;
	1588
	1589	if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
	1590	continue;
	1591
	1592	pindex = OFF_TO_IDX(current->offset);
	1593	count = atop(current->end - current->start);
	1594	useStart = current->start;
	1595
	1596	if (current->start < start) {
	1597	pindex += atop(start - current->start);
	1598	count -= atop(start - current->start);
	1599	useStart = start;
	1600	}
	1601	if (current->end > end)
	1602	count -= atop(current->end - end);
	1603
	1604	if (count <= 0)
	1605	continue;
	1606
	1607	vm_object_madvise(current->object.vm_object,
	1608	pindex, count, behav);
	1609	if (behav == MADV_WILLNEED) {
	1610	pmap_object_init_pt(
	1611	map->pmap,
	1612	useStart,
	1613	current->protection,
	1614	current->object.vm_object,
	1615	pindex,
	1616	(count << PAGE_SHIFT),
	1617	MAP_PREFAULT_MADVISE
	1618	);
	1619	}
	1620	}
	1621	vm_map_unlock_read(map);
	1622	}
	1623	vm_map_entry_release(count);
	1624	return(0);
	1625	}
	1626
	1627
	1628	/*
	1629	* vm_map_inherit:
	1630	*
	1631	* Sets the inheritance of the specified address
	1632	* range in the target map. Inheritance
	1633	* affects how the map will be shared with
	1634	* child maps at the time of vm_map_fork.
	1635	*/
	1636	int
	1637	vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
	1638	vm_inherit_t new_inheritance)
	1639	{
	1640	vm_map_entry_t entry;
	1641	vm_map_entry_t temp_entry;
	1642	int count;
	1643
	1644	switch (new_inheritance) {
	1645	case VM_INHERIT_NONE:
	1646	case VM_INHERIT_COPY:
	1647	case VM_INHERIT_SHARE:
	1648	break;
	1649	default:
	1650	return (KERN_INVALID_ARGUMENT);
	1651	}
	1652
	1653	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1654	vm_map_lock(map);
	1655
	1656	VM_MAP_RANGE_CHECK(map, start, end);
	1657
	1658	if (vm_map_lookup_entry(map, start, &temp_entry)) {
	1659	entry = temp_entry;
	1660	vm_map_clip_start(map, entry, start, &count);
	1661	} else
	1662	entry = temp_entry->next;
	1663
	1664	while ((entry != &map->header) && (entry->start < end)) {
	1665	vm_map_clip_end(map, entry, end, &count);
	1666
	1667	entry->inheritance = new_inheritance;
	1668
	1669	vm_map_simplify_entry(map, entry, &count);
	1670
	1671	entry = entry->next;
	1672	}
	1673	vm_map_unlock(map);
	1674	vm_map_entry_release(count);
	1675	return (KERN_SUCCESS);
	1676	}
	1677
	1678	/*
	1679	* Implement the semantics of mlock
	1680	*/
	1681	int
	1682	vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
	1683	boolean_t new_pageable)
	1684	{
	1685	vm_map_entry_t entry;
	1686	vm_map_entry_t start_entry;
	1687	vm_offset_t end;
	1688	int rv = KERN_SUCCESS;
	1689	int count;
	1690
	1691	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1692	vm_map_lock(map);
	1693	VM_MAP_RANGE_CHECK(map, start, real_end);
	1694	end = real_end;
	1695
	1696	start_entry = vm_map_clip_range(map, start, end, &count, MAP_CLIP_NO_HOLES);
	1697	if (start_entry == NULL) {
	1698	vm_map_unlock(map);
	1699	vm_map_entry_release(count);
	1700	return (KERN_INVALID_ADDRESS);
	1701	}
	1702
	1703	if (new_pageable == 0) {
	1704	entry = start_entry;
	1705	while ((entry != &map->header) && (entry->start < end)) {
	1706	vm_offset_t save_start;
	1707	vm_offset_t save_end;
	1708
	1709	/*
	1710	* Already user wired or hard wired (trivial cases)
	1711	*/
	1712	if (entry->eflags & MAP_ENTRY_USER_WIRED) {
	1713	entry = entry->next;
	1714	continue;
	1715	}
	1716	if (entry->wired_count != 0) {
	1717	entry->wired_count++;
	1718	entry->eflags \|= MAP_ENTRY_USER_WIRED;
	1719	entry = entry->next;
	1720	continue;
	1721	}
	1722
	1723	/*
	1724	* A new wiring requires instantiation of appropriate
	1725	* management structures and the faulting in of the
	1726	* page.
	1727	*/
	1728	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
	1729	int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
	1730	if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) {
	1731
	1732	vm_object_shadow(&entry->object.vm_object,
	1733	&entry->offset,
	1734	atop(entry->end - entry->start));
	1735	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
	1736
	1737	} else if (entry->object.vm_object == NULL &&
	1738	!map->system_map) {
	1739
	1740	entry->object.vm_object =
	1741	vm_object_allocate(OBJT_DEFAULT,
	1742	atop(entry->end - entry->start));
	1743	entry->offset = (vm_offset_t) 0;
	1744
	1745	}
	1746	}
	1747	entry->wired_count++;
	1748	entry->eflags \|= MAP_ENTRY_USER_WIRED;
	1749
	1750	/*
	1751	* Now fault in the area. Note that vm_fault_wire()
	1752	* may release the map lock temporarily, it will be
	1753	* relocked on return. The in-transition
	1754	* flag protects the entries.
	1755	*/
	1756	save_start = entry->start;
	1757	save_end = entry->end;
	1758	rv = vm_fault_wire(map, entry, TRUE);
	1759	if (rv) {
	1760	CLIP_CHECK_BACK(entry, save_start);
	1761	for (;;) {
	1762	KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
	1763	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	1764	entry->wired_count = 0;
	1765	if (entry->end == save_end)
	1766	break;
	1767	entry = entry->next;
	1768	KASSERT(entry != &map->header, ("bad entry clip during backout"));
	1769	}
	1770	end = save_start; /* unwire the rest */
	1771	break;
	1772	}
	1773	/*
	1774	* note that even though the entry might have been
	1775	* clipped, the USER_WIRED flag we set prevents
	1776	* duplication so we do not have to do a
	1777	* clip check.
	1778	*/
	1779	entry = entry->next;
	1780	}
	1781
	1782	/*
	1783	* If we failed fall through to the unwiring section to
	1784	* unwire what we had wired so far. 'end' has already
	1785	* been adjusted.
	1786	*/
	1787	if (rv)
	1788	new_pageable = 1;
	1789
	1790	/*
	1791	* start_entry might have been clipped if we unlocked the
	1792	* map and blocked. No matter how clipped it has gotten
	1793	* there should be a fragment that is on our start boundary.
	1794	*/
	1795	CLIP_CHECK_BACK(start_entry, start);
	1796	}
	1797
	1798	/*
	1799	* Deal with the unwiring case.
	1800	*/
	1801	if (new_pageable) {
	1802	/*
	1803	* This is the unwiring case. We must first ensure that the
	1804	* range to be unwired is really wired down. We know there
	1805	* are no holes.
	1806	*/
	1807	entry = start_entry;
	1808	while ((entry != &map->header) && (entry->start < end)) {
	1809	if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
	1810	rv = KERN_INVALID_ARGUMENT;
	1811	goto done;
	1812	}
	1813	KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
	1814	entry = entry->next;
	1815	}
	1816
	1817	/*
	1818	* Now decrement the wiring count for each region. If a region
	1819	* becomes completely unwired, unwire its physical pages and
	1820	* mappings.
	1821	*/
	1822	/*
	1823	* The map entries are processed in a loop, checking to
	1824	* make sure the entry is wired and asserting it has a wired
	1825	* count. However, another loop was inserted more-or-less in
	1826	* the middle of the unwiring path. This loop picks up the
	1827	* "entry" loop variable from the first loop without first
	1828	* setting it to start_entry. Naturally, the secound loop
	1829	* is never entered and the pages backing the entries are
	1830	* never unwired. This can lead to a leak of wired pages.
	1831	*/
	1832	entry = start_entry;
	1833	while ((entry != &map->header) && (entry->start < end)) {
	1834	KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
	1835	("expected USER_WIRED on entry %p", entry));
	1836	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	1837	entry->wired_count--;
	1838	if (entry->wired_count == 0)
	1839	vm_fault_unwire(map, entry);
	1840	entry = entry->next;
	1841	}
	1842	}
	1843	done:
	1844	vm_map_unclip_range(map, start_entry, start, real_end, &count,
	1845	MAP_CLIP_NO_HOLES);
	1846	map->timestamp++;
	1847	vm_map_unlock(map);
	1848	vm_map_entry_release(count);
	1849	return (rv);
	1850	}
	1851
	1852	/*
	1853	* vm_map_wire:
	1854	*
	1855	* Sets the pageability of the specified address
	1856	* range in the target map. Regions specified
	1857	* as not pageable require locked-down physical
	1858	* memory and physical page maps.
	1859	*
	1860	* The map must not be locked, but a reference
	1861	* must remain to the map throughout the call.
	1862	*
	1863	* This function may be called via the zalloc path and must properly
	1864	* reserve map entries for kernel_map.
	1865	*/
	1866	int
	1867	vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
	1868	{
	1869	vm_map_entry_t entry;
	1870	vm_map_entry_t start_entry;
	1871	vm_offset_t end;
	1872	int rv = KERN_SUCCESS;
	1873	int count;
	1874	int s;
	1875
	1876	if (kmflags & KM_KRESERVE)
	1877	count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
	1878	else
	1879	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1880	vm_map_lock(map);
	1881	VM_MAP_RANGE_CHECK(map, start, real_end);
	1882	end = real_end;
	1883
	1884	start_entry = vm_map_clip_range(map, start, end, &count, MAP_CLIP_NO_HOLES);
	1885	if (start_entry == NULL) {
	1886	vm_map_unlock(map);
	1887	rv = KERN_INVALID_ADDRESS;
	1888	goto failure;
	1889	}
	1890	if ((kmflags & KM_PAGEABLE) == 0) {
	1891	/*
	1892	* Wiring.
	1893	*
	1894	* 1. Holding the write lock, we create any shadow or zero-fill
	1895	* objects that need to be created. Then we clip each map
	1896	* entry to the region to be wired and increment its wiring
	1897	* count. We create objects before clipping the map entries
	1898	* to avoid object proliferation.
	1899	*
	1900	* 2. We downgrade to a read lock, and call vm_fault_wire to
	1901	* fault in the pages for any newly wired area (wired_count is
	1902	* 1).
	1903	*
	1904	* Downgrading to a read lock for vm_fault_wire avoids a
	1905	* possible deadlock with another process that may have faulted
	1906	* on one of the pages to be wired (it would mark the page busy,
	1907	* blocking us, then in turn block on the map lock that we
	1908	* hold). Because of problems in the recursive lock package,
	1909	* we cannot upgrade to a write lock in vm_map_lookup. Thus,
	1910	* any actions that require the write lock must be done
	1911	* beforehand. Because we keep the read lock on the map, the
	1912	* copy-on-write status of the entries we modify here cannot
	1913	* change.
	1914	*/
	1915
	1916	entry = start_entry;
	1917	while ((entry != &map->header) && (entry->start < end)) {
	1918	/*
	1919	* Trivial case if the entry is already wired
	1920	*/
	1921	if (entry->wired_count) {
	1922	entry->wired_count++;
	1923	entry = entry->next;
	1924	continue;
	1925	}
	1926
	1927	/*
	1928	* The entry is being newly wired, we have to setup
	1929	* appropriate management structures. A shadow
	1930	* object is required for a copy-on-write region,
	1931	* or a normal object for a zero-fill region. We
	1932	* do not have to do this for entries that point to sub
	1933	* maps because we won't hold the lock on the sub map.
	1934	*/
	1935	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
	1936	int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
	1937	if (copyflag &&
	1938	((entry->protection & VM_PROT_WRITE) != 0)) {
	1939
	1940	vm_object_shadow(&entry->object.vm_object,
	1941	&entry->offset,
	1942	atop(entry->end - entry->start));
	1943	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
	1944	} else if (entry->object.vm_object == NULL &&
	1945	!map->system_map) {
	1946	entry->object.vm_object =
	1947	vm_object_allocate(OBJT_DEFAULT,
	1948	atop(entry->end - entry->start));
	1949	entry->offset = (vm_offset_t) 0;
	1950	}
	1951	}
	1952
	1953	entry->wired_count++;
	1954	entry = entry->next;
	1955	}
	1956
	1957	/*
	1958	* Pass 2.
	1959	*/
	1960
	1961	/*
	1962	* HACK HACK HACK HACK
	1963	*
	1964	* Unlock the map to avoid deadlocks. The in-transit flag
	1965	* protects us from most changes but note that
	1966	* clipping may still occur. To prevent clipping from
	1967	* occuring after the unlock, except for when we are
	1968	* blocking in vm_fault_wire, we must run at splvm().
	1969	* Otherwise our accesses to entry->start and entry->end
	1970	* could be corrupted. We have to set splvm() prior to
	1971	* unlocking so start_entry does not change out from
	1972	* under us at the very beginning of the loop.
	1973	*
	1974	* HACK HACK HACK HACK
	1975	*/
	1976
	1977	s = splvm();
	1978
	1979	entry = start_entry;
	1980	while (entry != &map->header && entry->start < end) {
	1981	/*
	1982	* If vm_fault_wire fails for any page we need to undo
	1983	* what has been done. We decrement the wiring count
	1984	* for those pages which have not yet been wired (now)
	1985	* and unwire those that have (later).
	1986	*/
	1987	vm_offset_t save_start = entry->start;
	1988	vm_offset_t save_end = entry->end;
	1989
	1990	if (entry->wired_count == 1)
	1991	rv = vm_fault_wire(map, entry, FALSE);
	1992	if (rv) {
	1993	CLIP_CHECK_BACK(entry, save_start);
	1994	for (;;) {
	1995	KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
	1996	entry->wired_count = 0;
	1997	if (entry->end == save_end)
	1998	break;
	1999	entry = entry->next;
	2000	KASSERT(entry != &map->header, ("bad entry clip during backout"));
	2001	}
	2002	end = save_start;
	2003	break;
	2004	}
	2005	CLIP_CHECK_FWD(entry, save_end);
	2006	entry = entry->next;
	2007	}
	2008	splx(s);
	2009
	2010	/*
	2011	* If a failure occured undo everything by falling through
	2012	* to the unwiring code. 'end' has already been adjusted
	2013	* appropriately.
	2014	*/
	2015	if (rv)
	2016	kmflags \|= KM_PAGEABLE;
	2017
	2018	/*
	2019	* start_entry is still IN_TRANSITION but may have been
	2020	* clipped since vm_fault_wire() unlocks and relocks the
	2021	* map. No matter how clipped it has gotten there should
	2022	* be a fragment that is on our start boundary.
	2023	*/
	2024	CLIP_CHECK_BACK(start_entry, start);
	2025	}
	2026
	2027	if (kmflags & KM_PAGEABLE) {
	2028	/*
	2029	* This is the unwiring case. We must first ensure that the
	2030	* range to be unwired is really wired down. We know there
	2031	* are no holes.
	2032	*/
	2033	entry = start_entry;
	2034	while ((entry != &map->header) && (entry->start < end)) {
	2035	if (entry->wired_count == 0) {
	2036	rv = KERN_INVALID_ARGUMENT;
	2037	goto done;
	2038	}
	2039	entry = entry->next;
	2040	}
	2041
	2042	/*
	2043	* Now decrement the wiring count for each region. If a region
	2044	* becomes completely unwired, unwire its physical pages and
	2045	* mappings.
	2046	*/
	2047	entry = start_entry;
	2048	while ((entry != &map->header) && (entry->start < end)) {
	2049	entry->wired_count--;
	2050	if (entry->wired_count == 0)
	2051	vm_fault_unwire(map, entry);
	2052	entry = entry->next;
	2053	}
	2054	}
	2055	done:
	2056	vm_map_unclip_range(map, start_entry, start, real_end, &count,
	2057	MAP_CLIP_NO_HOLES);
	2058	map->timestamp++;
	2059	vm_map_unlock(map);
	2060	failure:
	2061	if (kmflags & KM_KRESERVE)
	2062	vm_map_entry_krelease(count);
	2063	else
	2064	vm_map_entry_release(count);
	2065	return (rv);
	2066	}
	2067
	2068	/*
	2069	* vm_map_set_wired_quick()
	2070	*
	2071	* Mark a newly allocated address range as wired but do not fault in
	2072	* the pages. The caller is expected to load the pages into the object.
	2073	*
	2074	* The map must be locked on entry and will remain locked on return.
	2075	*/
	2076	void
	2077	vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size, int *countp)
	2078	{
	2079	vm_map_entry_t scan;
	2080	vm_map_entry_t entry;
	2081
	2082	entry = vm_map_clip_range(map, addr, addr + size, countp, MAP_CLIP_NO_HOLES);
	2083	for (scan = entry; scan != &map->header && scan->start < addr + size; scan = scan->next) {
	2084	KKASSERT(entry->wired_count == 0);
	2085	entry->wired_count = 1;
	2086	}
	2087	vm_map_unclip_range(map, entry, addr, addr + size, countp, MAP_CLIP_NO_HOLES);
	2088	}
	2089
	2090	/*
	2091	* vm_map_clean
	2092	*
	2093	* Push any dirty cached pages in the address range to their pager.
	2094	* If syncio is TRUE, dirty pages are written synchronously.
	2095	* If invalidate is TRUE, any cached pages are freed as well.
	2096	*
	2097	* Returns an error if any part of the specified range is not mapped.
	2098	*/
	2099	int
	2100	vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio,
	2101	boolean_t invalidate)
	2102	{
	2103	vm_map_entry_t current;
	2104	vm_map_entry_t entry;
	2105	vm_size_t size;
	2106	vm_object_t object;
	2107	vm_ooffset_t offset;
	2108
	2109	vm_map_lock_read(map);
	2110	VM_MAP_RANGE_CHECK(map, start, end);
	2111	if (!vm_map_lookup_entry(map, start, &entry)) {
	2112	vm_map_unlock_read(map);
	2113	return (KERN_INVALID_ADDRESS);
	2114	}
	2115	/*
	2116	* Make a first pass to check for holes.
	2117	*/
	2118	for (current = entry; current->start < end; current = current->next) {
	2119	if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
	2120	vm_map_unlock_read(map);
	2121	return (KERN_INVALID_ARGUMENT);
	2122	}
	2123	if (end > current->end &&
	2124	(current->next == &map->header \|\|
	2125	current->end != current->next->start)) {
	2126	vm_map_unlock_read(map);
	2127	return (KERN_INVALID_ADDRESS);
	2128	}
	2129	}
	2130
	2131	if (invalidate)
	2132	pmap_remove(vm_map_pmap(map), start, end);
	2133	/*
	2134	* Make a second pass, cleaning/uncaching pages from the indicated
	2135	* objects as we go.
	2136	*/
	2137	for (current = entry; current->start < end; current = current->next) {
	2138	offset = current->offset + (start - current->start);
	2139	size = (end <= current->end ? end : current->end) - start;
	2140	if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
	2141	vm_map_t smap;
	2142	vm_map_entry_t tentry;
	2143	vm_size_t tsize;
	2144
	2145	smap = current->object.sub_map;
	2146	vm_map_lock_read(smap);
	2147	(void) vm_map_lookup_entry(smap, offset, &tentry);
	2148	tsize = tentry->end - offset;
	2149	if (tsize < size)
	2150	size = tsize;
	2151	object = tentry->object.vm_object;
	2152	offset = tentry->offset + (offset - tentry->start);
	2153	vm_map_unlock_read(smap);
	2154	} else {
	2155	object = current->object.vm_object;
	2156	}
	2157	/*
	2158	* Note that there is absolutely no sense in writing out
	2159	* anonymous objects, so we track down the vnode object
	2160	* to write out.
	2161	* We invalidate (remove) all pages from the address space
	2162	* anyway, for semantic correctness.
	2163	*
	2164	* note: certain anonymous maps, such as MAP_NOSYNC maps,
	2165	* may start out with a NULL object.
	2166	*/
	2167	while (object && object->backing_object) {
	2168	object = object->backing_object;
	2169	offset += object->backing_object_offset;
	2170	if (object->size < OFF_TO_IDX( offset + size))
	2171	size = IDX_TO_OFF(object->size) - offset;
	2172	}
	2173	if (object && (object->type == OBJT_VNODE) &&
	2174	(current->protection & VM_PROT_WRITE)) {
	2175	/*
	2176	* Flush pages if writing is allowed, invalidate them
	2177	* if invalidation requested. Pages undergoing I/O
	2178	* will be ignored by vm_object_page_remove().
	2179	*
	2180	* We cannot lock the vnode and then wait for paging
	2181	* to complete without deadlocking against vm_fault.
	2182	* Instead we simply call vm_object_page_remove() and
	2183	* allow it to block internally on a page-by-page
	2184	* basis when it encounters pages undergoing async
	2185	* I/O.
	2186	*/
	2187	int flags;
	2188
	2189	vm_object_reference(object);
	2190	vn_lock(object->handle, NULL,
	2191	LK_EXCLUSIVE \| LK_RETRY, curthread);
	2192	flags = (syncio \|\| invalidate) ? OBJPC_SYNC : 0;
	2193	flags \|= invalidate ? OBJPC_INVAL : 0;
	2194	vm_object_page_clean(object,
	2195	OFF_TO_IDX(offset),
	2196	OFF_TO_IDX(offset + size + PAGE_MASK),
	2197	flags);
	2198	VOP_UNLOCK(object->handle, NULL, 0, curthread);
	2199	vm_object_deallocate(object);
	2200	}
	2201	if (object && invalidate &&
	2202	((object->type == OBJT_VNODE) \|\|
	2203	(object->type == OBJT_DEVICE))) {
	2204	int clean_only =
	2205	(object->type == OBJT_DEVICE) ? FALSE : TRUE;
	2206	vm_object_reference(object);
	2207	vm_object_page_remove(object,
	2208	OFF_TO_IDX(offset),
	2209	OFF_TO_IDX(offset + size + PAGE_MASK),
	2210	clean_only);
	2211	vm_object_deallocate(object);
	2212	}
	2213	start += size;
	2214	}
	2215
	2216	vm_map_unlock_read(map);
	2217	return (KERN_SUCCESS);
	2218	}
	2219
	2220	/*
	2221	* vm_map_entry_unwire: [ internal use only ]
	2222	*
	2223	* Make the region specified by this entry pageable.
	2224	*
	2225	* The map in question should be locked.
	2226	* [This is the reason for this routine's existence.]
	2227	*/
	2228	static void
	2229	vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
	2230	{
	2231	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	2232	entry->wired_count = 0;
	2233	vm_fault_unwire(map, entry);
	2234	}
	2235
	2236	/*
	2237	* vm_map_entry_delete: [ internal use only ]
	2238	*
	2239	* Deallocate the given entry from the target map.
	2240	*/
	2241	static void
	2242	vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
	2243	{
	2244	vm_map_entry_unlink(map, entry);
	2245	map->size -= entry->end - entry->start;
	2246
	2247	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
	2248	vm_object_deallocate(entry->object.vm_object);
	2249	}
	2250
	2251	vm_map_entry_dispose(map, entry, countp);
	2252	}
	2253
	2254	/*
	2255	* vm_map_delete: [ internal use only ]
	2256	*
	2257	* Deallocates the given address range from the target
	2258	* map.
	2259	*/
	2260	int
	2261	vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
	2262	{
	2263	vm_object_t object;
	2264	vm_map_entry_t entry;
	2265	vm_map_entry_t first_entry;
	2266
	2267	/*
	2268	* Find the start of the region, and clip it
	2269	*/
	2270
	2271	again:
	2272	if (!vm_map_lookup_entry(map, start, &first_entry)) {
	2273	entry = first_entry->next;
	2274	} else {
	2275	entry = first_entry;
	2276	vm_map_clip_start(map, entry, start, countp);
	2277	/*
	2278	* Fix the lookup hint now, rather than each time though the
	2279	* loop.
	2280	*/
	2281	SAVE_HINT(map, entry->prev);
	2282	}
	2283
	2284	/*
	2285	* Save the free space hint
	2286	*/
	2287
	2288	if (entry == &map->header) {
	2289	map->first_free = &map->header;
	2290	} else if (map->first_free->start >= start) {
	2291	map->first_free = entry->prev;
	2292	}
	2293
	2294	/*
	2295	* Step through all entries in this region
	2296	*/
	2297
	2298	while ((entry != &map->header) && (entry->start < end)) {
	2299	vm_map_entry_t next;
	2300	vm_offset_t s, e;
	2301	vm_pindex_t offidxstart, offidxend, count;
	2302
	2303	/*
	2304	* If we hit an in-transition entry we have to sleep and
	2305	* retry. It's easier (and not really slower) to just retry
	2306	* since this case occurs so rarely and the hint is already
	2307	* pointing at the right place. We have to reset the
	2308	* start offset so as not to accidently delete an entry
	2309	* another process just created in vacated space.
	2310	*/
	2311	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
	2312	entry->eflags \|= MAP_ENTRY_NEEDS_WAKEUP;
	2313	start = entry->start;
	2314	++mycpu->gd_cnt.v_intrans_coll;
	2315	++mycpu->gd_cnt.v_intrans_wait;
	2316	vm_map_transition_wait(map);
	2317	goto again;
	2318	}
	2319	vm_map_clip_end(map, entry, end, countp);
	2320
	2321	s = entry->start;
	2322	e = entry->end;
	2323	next = entry->next;
	2324
	2325	offidxstart = OFF_TO_IDX(entry->offset);
	2326	count = OFF_TO_IDX(e - s);
	2327	object = entry->object.vm_object;
	2328
	2329	/*
	2330	* Unwire before removing addresses from the pmap; otherwise,
	2331	* unwiring will put the entries back in the pmap.
	2332	*/
	2333	if (entry->wired_count != 0)
	2334	vm_map_entry_unwire(map, entry);
	2335
	2336	offidxend = offidxstart + count;
	2337
	2338	if ((object == kernel_object) \|\| (object == kmem_object)) {
	2339	vm_object_page_remove(object, offidxstart, offidxend, FALSE);
	2340	} else {
	2341	pmap_remove(map->pmap, s, e);
	2342	if (object != NULL &&
	2343	object->ref_count != 1 &&
	2344	(object->flags & (OBJ_NOSPLIT\|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING &&
	2345	(object->type == OBJT_DEFAULT \|\| object->type == OBJT_SWAP)) {
	2346	vm_object_collapse(object);
	2347	vm_object_page_remove(object, offidxstart, offidxend, FALSE);
	2348	if (object->type == OBJT_SWAP) {
	2349	swap_pager_freespace(object, offidxstart, count);
	2350	}
	2351	if (offidxend >= object->size &&
	2352	offidxstart < object->size) {
	2353	object->size = offidxstart;
	2354	}
	2355	}
	2356	}
	2357
	2358	/*
	2359	* Delete the entry (which may delete the object) only after
	2360	* removing all pmap entries pointing to its pages.
	2361	* (Otherwise, its page frames may be reallocated, and any
	2362	* modify bits will be set in the wrong object!)
	2363	*/
	2364	vm_map_entry_delete(map, entry, countp);
	2365	entry = next;
	2366	}
	2367	return (KERN_SUCCESS);
	2368	}
	2369
	2370	/*
	2371	* vm_map_remove:
	2372	*
	2373	* Remove the given address range from the target map.
	2374	* This is the exported form of vm_map_delete.
	2375	*/
	2376	int
	2377	vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
	2378	{
	2379	int result;
	2380	int count;
	2381
	2382	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	2383	vm_map_lock(map);
	2384	VM_MAP_RANGE_CHECK(map, start, end);
	2385	result = vm_map_delete(map, start, end, &count);
	2386	vm_map_unlock(map);
	2387	vm_map_entry_release(count);
	2388
	2389	return (result);
	2390	}
	2391
	2392	/*
	2393	* vm_map_check_protection:
	2394	*
	2395	* Assert that the target map allows the specified
	2396	* privilege on the entire address region given.
	2397	* The entire region must be allocated.
	2398	*/
	2399	boolean_t
	2400	vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
	2401	vm_prot_t protection)
	2402	{
	2403	vm_map_entry_t entry;
	2404	vm_map_entry_t tmp_entry;
	2405
	2406	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
	2407	return (FALSE);
	2408	}
	2409	entry = tmp_entry;
	2410
	2411	while (start < end) {
	2412	if (entry == &map->header) {
	2413	return (FALSE);
	2414	}
	2415	/*
	2416	* No holes allowed!
	2417	*/
	2418
	2419	if (start < entry->start) {
	2420	return (FALSE);
	2421	}
	2422	/*
	2423	* Check protection associated with entry.
	2424	*/
	2425
	2426	if ((entry->protection & protection) != protection) {
	2427	return (FALSE);
	2428	}
	2429	/* go to next entry */
	2430
	2431	start = entry->end;
	2432	entry = entry->next;
	2433	}
	2434	return (TRUE);
	2435	}
	2436
	2437	/*
	2438	* Split the pages in a map entry into a new object. This affords
	2439	* easier removal of unused pages, and keeps object inheritance from
	2440	* being a negative impact on memory usage.
	2441	*/
	2442	static void
	2443	vm_map_split(vm_map_entry_t entry)
	2444	{
	2445	vm_page_t m;
	2446	vm_object_t orig_object, new_object, source;
	2447	vm_offset_t s, e;
	2448	vm_pindex_t offidxstart, offidxend, idx;
	2449	vm_size_t size;
	2450	vm_ooffset_t offset;
	2451
	2452	orig_object = entry->object.vm_object;
	2453	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
	2454	return;
	2455	if (orig_object->ref_count <= 1)
	2456	return;
	2457
	2458	offset = entry->offset;
	2459	s = entry->start;
	2460	e = entry->end;
	2461
	2462	offidxstart = OFF_TO_IDX(offset);
	2463	offidxend = offidxstart + OFF_TO_IDX(e - s);
	2464	size = offidxend - offidxstart;
	2465
	2466	new_object = vm_pager_allocate(orig_object->type,
	2467	NULL, IDX_TO_OFF(size), VM_PROT_ALL, 0LL);
	2468	if (new_object == NULL)
	2469	return;
	2470
	2471	source = orig_object->backing_object;
	2472	if (source != NULL) {
	2473	vm_object_reference(source); /* Referenced by new_object */
	2474	LIST_INSERT_HEAD(&source->shadow_head,
	2475	new_object, shadow_list);
	2476	vm_object_clear_flag(source, OBJ_ONEMAPPING);
	2477	new_object->backing_object_offset =
	2478	orig_object->backing_object_offset + IDX_TO_OFF(offidxstart);
	2479	new_object->backing_object = source;
	2480	source->shadow_count++;
	2481	source->generation++;
	2482	}
	2483
	2484	for (idx = 0; idx < size; idx++) {
	2485	vm_page_t m;
	2486	int ss; /* s used */
	2487
	2488	/*
	2489	* splvm protection is required to avoid a race between
	2490	* the lookup and an interrupt/unbusy/free and our busy
	2491	* check.
	2492	*/
	2493	ss = splvm();
	2494	retry:
	2495	m = vm_page_lookup(orig_object, offidxstart + idx);
	2496	if (m == NULL) {
	2497	splx(ss);
	2498	continue;
	2499	}
	2500
	2501	/*
	2502	* We must wait for pending I/O to complete before we can
	2503	* rename the page.
	2504	*
	2505	* We do not have to VM_PROT_NONE the page as mappings should
	2506	* not be changed by this operation.
	2507	*/
	2508	if (vm_page_sleep_busy(m, TRUE, "spltwt"))
	2509	goto retry;
	2510	vm_page_busy(m);
	2511	vm_page_rename(m, new_object, idx);
	2512	/* page automatically made dirty by rename and cache handled */
	2513	vm_page_busy(m);
	2514	splx(ss);
	2515	}
	2516
	2517	if (orig_object->type == OBJT_SWAP) {
	2518	vm_object_pip_add(orig_object, 1);
	2519	/*
	2520	* copy orig_object pages into new_object
	2521	* and destroy unneeded pages in
	2522	* shadow object.
	2523	*/
	2524	swap_pager_copy(orig_object, new_object, offidxstart, 0);
	2525	vm_object_pip_wakeup(orig_object);
	2526	}
	2527
	2528	/*
	2529	* Wakeup the pages we played with. No spl protection is needed
	2530	* for a simple wakeup.
	2531	*/
	2532	for (idx = 0; idx < size; idx++) {
	2533	m = vm_page_lookup(new_object, idx);
	2534	if (m)
	2535	vm_page_wakeup(m);
	2536	}
	2537
	2538	entry->object.vm_object = new_object;
	2539	entry->offset = 0LL;
	2540	vm_object_deallocate(orig_object);
	2541	}
	2542
	2543	/*
	2544	* vm_map_copy_entry:
	2545	*
	2546	* Copies the contents of the source entry to the destination
	2547	* entry. The entries must be aligned properly.
	2548	*/
	2549	static void
	2550	vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
	2551	vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
	2552	{
	2553	vm_object_t src_object;
	2554
	2555	if ((dst_entry->eflags\|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
	2556	return;
	2557
	2558	if (src_entry->wired_count == 0) {
	2559
	2560	/*
	2561	* If the source entry is marked needs_copy, it is already
	2562	* write-protected.
	2563	*/
	2564	if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
	2565	pmap_protect(src_map->pmap,
	2566	src_entry->start,
	2567	src_entry->end,
	2568	src_entry->protection & ~VM_PROT_WRITE);
	2569	}
	2570
	2571	/*
	2572	* Make a copy of the object.
	2573	*/
	2574	if ((src_object = src_entry->object.vm_object) != NULL) {
	2575
	2576	if ((src_object->handle == NULL) &&
	2577	(src_object->type == OBJT_DEFAULT \|\|
	2578	src_object->type == OBJT_SWAP)) {
	2579	vm_object_collapse(src_object);
	2580	if ((src_object->flags & (OBJ_NOSPLIT\|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
	2581	vm_map_split(src_entry);
	2582	src_object = src_entry->object.vm_object;
	2583	}
	2584	}
	2585
	2586	vm_object_reference(src_object);
	2587	vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
	2588	dst_entry->object.vm_object = src_object;
	2589	src_entry->eflags \|= (MAP_ENTRY_COW\|MAP_ENTRY_NEEDS_COPY);
	2590	dst_entry->eflags \|= (MAP_ENTRY_COW\|MAP_ENTRY_NEEDS_COPY);
	2591	dst_entry->offset = src_entry->offset;
	2592	} else {
	2593	dst_entry->object.vm_object = NULL;
	2594	dst_entry->offset = 0;
	2595	}
	2596
	2597	pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
	2598	dst_entry->end - dst_entry->start, src_entry->start);
	2599	} else {
	2600	/*
	2601	* Of course, wired down pages can't be set copy-on-write.
	2602	* Cause wired pages to be copied into the new map by
	2603	* simulating faults (the new pages are pageable)
	2604	*/
	2605	vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
	2606	}
	2607	}
	2608
	2609	/*
	2610	* vmspace_fork:
	2611	* Create a new process vmspace structure and vm_map
	2612	* based on those of an existing process. The new map
	2613	* is based on the old map, according to the inheritance
	2614	* values on the regions in that map.
	2615	*
	2616	* The source map must not be locked.
	2617	*/
	2618	struct vmspace *
	2619	vmspace_fork(struct vmspace *vm1)
	2620	{
	2621	struct vmspace *vm2;
	2622	vm_map_t old_map = &vm1->vm_map;
	2623	vm_map_t new_map;
	2624	vm_map_entry_t old_entry;
	2625	vm_map_entry_t new_entry;
	2626	vm_object_t object;
	2627	int count;
	2628
	2629	vm_map_lock(old_map);
	2630	old_map->infork = 1;
	2631
	2632	/*
	2633	* XXX Note: upcalls are not copied.
	2634	*/
	2635	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
	2636	bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
	2637	(caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
	2638	new_map = &vm2->vm_map; /* XXX */
	2639	new_map->timestamp = 1;
	2640
	2641	count = 0;
	2642	old_entry = old_map->header.next;
	2643	while (old_entry != &old_map->header) {
	2644	++count;
	2645	old_entry = old_entry->next;
	2646	}
	2647
	2648	count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
	2649
	2650	old_entry = old_map->header.next;
	2651	while (old_entry != &old_map->header) {
	2652	if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
	2653	panic("vm_map_fork: encountered a submap");
	2654
	2655	switch (old_entry->inheritance) {
	2656	case VM_INHERIT_NONE:
	2657	break;
	2658
	2659	case VM_INHERIT_SHARE:
	2660	/*
	2661	* Clone the entry, creating the shared object if necessary.
	2662	*/
	2663	object = old_entry->object.vm_object;
	2664	if (object == NULL) {
	2665	object = vm_object_allocate(OBJT_DEFAULT,
	2666	atop(old_entry->end - old_entry->start));
	2667	old_entry->object.vm_object = object;
	2668	old_entry->offset = (vm_offset_t) 0;
	2669	}
	2670
	2671	/*
	2672	* Add the reference before calling vm_object_shadow
	2673	* to insure that a shadow object is created.
	2674	*/
	2675	vm_object_reference(object);
	2676	if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
	2677	vm_object_shadow(&old_entry->object.vm_object,
	2678	&old_entry->offset,
	2679	atop(old_entry->end - old_entry->start));
	2680	old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
	2681	/* Transfer the second reference too. */
	2682	vm_object_reference(
	2683	old_entry->object.vm_object);
	2684	vm_object_deallocate(object);
	2685	object = old_entry->object.vm_object;
	2686	}
	2687	vm_object_clear_flag(object, OBJ_ONEMAPPING);
	2688
	2689	/*
	2690	* Clone the entry, referencing the shared object.
	2691	*/
	2692	new_entry = vm_map_entry_create(new_map, &count);
	2693	new_entry = old_entry;
	2694	new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	2695	new_entry->wired_count = 0;
	2696
	2697	/*
	2698	* Insert the entry into the new map -- we know we're
	2699	* inserting at the end of the new map.
	2700	*/
	2701
	2702	vm_map_entry_link(new_map, new_map->header.prev,
	2703	new_entry);
	2704
	2705	/*
	2706	* Update the physical map
	2707	*/
	2708
	2709	pmap_copy(new_map->pmap, old_map->pmap,
	2710	new_entry->start,
	2711	(old_entry->end - old_entry->start),
	2712	old_entry->start);
	2713	break;
	2714
	2715	case VM_INHERIT_COPY:
	2716	/*
	2717	* Clone the entry and link into the map.
	2718	*/
	2719	new_entry = vm_map_entry_create(new_map, &count);
	2720	new_entry = old_entry;
	2721	new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	2722	new_entry->wired_count = 0;
	2723	new_entry->object.vm_object = NULL;
	2724	vm_map_entry_link(new_map, new_map->header.prev,
	2725	new_entry);
	2726	vm_map_copy_entry(old_map, new_map, old_entry,
	2727	new_entry);
	2728	break;
	2729	}
	2730	old_entry = old_entry->next;
	2731	}
	2732
	2733	new_map->size = old_map->size;
	2734	old_map->infork = 0;
	2735	vm_map_unlock(old_map);
	2736	vm_map_entry_release(count);
	2737
	2738	return (vm2);
	2739	}
	2740
	2741	int
	2742	vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
	2743	vm_prot_t prot, vm_prot_t max, int cow)
	2744	{
	2745	vm_map_entry_t prev_entry;
	2746	vm_map_entry_t new_stack_entry;
	2747	vm_size_t init_ssize;
	2748	int rv;
	2749	int count;
	2750
	2751	if (VM_MIN_ADDRESS > 0 && addrbos < VM_MIN_ADDRESS)
	2752	return (KERN_NO_SPACE);
	2753
	2754	if (max_ssize < sgrowsiz)
	2755	init_ssize = max_ssize;
	2756	else
	2757	init_ssize = sgrowsiz;
	2758
	2759	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	2760	vm_map_lock(map);
	2761
	2762	/* If addr is already mapped, no go */
	2763	if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
	2764	vm_map_unlock(map);
	2765	vm_map_entry_release(count);
	2766	return (KERN_NO_SPACE);
	2767	}
	2768
	2769	/* If we would blow our VMEM resource limit, no go */
	2770	if (map->size + init_ssize >
	2771	curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
	2772	vm_map_unlock(map);
	2773	vm_map_entry_release(count);
	2774	return (KERN_NO_SPACE);
	2775	}
	2776
	2777	/* If we can't accomodate max_ssize in the current mapping,
	2778	* no go. However, we need to be aware that subsequent user
	2779	* mappings might map into the space we have reserved for
	2780	* stack, and currently this space is not protected.
	2781	*
	2782	* Hopefully we will at least detect this condition
	2783	* when we try to grow the stack.
	2784	*/
	2785	if ((prev_entry->next != &map->header) &&
	2786	(prev_entry->next->start < addrbos + max_ssize)) {
	2787	vm_map_unlock(map);
	2788	vm_map_entry_release(count);
	2789	return (KERN_NO_SPACE);
	2790	}
	2791
	2792	/* We initially map a stack of only init_ssize. We will
	2793	* grow as needed later. Since this is to be a grow
	2794	* down stack, we map at the top of the range.
	2795	*
	2796	* Note: we would normally expect prot and max to be
	2797	* VM_PROT_ALL, and cow to be 0. Possibly we should
	2798	* eliminate these as input parameters, and just
	2799	* pass these values here in the insert call.
	2800	*/
	2801	rv = vm_map_insert(map, &count,
	2802	NULL, 0, addrbos + max_ssize - init_ssize,
	2803	addrbos + max_ssize, prot, max, cow);
	2804
	2805	/* Now set the avail_ssize amount */
	2806	if (rv == KERN_SUCCESS){
	2807	if (prev_entry != &map->header)
	2808	vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
	2809	new_stack_entry = prev_entry->next;
	2810	if (new_stack_entry->end != addrbos + max_ssize \|\|
	2811	new_stack_entry->start != addrbos + max_ssize - init_ssize)
	2812	panic ("Bad entry start/end for new stack entry");
	2813	else
	2814	new_stack_entry->avail_ssize = max_ssize - init_ssize;
	2815	}
	2816
	2817	vm_map_unlock(map);
	2818	vm_map_entry_release(count);
	2819	return (rv);
	2820	}
	2821
	2822	/* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the
	2823	* desired address is already mapped, or if we successfully grow
	2824	* the stack. Also returns KERN_SUCCESS if addr is outside the
	2825	* stack range (this is strange, but preserves compatibility with
	2826	* the grow function in vm_machdep.c).
	2827	*/
	2828	int
	2829	vm_map_growstack (struct proc *p, vm_offset_t addr)
	2830	{
	2831	vm_map_entry_t prev_entry;
	2832	vm_map_entry_t stack_entry;
	2833	vm_map_entry_t new_stack_entry;
	2834	struct vmspace *vm = p->p_vmspace;
	2835	vm_map_t map = &vm->vm_map;
	2836	vm_offset_t end;
	2837	int grow_amount;
	2838	int rv = KERN_SUCCESS;
	2839	int is_procstack;
	2840	int use_read_lock = 1;
	2841	int count;
	2842
	2843	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	2844	Retry:
	2845	if (use_read_lock)
	2846	vm_map_lock_read(map);
	2847	else
	2848	vm_map_lock(map);
	2849
	2850	/* If addr is already in the entry range, no need to grow.*/
	2851	if (vm_map_lookup_entry(map, addr, &prev_entry))
	2852	goto done;
	2853
	2854	if ((stack_entry = prev_entry->next) == &map->header)
	2855	goto done;
	2856	if (prev_entry == &map->header)
	2857	end = stack_entry->start - stack_entry->avail_ssize;
	2858	else
	2859	end = prev_entry->end;
	2860
	2861	/* This next test mimics the old grow function in vm_machdep.c.
	2862	* It really doesn't quite make sense, but we do it anyway
	2863	* for compatibility.
	2864	*
	2865	* If not growable stack, return success. This signals the
	2866	* caller to proceed as he would normally with normal vm.
	2867	*/
	2868	if (stack_entry->avail_ssize < 1 \|\|
	2869	addr >= stack_entry->start \|\|
	2870	addr < stack_entry->start - stack_entry->avail_ssize) {
	2871	goto done;
	2872	}
	2873
	2874	/* Find the minimum grow amount */
	2875	grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
	2876	if (grow_amount > stack_entry->avail_ssize) {
	2877	rv = KERN_NO_SPACE;
	2878	goto done;
	2879	}
	2880
	2881	/* If there is no longer enough space between the entries
	2882	* nogo, and adjust the available space. Note: this
	2883	* should only happen if the user has mapped into the
	2884	* stack area after the stack was created, and is
	2885	* probably an error.
	2886	*
	2887	* This also effectively destroys any guard page the user
	2888	* might have intended by limiting the stack size.
	2889	*/
	2890	if (grow_amount > stack_entry->start - end) {
	2891	if (use_read_lock && vm_map_lock_upgrade(map)) {
	2892	use_read_lock = 0;
	2893	goto Retry;
	2894	}
	2895	use_read_lock = 0;
	2896	stack_entry->avail_ssize = stack_entry->start - end;
	2897	rv = KERN_NO_SPACE;
	2898	goto done;
	2899	}
	2900
	2901	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
	2902
	2903	/* If this is the main process stack, see if we're over the
	2904	* stack limit.
	2905	*/
	2906	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
	2907	p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
	2908	rv = KERN_NO_SPACE;
	2909	goto done;
	2910	}
	2911
	2912	/* Round up the grow amount modulo SGROWSIZ */
	2913	grow_amount = roundup (grow_amount, sgrowsiz);
	2914	if (grow_amount > stack_entry->avail_ssize) {
	2915	grow_amount = stack_entry->avail_ssize;
	2916	}
	2917	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
	2918	p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
	2919	grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
	2920	ctob(vm->vm_ssize);
	2921	}
	2922
	2923	/* If we would blow our VMEM resource limit, no go */
	2924	if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
	2925	rv = KERN_NO_SPACE;
	2926	goto done;
	2927	}
	2928
	2929	if (use_read_lock && vm_map_lock_upgrade(map)) {
	2930	use_read_lock = 0;
	2931	goto Retry;
	2932	}
	2933	use_read_lock = 0;
	2934
	2935	/* Get the preliminary new entry start value */
	2936	addr = stack_entry->start - grow_amount;
	2937
	2938	/* If this puts us into the previous entry, cut back our growth
	2939	* to the available space. Also, see the note above.
	2940	*/
	2941	if (addr < end) {
	2942	stack_entry->avail_ssize = stack_entry->start - end;
	2943	addr = end;
	2944	}
	2945
	2946	rv = vm_map_insert(map, &count,
	2947	NULL, 0, addr, stack_entry->start,
	2948	VM_PROT_ALL,
	2949	VM_PROT_ALL,
	2950	0);
	2951
	2952	/* Adjust the available stack space by the amount we grew. */
	2953	if (rv == KERN_SUCCESS) {
	2954	if (prev_entry != &map->header)
	2955	vm_map_clip_end(map, prev_entry, addr, &count);
	2956	new_stack_entry = prev_entry->next;
	2957	if (new_stack_entry->end != stack_entry->start \|\|
	2958	new_stack_entry->start != addr)
	2959	panic ("Bad stack grow start/end in new stack entry");
	2960	else {
	2961	new_stack_entry->avail_ssize = stack_entry->avail_ssize -
	2962	(new_stack_entry->end -
	2963	new_stack_entry->start);
	2964	if (is_procstack)
	2965	vm->vm_ssize += btoc(new_stack_entry->end -
	2966	new_stack_entry->start);
	2967	}
	2968	}
	2969
	2970	done:
	2971	if (use_read_lock)
	2972	vm_map_unlock_read(map);
	2973	else
	2974	vm_map_unlock(map);
	2975	vm_map_entry_release(count);
	2976	return (rv);
	2977	}
	2978
	2979	/*
	2980	* Unshare the specified VM space for exec. If other processes are
	2981	* mapped to it, then create a new one. The new vmspace is null.
	2982	*/
	2983
	2984	void
	2985	vmspace_exec(struct proc p, struct vmspace vmcopy)
	2986	{
	2987	struct vmspace *oldvmspace = p->p_vmspace;
	2988	struct vmspace *newvmspace;
	2989	vm_map_t map = &p->p_vmspace->vm_map;
	2990
	2991	/*
	2992	* If we are execing a resident vmspace we fork it, otherwise
	2993	* we create a new vmspace. Note that exitingcnt and upcalls
	2994	* are not copied to the new vmspace.
	2995	*/
	2996	if (vmcopy) {
	2997	newvmspace = vmspace_fork(vmcopy);
	2998	} else {
	2999	newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
	3000	bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
	3001	(caddr_t)&oldvmspace->vm_endcopy -
	3002	(caddr_t)&oldvmspace->vm_startcopy);
	3003	}
	3004
	3005	/*
	3006	* This code is written like this for prototype purposes. The
	3007	* goal is to avoid running down the vmspace here, but let the
	3008	* other process's that are still using the vmspace to finally
	3009	* run it down. Even though there is little or no chance of blocking
	3010	* here, it is a good idea to keep this form for future mods.
	3011	*/
	3012	p->p_vmspace = newvmspace;
	3013	pmap_pinit2(vmspace_pmap(newvmspace));
	3014	if (p == curproc)
	3015	pmap_activate(p);
	3016	vmspace_free(oldvmspace);
	3017	}
	3018
	3019	/*
	3020	* Unshare the specified VM space for forcing COW. This
	3021	* is called by rfork, for the (RFMEM\|RFPROC) == 0 case.
	3022	*
	3023	* The exitingcnt test is not strictly necessary but has been
	3024	* included for code sanity (to make the code a bit more deterministic).
	3025	*/
	3026
	3027	void
	3028	vmspace_unshare(struct proc *p)
	3029	{
	3030	struct vmspace *oldvmspace = p->p_vmspace;
	3031	struct vmspace *newvmspace;
	3032
	3033	if (oldvmspace->vm_refcnt == 1 && oldvmspace->vm_exitingcnt == 0)
	3034	return;
	3035	newvmspace = vmspace_fork(oldvmspace);
	3036	p->p_vmspace = newvmspace;
	3037	pmap_pinit2(vmspace_pmap(newvmspace));
	3038	if (p == curproc)
	3039	pmap_activate(p);
	3040	vmspace_free(oldvmspace);
	3041	}
	3042
	3043	/*
	3044	* vm_map_lookup:
	3045	*
	3046	* Finds the VM object, offset, and
	3047	* protection for a given virtual address in the
	3048	* specified map, assuming a page fault of the
	3049	* type specified.
	3050	*
	3051	* Leaves the map in question locked for read; return
	3052	* values are guaranteed until a vm_map_lookup_done
	3053	* call is performed. Note that the map argument
	3054	* is in/out; the returned map must be used in
	3055	* the call to vm_map_lookup_done.
	3056	*
	3057	* A handle (out_entry) is returned for use in
	3058	* vm_map_lookup_done, to make that fast.
	3059	*
	3060	* If a lookup is requested with "write protection"
	3061	* specified, the map may be changed to perform virtual
	3062	* copying operations, although the data referenced will
	3063	* remain the same.
	3064	*/
	3065	int
	3066	vm_map_lookup(vm_map_t var_map, / IN/OUT */
	3067	vm_offset_t vaddr,
	3068	vm_prot_t fault_typea,
	3069	vm_map_entry_t out_entry, / OUT */
	3070	vm_object_t object, / OUT */
	3071	vm_pindex_t pindex, / OUT */
	3072	vm_prot_t out_prot, / OUT */
	3073	boolean_t wired) / OUT */
	3074	{
	3075	vm_map_entry_t entry;
	3076	vm_map_t map = *var_map;
	3077	vm_prot_t prot;
	3078	vm_prot_t fault_type = fault_typea;
	3079	int use_read_lock = 1;
	3080	int rv = KERN_SUCCESS;
	3081
	3082	RetryLookup:
	3083	if (use_read_lock)
	3084	vm_map_lock_read(map);
	3085	else
	3086	vm_map_lock(map);
	3087
	3088	/*
	3089	* If the map has an interesting hint, try it before calling full
	3090	* blown lookup routine.
	3091	*/
	3092	entry = map->hint;
	3093	*out_entry = entry;
	3094
	3095	if ((entry == &map->header) \|\|
	3096	(vaddr < entry->start) \|\| (vaddr >= entry->end)) {
	3097	vm_map_entry_t tmp_entry;
	3098
	3099	/*
	3100	* Entry was either not a valid hint, or the vaddr was not
	3101	* contained in the entry, so do a full lookup.
	3102	*/
	3103	if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
	3104	rv = KERN_INVALID_ADDRESS;
	3105	goto done;
	3106	}
	3107
	3108	entry = tmp_entry;
	3109	*out_entry = entry;
	3110	}
	3111
	3112	/*
	3113	* Handle submaps.
	3114	*/
	3115
	3116	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
	3117	vm_map_t old_map = map;
	3118
	3119	*var_map = map = entry->object.sub_map;
	3120	if (use_read_lock)
	3121	vm_map_unlock_read(old_map);
	3122	else
	3123	vm_map_unlock(old_map);
	3124	use_read_lock = 1;
	3125	goto RetryLookup;
	3126	}
	3127
	3128	/*
	3129	* Check whether this task is allowed to have this page.
	3130	* Note the special case for MAP_ENTRY_COW
	3131	* pages with an override. This is to implement a forced
	3132	* COW for debuggers.
	3133	*/
	3134
	3135	if (fault_type & VM_PROT_OVERRIDE_WRITE)
	3136	prot = entry->max_protection;
	3137	else
	3138	prot = entry->protection;
	3139
	3140	fault_type &= (VM_PROT_READ\|VM_PROT_WRITE\|VM_PROT_EXECUTE);
	3141	if ((fault_type & prot) != fault_type) {
	3142	rv = KERN_PROTECTION_FAILURE;
	3143	goto done;
	3144	}
	3145
	3146	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
	3147	(entry->eflags & MAP_ENTRY_COW) &&
	3148	(fault_type & VM_PROT_WRITE) &&
	3149	(fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
	3150	rv = KERN_PROTECTION_FAILURE;
	3151	goto done;
	3152	}
	3153
	3154	/*
	3155	* If this page is not pageable, we have to get it for all possible
	3156	* accesses.
	3157	*/
	3158
	3159	*wired = (entry->wired_count != 0);
	3160	if (*wired)
	3161	prot = fault_type = entry->protection;
	3162
	3163	/*
	3164	* If the entry was copy-on-write, we either ...
	3165	*/
	3166
	3167	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
	3168	/*
	3169	* If we want to write the page, we may as well handle that
	3170	* now since we've got the map locked.
	3171	*
	3172	* If we don't need to write the page, we just demote the
	3173	* permissions allowed.
	3174	*/
	3175
	3176	if (fault_type & VM_PROT_WRITE) {
	3177	/*
	3178	* Make a new object, and place it in the object
	3179	* chain. Note that no new references have appeared
	3180	* -- one just moved from the map to the new
	3181	* object.
	3182	*/
	3183
	3184	if (use_read_lock && vm_map_lock_upgrade(map)) {
	3185	use_read_lock = 0;
	3186	goto RetryLookup;
	3187	}
	3188	use_read_lock = 0;
	3189
	3190	vm_object_shadow(
	3191	&entry->object.vm_object,
	3192	&entry->offset,
	3193	atop(entry->end - entry->start));
	3194
	3195	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
	3196	} else {
	3197	/*
	3198	* We're attempting to read a copy-on-write page --
	3199	* don't allow writes.
	3200	*/
	3201
	3202	prot &= ~VM_PROT_WRITE;
	3203	}
	3204	}
	3205
	3206	/*
	3207	* Create an object if necessary.
	3208	*/
	3209	if (entry->object.vm_object == NULL &&
	3210	!map->system_map) {
	3211	if (use_read_lock && vm_map_lock_upgrade(map)) {
	3212	use_read_lock = 0;
	3213	goto RetryLookup;
	3214	}
	3215	use_read_lock = 0;
	3216	entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
	3217	atop(entry->end - entry->start));
	3218	entry->offset = 0;
	3219	}
	3220
	3221	/*
	3222	* Return the object/offset from this entry. If the entry was
	3223	* copy-on-write or empty, it has been fixed up.
	3224	*/
	3225
	3226	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	3227	*object = entry->object.vm_object;
	3228
	3229	/*
	3230	* Return whether this is the only map sharing this data. On
	3231	* success we return with a read lock held on the map. On failure
	3232	* we return with the map unlocked.
	3233	*/
	3234	*out_prot = prot;
	3235	done:
	3236	if (rv == KERN_SUCCESS) {
	3237	if (use_read_lock == 0)
	3238	vm_map_lock_downgrade(map);
	3239	} else if (use_read_lock) {
	3240	vm_map_unlock_read(map);
	3241	} else {
	3242	vm_map_unlock(map);
	3243	}
	3244	return (rv);
	3245	}
	3246
	3247	/*
	3248	* vm_map_lookup_done:
	3249	*
	3250	* Releases locks acquired by a vm_map_lookup
	3251	* (according to the handle returned by that lookup).
	3252	*/
	3253
	3254	void
	3255	vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
	3256	{
	3257	/*
	3258	* Unlock the main-level map
	3259	*/
	3260	vm_map_unlock_read(map);
	3261	if (count)
	3262	vm_map_entry_release(count);
	3263	}
	3264
	3265	#ifdef ENABLE_VFS_IOOPT
	3266
	3267	/*
	3268	* Implement uiomove with VM operations. This handles (and collateral changes)
	3269	* support every combination of source object modification, and COW type
	3270	* operations.
	3271	*
	3272	* XXX this is extremely dangerous, enabling this option is NOT recommended.
	3273	*/
	3274	int
	3275	vm_uiomove(vm_map_t mapa, vm_object_t srcobject, off_t cp, int cnta,
	3276	vm_offset_t uaddra, int *npages)
	3277	{
	3278	vm_map_t map;
	3279	vm_object_t first_object, oldobject, object;
	3280	vm_map_entry_t entry;
	3281	vm_prot_t prot;
	3282	boolean_t wired;
	3283	int tcnt, rv;
	3284	vm_offset_t uaddr, start, end, tend;
	3285	vm_pindex_t first_pindex, osize, oindex;
	3286	off_t ooffset;
	3287	int cnt;
	3288	int count;
	3289	int s;
	3290
	3291	if (npages)
	3292	*npages = 0;
	3293
	3294	cnt = cnta;
	3295	uaddr = uaddra;
	3296
	3297	while (cnt > 0) {
	3298	map = mapa;
	3299
	3300	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	3301
	3302	if ((vm_map_lookup(&map, uaddr,
	3303	VM_PROT_READ, &entry, &first_object,
	3304	&first_pindex, &prot, &wired)) != KERN_SUCCESS) {
	3305	return EFAULT;
	3306	}
	3307
	3308	vm_map_clip_start(map, entry, uaddr, &count);
	3309
	3310	tcnt = cnt;
	3311	tend = uaddr + tcnt;
	3312	if (tend > entry->end) {
	3313	tcnt = entry->end - uaddr;
	3314	tend = entry->end;
	3315	}
	3316
	3317	vm_map_clip_end(map, entry, tend, &count);
	3318
	3319	start = entry->start;
	3320	end = entry->end;
	3321
	3322	osize = atop(tcnt);
	3323
	3324	oindex = OFF_TO_IDX(cp);
	3325	if (npages) {
	3326	vm_pindex_t idx;
	3327
	3328	/*
	3329	* spl protection is needed to avoid a race between
	3330	* the lookup and an interrupt/unbusy/free occuring
	3331	* prior to our busy check.
	3332	*/
	3333	s = splvm();
	3334	for (idx = 0; idx < osize; idx++) {
	3335	vm_page_t m;
	3336	if ((m = vm_page_lookup(srcobject, oindex + idx)) == NULL) {
	3337	splx(s);
	3338	vm_map_lookup_done(map, entry, count);
	3339	return 0;
	3340	}
	3341	/*
	3342	* disallow busy or invalid pages, but allow
	3343	* m->busy pages if they are entirely valid.
	3344	*/
	3345	if ((m->flags & PG_BUSY) \|\|
	3346	((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) {
	3347	splx(s);
	3348	vm_map_lookup_done(map, entry, count);
	3349	return 0;
	3350	}
	3351	}
	3352	splx(s);
	3353	}
	3354
	3355	/*
	3356	* If we are changing an existing map entry, just redirect
	3357	* the object, and change mappings.
	3358	*/
	3359	if ((first_object->type == OBJT_VNODE) &&
	3360	((oldobject = entry->object.vm_object) == first_object)) {
	3361
	3362	if ((entry->offset != cp) \|\| (oldobject != srcobject)) {
	3363	/*
	3364	* Remove old window into the file
	3365	*/
	3366	pmap_remove (map->pmap, uaddr, tend);
	3367
	3368	/*
	3369	* Force copy on write for mmaped regions
	3370	*/
	3371	vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
	3372
	3373	/*
	3374	* Point the object appropriately
	3375	*/
	3376	if (oldobject != srcobject) {
	3377
	3378	/*
	3379	* Set the object optimization hint flag
	3380	*/
	3381	vm_object_set_flag(srcobject, OBJ_OPT);
	3382	vm_object_reference(srcobject);
	3383	entry->object.vm_object = srcobject;
	3384
	3385	if (oldobject) {
	3386	vm_object_deallocate(oldobject);
	3387	}
	3388	}
	3389
	3390	entry->offset = cp;
	3391	map->timestamp++;
	3392	} else {
	3393	pmap_remove (map->pmap, uaddr, tend);
	3394	}
	3395
	3396	} else if ((first_object->ref_count == 1) &&
	3397	(first_object->size == osize) &&
	3398	((first_object->type == OBJT_DEFAULT) \|\|
	3399	(first_object->type == OBJT_SWAP)) ) {
	3400
	3401	oldobject = first_object->backing_object;
	3402
	3403	if ((first_object->backing_object_offset != cp) \|\|
	3404	(oldobject != srcobject)) {
	3405	/*
	3406	* Remove old window into the file
	3407	*/
	3408	pmap_remove (map->pmap, uaddr, tend);
	3409
	3410	/*
	3411	* Remove unneeded old pages
	3412	*/
	3413	vm_object_page_remove(first_object, 0, 0, 0);
	3414
	3415	/*
	3416	* Invalidate swap space
	3417	*/
	3418	if (first_object->type == OBJT_SWAP) {
	3419	swap_pager_freespace(first_object,
	3420	0,
	3421	first_object->size);
	3422	}
	3423
	3424	/*
	3425	* Force copy on write for mmaped regions
	3426	*/
	3427	vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
	3428
	3429	/*
	3430	* Point the object appropriately
	3431	*/
	3432	if (oldobject != srcobject) {
	3433
	3434	/*
	3435	* Set the object optimization hint flag
	3436	*/
	3437	vm_object_set_flag(srcobject, OBJ_OPT);
	3438	vm_object_reference(srcobject);
	3439
	3440	if (oldobject) {
	3441	LIST_REMOVE(
	3442	first_object, shadow_list);
	3443	oldobject->shadow_count--;
	3444	/* XXX bump generation? */
	3445	vm_object_deallocate(oldobject);
	3446	}
	3447
	3448	LIST_INSERT_HEAD(&srcobject->shadow_head,
	3449	first_object, shadow_list);
	3450	srcobject->shadow_count++;
	3451	/* XXX bump generation? */
	3452
	3453	first_object->backing_object = srcobject;
	3454	}
	3455	first_object->backing_object_offset = cp;
	3456	map->timestamp++;
	3457	} else {
	3458	pmap_remove (map->pmap, uaddr, tend);
	3459	}
	3460	/*
	3461	* Otherwise, we have to do a logical mmap.
	3462	*/
	3463	} else {
	3464
	3465	vm_object_set_flag(srcobject, OBJ_OPT);
	3466	vm_object_reference(srcobject);
	3467
	3468	pmap_remove (map->pmap, uaddr, tend);
	3469
	3470	vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
	3471	vm_map_lock_upgrade(map);
	3472
	3473	if (entry == &map->header) {
	3474	map->first_free = &map->header;
	3475	} else if (map->first_free->start >= start) {
	3476	map->first_free = entry->prev;
	3477	}
	3478
	3479	SAVE_HINT(map, entry->prev);
	3480	vm_map_entry_delete(map, entry, &count);
	3481
	3482	object = srcobject;
	3483	ooffset = cp;
	3484
	3485	rv = vm_map_insert(map, &count,
	3486	object, ooffset, start, tend,
	3487	VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE);
	3488
	3489	if (rv != KERN_SUCCESS)
	3490	panic("vm_uiomove: could not insert new entry: %d", rv);
	3491	}
	3492
	3493	/*
	3494	* Map the window directly, if it is already in memory
	3495	*/
	3496	pmap_object_init_pt(map->pmap, uaddr, entry->protection,
	3497	srcobject, oindex, tcnt, 0);
	3498
	3499	map->timestamp++;
	3500	vm_map_unlock(map);
	3501	vm_map_entry_release(count);
	3502
	3503	cnt -= tcnt;
	3504	uaddr += tcnt;
	3505	cp += tcnt;
	3506	if (npages)
	3507	*npages += osize;
	3508	}
	3509	return 0;
	3510	}
	3511
	3512	#endif
	3513
	3514	/*
	3515	* Performs the copy_on_write operations necessary to allow the virtual copies
	3516	* into user space to work. This has to be called for write(2) system calls
	3517	* from other processes, file unlinking, and file size shrinkage.
	3518	*/
	3519	void
	3520	vm_freeze_copyopts(vm_object_t object, vm_pindex_t froma, vm_pindex_t toa)
	3521	{
	3522	int rv;
	3523	vm_object_t robject;
	3524	vm_pindex_t idx;
	3525
	3526	if ((object == NULL) \|\|
	3527	((object->flags & OBJ_OPT) == 0))
	3528	return;
	3529
	3530	if (object->shadow_count > object->ref_count)
	3531	panic("vm_freeze_copyopts: sc > rc");
	3532
	3533	while ((robject = LIST_FIRST(&object->shadow_head)) != NULL) {
	3534	vm_pindex_t bo_pindex;
	3535	vm_page_t m_in, m_out;
	3536
	3537	bo_pindex = OFF_TO_IDX(robject->backing_object_offset);
	3538
	3539	vm_object_reference(robject);
	3540
	3541	vm_object_pip_wait(robject, "objfrz");
	3542
	3543	if (robject->ref_count == 1) {
	3544	vm_object_deallocate(robject);
	3545	continue;
	3546	}
	3547
	3548	vm_object_pip_add(robject, 1);
	3549
	3550	for (idx = 0; idx < robject->size; idx++) {
	3551
	3552	m_out = vm_page_grab(robject, idx,
	3553	VM_ALLOC_NORMAL \| VM_ALLOC_RETRY);
	3554
	3555	if (m_out->valid == 0) {
	3556	m_in = vm_page_grab(object, bo_pindex + idx,
	3557	VM_ALLOC_NORMAL \| VM_ALLOC_RETRY);
	3558	if (m_in->valid == 0) {
	3559	rv = vm_pager_get_pages(object, &m_in, 1, 0);
	3560	if (rv != VM_PAGER_OK) {
	3561	printf("vm_freeze_copyopts: cannot read page from file: %lx\n", (long)m_in->pindex);
	3562	continue;
	3563	}
	3564	vm_page_deactivate(m_in);
	3565	}
	3566
	3567	vm_page_protect(m_in, VM_PROT_NONE);
	3568	pmap_copy_page(VM_PAGE_TO_PHYS(m_in), VM_PAGE_TO_PHYS(m_out));
	3569	m_out->valid = m_in->valid;
	3570	vm_page_dirty(m_out);
	3571	vm_page_activate(m_out);
	3572	vm_page_wakeup(m_in);
	3573	}
	3574	vm_page_wakeup(m_out);
	3575	}
	3576
	3577	object->shadow_count--;
	3578	object->ref_count--;
	3579	LIST_REMOVE(robject, shadow_list);
	3580	robject->backing_object = NULL;
	3581	robject->backing_object_offset = 0;
	3582
	3583	vm_object_pip_wakeup(robject);
	3584	vm_object_deallocate(robject);
	3585	}
	3586
	3587	vm_object_clear_flag(object, OBJ_OPT);
	3588	}
	3589
	3590	#include "opt_ddb.h"
	3591	#ifdef DDB
	3592	#include <sys/kernel.h>
	3593
	3594	#include <ddb/ddb.h>
	3595
	3596	/*
	3597	* vm_map_print: [ debug ]
	3598	*/
	3599	DB_SHOW_COMMAND(map, vm_map_print)
	3600	{
	3601	static int nlines;
	3602	/* XXX convert args. */
	3603	vm_map_t map = (vm_map_t)addr;
	3604	boolean_t full = have_addr;
	3605
	3606	vm_map_entry_t entry;
	3607
	3608	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
	3609	(void *)map,
	3610	(void *)map->pmap, map->nentries, map->timestamp);
	3611	nlines++;
	3612
	3613	if (!full && db_indent)
	3614	return;
	3615
	3616	db_indent += 2;
	3617	for (entry = map->header.next; entry != &map->header;
	3618	entry = entry->next) {
	3619	db_iprintf("map entry %p: start=%p, end=%p\n",
	3620	(void )entry, (void )entry->start, (void *)entry->end);
	3621	nlines++;
	3622	{
	3623	static char *inheritance_name[4] =
	3624	{"share", "copy", "none", "donate_copy"};
	3625
	3626	db_iprintf(" prot=%x/%x/%s",
	3627	entry->protection,
	3628	entry->max_protection,
	3629	inheritance_name[(int)(unsigned char)entry->inheritance]);
	3630	if (entry->wired_count != 0)
	3631	db_printf(", wired");
	3632	}
	3633	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
	3634	/* XXX no %qd in kernel. Truncate entry->offset. */
	3635	db_printf(", share=%p, offset=0x%lx\n",
	3636	(void *)entry->object.sub_map,
	3637	(long)entry->offset);
	3638	nlines++;
	3639	if ((entry->prev == &map->header) \|\|
	3640	(entry->prev->object.sub_map !=
	3641	entry->object.sub_map)) {
	3642	db_indent += 2;
	3643	vm_map_print((db_expr_t)(intptr_t)
	3644	entry->object.sub_map,
	3645	full, 0, (char *)0);
	3646	db_indent -= 2;
	3647	}
	3648	} else {
	3649	/* XXX no %qd in kernel. Truncate entry->offset. */
	3650	db_printf(", object=%p, offset=0x%lx",
	3651	(void *)entry->object.vm_object,
	3652	(long)entry->offset);
	3653	if (entry->eflags & MAP_ENTRY_COW)
	3654	db_printf(", copy (%s)",
	3655	(entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
	3656	db_printf("\n");
	3657	nlines++;
	3658
	3659	if ((entry->prev == &map->header) \|\|
	3660	(entry->prev->object.vm_object !=
	3661	entry->object.vm_object)) {
	3662	db_indent += 2;
	3663	vm_object_print((db_expr_t)(intptr_t)
	3664	entry->object.vm_object,
	3665	full, 0, (char *)0);
	3666	nlines += 4;
	3667	db_indent -= 2;
	3668	}
	3669	}
	3670	}
	3671	db_indent -= 2;
	3672	if (db_indent == 0)
	3673	nlines = 0;
	3674	}
	3675
	3676
	3677	DB_SHOW_COMMAND(procvm, procvm)
	3678	{
	3679	struct proc *p;
	3680
	3681	if (have_addr) {
	3682	p = (struct proc *) addr;
	3683	} else {
	3684	p = curproc;
	3685	}
	3686
	3687	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
	3688	(void )p, (void )p->p_vmspace, (void *)&p->p_vmspace->vm_map,
	3689	(void *)vmspace_pmap(p->p_vmspace));
	3690
	3691	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
	3692	}
	3693
	3694	#endif /* DDB */