gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1991, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	*
	5	* This code is derived from software contributed to Berkeley by
	6	* The Mach Operating System project at Carnegie-Mellon University.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in the
	15	* documentation and/or other materials provided with the distribution.
	16	* 3. All advertising materials mentioning features or use of this software
	17	* must display the following acknowledgement:
	18	* This product includes software developed by the University of
	19	* California, Berkeley and its contributors.
	20	* 4. Neither the name of the University nor the names of its contributors
	21	* may be used to endorse or promote products derived from this software
	22	* without specific prior written permission.
	23	*
	24	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	25	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	26	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	27	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	28	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	29	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	30	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	31	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	32	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	33	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	34	* SUCH DAMAGE.
	35	*
	36	* from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
	37	*
	38	*
	39	* Copyright (c) 1987, 1990 Carnegie-Mellon University.
	40	* All rights reserved.
	41	*
	42	* Authors: Avadis Tevanian, Jr., Michael Wayne Young
	43	*
	44	* Permission to use, copy, modify and distribute this software and
	45	* its documentation is hereby granted, provided that both the copyright
	46	* notice and this permission notice appear in all copies of the
	47	* software, derivative works or modified versions, and any portions
	48	* thereof, and that both notices appear in supporting documentation.
	49	*
	50	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	51	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
	52	* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	53	*
	54	* Carnegie Mellon requests users of this software to return to
	55	*
	56	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	57	* School of Computer Science
	58	* Carnegie Mellon University
	59	* Pittsburgh PA 15213-3890
	60	*
	61	* any improvements or extensions that they make and grant Carnegie the
	62	* rights to redistribute these changes.
	63	*
	64	* $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
	65	* $DragonFly: src/sys/vm/vm_map.c,v 1.56 2007/04/29 18:25:41 dillon Exp $
	66	*/
	67
	68	/*
	69	* Virtual memory mapping module.
	70	*/
	71
	72	#include <sys/param.h>
	73	#include <sys/systm.h>
	74	#include <sys/kernel.h>
	75	#include <sys/proc.h>
	76	#include <sys/lock.h>
	77	#include <sys/vmmeter.h>
	78	#include <sys/mman.h>
	79	#include <sys/vnode.h>
	80	#include <sys/resourcevar.h>
	81	#include <sys/shm.h>
	82	#include <sys/tree.h>
	83	#include <sys/malloc.h>
	84
	85	#include <vm/vm.h>
	86	#include <vm/vm_param.h>
	87	#include <vm/pmap.h>
	88	#include <vm/vm_map.h>
	89	#include <vm/vm_page.h>
	90	#include <vm/vm_object.h>
	91	#include <vm/vm_pager.h>
	92	#include <vm/vm_kern.h>
	93	#include <vm/vm_extern.h>
	94	#include <vm/swap_pager.h>
	95	#include <vm/vm_zone.h>
	96
	97	#include <sys/thread2.h>
	98	#include <sys/sysref2.h>
	99
	100	/*
	101	* Virtual memory maps provide for the mapping, protection,
	102	* and sharing of virtual memory objects. In addition,
	103	* this module provides for an efficient virtual copy of
	104	* memory from one map to another.
	105	*
	106	* Synchronization is required prior to most operations.
	107	*
	108	* Maps consist of an ordered doubly-linked list of simple
	109	* entries; a single hint is used to speed up lookups.
	110	*
	111	* Since portions of maps are specified by start/end addresses,
	112	* which may not align with existing map entries, all
	113	* routines merely "clip" entries to these start/end values.
	114	* [That is, an entry is split into two, bordering at a
	115	* start or end value.] Note that these clippings may not
	116	* always be necessary (as the two resulting entries are then
	117	* not changed); however, the clipping is done for convenience.
	118	*
	119	* As mentioned above, virtual copy operations are performed
	120	* by copying VM object references from one map to
	121	* another, and then marking both regions as copy-on-write.
	122	*/
	123
	124	static void vmspace_terminate(struct vmspace *vm);
	125	static void vmspace_lock(struct vmspace *vm);
	126	static void vmspace_unlock(struct vmspace *vm);
	127	static void vmspace_dtor(void obj, void private);
	128
	129	MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
	130
	131	struct sysref_class vmspace_sysref_class = {
	132	.name = "vmspace",
	133	.mtype = M_VMSPACE,
	134	.proto = SYSREF_PROTO_VMSPACE,
	135	.offset = offsetof(struct vmspace, vm_sysref),
	136	.objsize = sizeof(struct vmspace),
	137	.mag_capacity = 32,
	138	.flags = SRC_MANAGEDINIT,
	139	.dtor = vmspace_dtor,
	140	.ops = {
	141	.terminate = (sysref_terminate_func_t)vmspace_terminate,
	142	.lock = (sysref_lock_func_t)vmspace_lock,
	143	.unlock = (sysref_lock_func_t)vmspace_unlock
	144	}
	145	};
	146
	147	#define VMEPERCPU 2
	148
	149	static struct vm_zone mapentzone_store, mapzone_store;
	150	static vm_zone_t mapentzone, mapzone;
	151	static struct vm_object mapentobj, mapobj;
	152
	153	static struct vm_map_entry map_entry_init[MAX_MAPENT];
	154	static struct vm_map_entry cpu_map_entry_init[MAXCPU][VMEPERCPU];
	155	static struct vm_map map_init[MAX_KMAP];
	156
	157	static void vm_map_entry_shadow(vm_map_entry_t entry);
	158	static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
	159	static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
	160	static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
	161	static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
	162	static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
	163	static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
	164	static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
	165	vm_map_entry_t);
	166	static void vm_map_split (vm_map_entry_t);
	167	static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags);
	168
	169	/*
	170	* vm_map_startup:
	171	*
	172	* Initialize the vm_map module. Must be called before
	173	* any other vm_map routines.
	174	*
	175	* Map and entry structures are allocated from the general
	176	* purpose memory pool with some exceptions:
	177	*
	178	* - The kernel map and kmem submap are allocated statically.
	179	* - Kernel map entries are allocated out of a static pool.
	180	*
	181	* These restrictions are necessary since malloc() uses the
	182	* maps and requires map entries.
	183	*/
	184	void
	185	vm_map_startup(void)
	186	{
	187	mapzone = &mapzone_store;
	188	zbootinit(mapzone, "MAP", sizeof (struct vm_map),
	189	map_init, MAX_KMAP);
	190	mapentzone = &mapentzone_store;
	191	zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
	192	map_entry_init, MAX_MAPENT);
	193	}
	194
	195	/*
	196	* vm_init2 - called prior to any vmspace allocations
	197	*/
	198	void
	199	vm_init2(void)
	200	{
	201	zinitna(mapentzone, &mapentobj, NULL, 0, 0,
	202	ZONE_USE_RESERVE \| ZONE_SPECIAL, 1);
	203	zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1);
	204	pmap_init2();
	205	vm_object_init2();
	206	}
	207
	208
	209	/*
	210	* Red black tree functions
	211	*/
	212	static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
	213	RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
	214
	215	/* a->start is address, and the only field has to be initialized */
	216	static int
	217	rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
	218	{
	219	if (a->start < b->start)
	220	return(-1);
	221	else if (a->start > b->start)
	222	return(1);
	223	return(0);
	224	}
	225
	226	/*
	227	* Allocate a vmspace structure, including a vm_map and pmap.
	228	* Initialize numerous fields. While the initial allocation is zerod,
	229	* subsequence reuse from the objcache leaves elements of the structure
	230	* intact (particularly the pmap), so portions must be zerod.
	231	*
	232	* The structure is not considered activated until we call sysref_activate().
	233	*/
	234	struct vmspace *
	235	vmspace_alloc(vm_offset_t min, vm_offset_t max)
	236	{
	237	struct vmspace *vm;
	238
	239	vm = sysref_alloc(&vmspace_sysref_class);
	240	bzero(&vm->vm_startcopy,
	241	(char )&vm->vm_endcopy - (char )&vm->vm_startcopy);
	242	vm_map_init(&vm->vm_map, min, max, NULL);
	243	pmap_pinit(vmspace_pmap(vm)); /* (some fields reused) */
	244	vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */
	245	vm->vm_shm = NULL;
	246	vm->vm_exitingcnt = 0;
	247	cpu_vmspace_alloc(vm);
	248	sysref_activate(&vm->vm_sysref);
	249	return (vm);
	250	}
	251
	252	/*
	253	* dtor function - Some elements of the pmap are retained in the
	254	* free-cached vmspaces to improve performance. We have to clean them up
	255	* here before returning the vmspace to the memory pool.
	256	*/
	257	static void
	258	vmspace_dtor(void obj, void private)
	259	{
	260	struct vmspace *vm = obj;
	261
	262	pmap_puninit(vmspace_pmap(vm));
	263	}
	264
	265	/*
	266	* Called in two cases:
	267	*
	268	* (1) When the last sysref is dropped, but exitingcnt might still be
	269	* non-zero.
	270	*
	271	* (2) When there are no sysrefs (i.e. refcnt is negative) left and the
	272	* exitingcnt becomes zero
	273	*
	274	* sysref will not scrap the object until we call sysref_put() once more
	275	* after the last ref has been dropped.
	276	*/
	277	static void
	278	vmspace_terminate(struct vmspace *vm)
	279	{
	280	int count;
	281
	282	/*
	283	* If exitingcnt is non-zero we can't get rid of the entire vmspace
	284	* yet, but we can scrap user memory.
	285	*/
	286	if (vm->vm_exitingcnt) {
	287	shmexit(vm);
	288	pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
	289	VM_MAX_USER_ADDRESS);
	290	vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
	291	VM_MAX_USER_ADDRESS);
	292
	293	return;
	294	}
	295	cpu_vmspace_free(vm);
	296
	297	/*
	298	* Make sure any SysV shm is freed, it might not have in
	299	* exit1()
	300	*/
	301	shmexit(vm);
	302
	303	KKASSERT(vm->vm_upcalls == NULL);
	304
	305	/*
	306	* Lock the map, to wait out all other references to it.
	307	* Delete all of the mappings and pages they hold, then call
	308	* the pmap module to reclaim anything left.
	309	*/
	310	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	311	vm_map_lock(&vm->vm_map);
	312	vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
	313	vm->vm_map.max_offset, &count);
	314	vm_map_unlock(&vm->vm_map);
	315	vm_map_entry_release(count);
	316
	317	pmap_release(vmspace_pmap(vm));
	318	sysref_put(&vm->vm_sysref);
	319	}
	320
	321	static void
	322	vmspace_lock(struct vmspace *vm __unused)
	323	{
	324	}
	325
	326	static void
	327	vmspace_unlock(struct vmspace *vm __unused)
	328	{
	329	}
	330
	331	/*
	332	* This is called in the wait*() handling code. The vmspace can be terminated
	333	* after the last wait is finished using it.
	334	*/
	335	void
	336	vmspace_exitfree(struct proc *p)
	337	{
	338	struct vmspace *vm;
	339
	340	vm = p->p_vmspace;
	341	p->p_vmspace = NULL;
	342
	343	if (--vm->vm_exitingcnt == 0 && sysref_isinactive(&vm->vm_sysref))
	344	vmspace_terminate(vm);
	345	}
	346
	347	/*
	348	* vmspace_swap_count()
	349	*
	350	* Swap useage is determined by taking the proportional swap used by
	351	* VM objects backing the VM map. To make up for fractional losses,
	352	* if the VM object has any swap use at all the associated map entries
	353	* count for at least 1 swap page.
	354	*/
	355	int
	356	vmspace_swap_count(struct vmspace *vmspace)
	357	{
	358	vm_map_t map = &vmspace->vm_map;
	359	vm_map_entry_t cur;
	360	vm_object_t object;
	361	int count = 0;
	362	int n;
	363
	364	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
	365	switch(cur->maptype) {
	366	case VM_MAPTYPE_NORMAL:
	367	case VM_MAPTYPE_VPAGETABLE:
	368	if ((object = cur->object.vm_object) == NULL)
	369	break;
	370	if (object->swblock_count) {
	371	n = (cur->end - cur->start) / PAGE_SIZE;
	372	count += object->swblock_count *
	373	SWAP_META_PAGES * n / object->size + 1;
	374	}
	375	break;
	376	default:
	377	break;
	378	}
	379	}
	380	return(count);
	381	}
	382
	383	/*
	384	* vmspace_anonymous_count()
	385	*
	386	* Calculate the approximate number of anonymous pages in use by
	387	* this vmspace. To make up for fractional losses, we count each
	388	* VM object as having at least 1 anonymous page.
	389	*/
	390	int
	391	vmspace_anonymous_count(struct vmspace *vmspace)
	392	{
	393	vm_map_t map = &vmspace->vm_map;
	394	vm_map_entry_t cur;
	395	vm_object_t object;
	396	int count = 0;
	397
	398	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
	399	switch(cur->maptype) {
	400	case VM_MAPTYPE_NORMAL:
	401	case VM_MAPTYPE_VPAGETABLE:
	402	if ((object = cur->object.vm_object) == NULL)
	403	break;
	404	if (object->type != OBJT_DEFAULT &&
	405	object->type != OBJT_SWAP) {
	406	break;
	407	}
	408	count += object->resident_page_count;
	409	break;
	410	default:
	411	break;
	412	}
	413	}
	414	return(count);
	415	}
	416
	417
	418
	419
	420	/*
	421	* vm_map_create:
	422	*
	423	* Creates and returns a new empty VM map with
	424	* the given physical map structure, and having
	425	* the given lower and upper address bounds.
	426	*/
	427	vm_map_t
	428	vm_map_create(vm_map_t result, pmap_t pmap, vm_offset_t min, vm_offset_t max)
	429	{
	430	if (result == NULL)
	431	result = zalloc(mapzone);
	432	vm_map_init(result, min, max, pmap);
	433	return (result);
	434	}
	435
	436	/*
	437	* Initialize an existing vm_map structure
	438	* such as that in the vmspace structure.
	439	* The pmap is set elsewhere.
	440	*/
	441	void
	442	vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
	443	{
	444	map->header.next = map->header.prev = &map->header;
	445	RB_INIT(&map->rb_root);
	446	map->nentries = 0;
	447	map->size = 0;
	448	map->system_map = 0;
	449	map->infork = 0;
	450	map->min_offset = min;
	451	map->max_offset = max;
	452	map->pmap = pmap;
	453	map->first_free = &map->header;
	454	map->hint = &map->header;
	455	map->timestamp = 0;
	456	lockinit(&map->lock, "thrd_sleep", 0, 0);
	457	}
	458
	459	/*
	460	* Shadow the vm_map_entry's object. This typically needs to be done when
	461	* a write fault is taken on an entry which had previously been cloned by
	462	* fork(). The shared object (which might be NULL) must become private so
	463	* we add a shadow layer above it.
	464	*
	465	* Object allocation for anonymous mappings is defered as long as possible.
	466	* When creating a shadow, however, the underlying object must be instantiated
	467	* so it can be shared.
	468	*
	469	* If the map segment is governed by a virtual page table then it is
	470	* possible to address offsets beyond the mapped area. Just allocate
	471	* a maximally sized object for this case.
	472	*/
	473	static
	474	void
	475	vm_map_entry_shadow(vm_map_entry_t entry)
	476	{
	477	if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
	478	vm_object_shadow(&entry->object.vm_object, &entry->offset,
	479	0x7FFFFFFF); /* XXX */
	480	} else {
	481	vm_object_shadow(&entry->object.vm_object, &entry->offset,
	482	atop(entry->end - entry->start));
	483	}
	484	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
	485	}
	486
	487	/*
	488	* Allocate an object for a vm_map_entry.
	489	*
	490	* Object allocation for anonymous mappings is defered as long as possible.
	491	* This function is called when we can defer no longer, generally when a map
	492	* entry might be split or forked or takes a page fault.
	493	*
	494	* If the map segment is governed by a virtual page table then it is
	495	* possible to address offsets beyond the mapped area. Just allocate
	496	* a maximally sized object for this case.
	497	*/
	498	void
	499	vm_map_entry_allocate_object(vm_map_entry_t entry)
	500	{
	501	vm_object_t obj;
	502
	503	if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
	504	obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
	505	} else {
	506	obj = vm_object_allocate(OBJT_DEFAULT,
	507	atop(entry->end - entry->start));
	508	}
	509	entry->object.vm_object = obj;
	510	entry->offset = 0;
	511	}
	512
	513	/*
	514	* vm_map_entry_reserve_cpu_init:
	515	*
	516	* Set an initial negative count so the first attempt to reserve
	517	* space preloads a bunch of vm_map_entry's for this cpu. Also
	518	* pre-allocate 2 vm_map_entries which will be needed by zalloc() to
	519	* map a new page for vm_map_entry structures. SMP systems are
	520	* particularly sensitive.
	521	*
	522	* This routine is called in early boot so we cannot just call
	523	* vm_map_entry_reserve().
	524	*
	525	* May be called for a gd other then mycpu, but may only be called
	526	* during early boot.
	527	*/
	528	void
	529	vm_map_entry_reserve_cpu_init(globaldata_t gd)
	530	{
	531	vm_map_entry_t entry;
	532	int i;
	533
	534	gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2;
	535	entry = &cpu_map_entry_init[gd->gd_cpuid][0];
	536	for (i = 0; i < VMEPERCPU; ++i, ++entry) {
	537	entry->next = gd->gd_vme_base;
	538	gd->gd_vme_base = entry;
	539	}
	540	}
	541
	542	/*
	543	* vm_map_entry_reserve:
	544	*
	545	* Reserves vm_map_entry structures so code later on can manipulate
	546	* map_entry structures within a locked map without blocking trying
	547	* to allocate a new vm_map_entry.
	548	*/
	549	int
	550	vm_map_entry_reserve(int count)
	551	{
	552	struct globaldata *gd = mycpu;
	553	vm_map_entry_t entry;
	554
	555	crit_enter();
	556
	557	/*
	558	* Make sure we have enough structures in gd_vme_base to handle
	559	* the reservation request.
	560	*/
	561	while (gd->gd_vme_avail < count) {
	562	entry = zalloc(mapentzone);
	563	entry->next = gd->gd_vme_base;
	564	gd->gd_vme_base = entry;
	565	++gd->gd_vme_avail;
	566	}
	567	gd->gd_vme_avail -= count;
	568	crit_exit();
	569	return(count);
	570	}
	571
	572	/*
	573	* vm_map_entry_release:
	574	*
	575	* Releases previously reserved vm_map_entry structures that were not
	576	* used. If we have too much junk in our per-cpu cache clean some of
	577	* it out.
	578	*/
	579	void
	580	vm_map_entry_release(int count)
	581	{
	582	struct globaldata *gd = mycpu;
	583	vm_map_entry_t entry;
	584
	585	crit_enter();
	586	gd->gd_vme_avail += count;
	587	while (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
	588	entry = gd->gd_vme_base;
	589	KKASSERT(entry != NULL);
	590	gd->gd_vme_base = entry->next;
	591	--gd->gd_vme_avail;
	592	crit_exit();
	593	zfree(mapentzone, entry);
	594	crit_enter();
	595	}
	596	crit_exit();
	597	}
	598
	599	/*
	600	* vm_map_entry_kreserve:
	601	*
	602	* Reserve map entry structures for use in kernel_map itself. These
	603	* entries have ALREADY been reserved on a per-cpu basis when the map
	604	* was inited. This function is used by zalloc() to avoid a recursion
	605	* when zalloc() itself needs to allocate additional kernel memory.
	606	*
	607	* This function works like the normal reserve but does not load the
	608	* vm_map_entry cache (because that would result in an infinite
	609	* recursion). Note that gd_vme_avail may go negative. This is expected.
	610	*
	611	* Any caller of this function must be sure to renormalize after
	612	* potentially eating entries to ensure that the reserve supply
	613	* remains intact.
	614	*/
	615	int
	616	vm_map_entry_kreserve(int count)
	617	{
	618	struct globaldata *gd = mycpu;
	619
	620	crit_enter();
	621	gd->gd_vme_avail -= count;
	622	crit_exit();
	623	KASSERT(gd->gd_vme_base != NULL, ("no reserved entries left, gd_vme_avail = %d\n", gd->gd_vme_avail));
	624	return(count);
	625	}
	626
	627	/*
	628	* vm_map_entry_krelease:
	629	*
	630	* Release previously reserved map entries for kernel_map. We do not
	631	* attempt to clean up like the normal release function as this would
	632	* cause an unnecessary (but probably not fatal) deep procedure call.
	633	*/
	634	void
	635	vm_map_entry_krelease(int count)
	636	{
	637	struct globaldata *gd = mycpu;
	638
	639	crit_enter();
	640	gd->gd_vme_avail += count;
	641	crit_exit();
	642	}
	643
	644	/*
	645	* vm_map_entry_create: [ internal use only ]
	646	*
	647	* Allocates a VM map entry for insertion. No entry fields are filled
	648	* in.
	649	*
	650	* This routine may be called from an interrupt thread but not a FAST
	651	* interrupt. This routine may recurse the map lock.
	652	*/
	653	static vm_map_entry_t
	654	vm_map_entry_create(vm_map_t map, int *countp)
	655	{
	656	struct globaldata *gd = mycpu;
	657	vm_map_entry_t entry;
	658
	659	KKASSERT(*countp > 0);
	660	--*countp;
	661	crit_enter();
	662	entry = gd->gd_vme_base;
	663	KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
	664	gd->gd_vme_base = entry->next;
	665	crit_exit();
	666	return(entry);
	667	}
	668
	669	/*
	670	* vm_map_entry_dispose: [ internal use only ]
	671	*
	672	* Dispose of a vm_map_entry that is no longer being referenced. This
	673	* function may be called from an interrupt.
	674	*/
	675	static void
	676	vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
	677	{
	678	struct globaldata *gd = mycpu;
	679
	680	KKASSERT(map->hint != entry);
	681	KKASSERT(map->first_free != entry);
	682
	683	++*countp;
	684	crit_enter();
	685	entry->next = gd->gd_vme_base;
	686	gd->gd_vme_base = entry;
	687	crit_exit();
	688	}
	689
	690
	691	/*
	692	* vm_map_entry_{un,}link:
	693	*
	694	* Insert/remove entries from maps.
	695	*/
	696	static __inline void
	697	vm_map_entry_link(vm_map_t map,
	698	vm_map_entry_t after_where,
	699	vm_map_entry_t entry)
	700	{
	701	map->nentries++;
	702	entry->prev = after_where;
	703	entry->next = after_where->next;
	704	entry->next->prev = entry;
	705	after_where->next = entry;
	706	if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
	707	panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
	708	}
	709
	710	static __inline void
	711	vm_map_entry_unlink(vm_map_t map,
	712	vm_map_entry_t entry)
	713	{
	714	vm_map_entry_t prev;
	715	vm_map_entry_t next;
	716
	717	if (entry->eflags & MAP_ENTRY_IN_TRANSITION)
	718	panic("vm_map_entry_unlink: attempt to mess with locked entry! %p", entry);
	719	prev = entry->prev;
	720	next = entry->next;
	721	next->prev = prev;
	722	prev->next = next;
	723	vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
	724	map->nentries--;
	725	}
	726
	727	/*
	728	* vm_map_lookup_entry: [ internal use only ]
	729	*
	730	* Finds the map entry containing (or
	731	* immediately preceding) the specified address
	732	* in the given map; the entry is returned
	733	* in the "entry" parameter. The boolean
	734	* result indicates whether the address is
	735	* actually contained in the map.
	736	*/
	737	boolean_t
	738	vm_map_lookup_entry(vm_map_t map, vm_offset_t address,
	739	vm_map_entry_t entry / OUT */)
	740	{
	741	vm_map_entry_t tmp;
	742	vm_map_entry_t last;
	743
	744	#if 0
	745	/*
	746	* XXX TEMPORARILY DISABLED. For some reason our attempt to revive
	747	* the hint code with the red-black lookup meets with system crashes
	748	* and lockups. We do not yet know why.
	749	*
	750	* It is possible that the problem is related to the setting
	751	* of the hint during map_entry deletion, in the code specified
	752	* at the GGG comment later on in this file.
	753	*/
	754	/*
	755	* Quickly check the cached hint, there's a good chance of a match.
	756	*/
	757	if (map->hint != &map->header) {
	758	tmp = map->hint;
	759	if (address >= tmp->start && address < tmp->end) {
	760	*entry = tmp;
	761	return(TRUE);
	762	}
	763	}
	764	#endif
	765
	766	/*
	767	* Locate the record from the top of the tree. 'last' tracks the
	768	* closest prior record and is returned if no match is found, which
	769	* in binary tree terms means tracking the most recent right-branch
	770	* taken. If there is no prior record, &map->header is returned.
	771	*/
	772	last = &map->header;
	773	tmp = RB_ROOT(&map->rb_root);
	774
	775	while (tmp) {
	776	if (address >= tmp->start) {
	777	if (address < tmp->end) {
	778	*entry = tmp;
	779	map->hint = tmp;
	780	return(TRUE);
	781	}
	782	last = tmp;
	783	tmp = RB_RIGHT(tmp, rb_entry);
	784	} else {
	785	tmp = RB_LEFT(tmp, rb_entry);
	786	}
	787	}
	788	*entry = last;
	789	return (FALSE);
	790	}
	791
	792	/*
	793	* vm_map_insert:
	794	*
	795	* Inserts the given whole VM object into the target
	796	* map at the specified address range. The object's
	797	* size should match that of the address range.
	798	*
	799	* Requires that the map be locked, and leaves it so. Requires that
	800	* sufficient vm_map_entry structures have been reserved and tracks
	801	* the use via countp.
	802	*
	803	* If object is non-NULL, ref count must be bumped by caller
	804	* prior to making call to account for the new entry.
	805	*/
	806	int
	807	vm_map_insert(vm_map_t map, int *countp,
	808	vm_object_t object, vm_ooffset_t offset,
	809	vm_offset_t start, vm_offset_t end,
	810	vm_maptype_t maptype,
	811	vm_prot_t prot, vm_prot_t max,
	812	int cow)
	813	{
	814	vm_map_entry_t new_entry;
	815	vm_map_entry_t prev_entry;
	816	vm_map_entry_t temp_entry;
	817	vm_eflags_t protoeflags;
	818
	819	/*
	820	* Check that the start and end points are not bogus.
	821	*/
	822
	823	if ((start < map->min_offset) \|\| (end > map->max_offset) \|\|
	824	(start >= end))
	825	return (KERN_INVALID_ADDRESS);
	826
	827	/*
	828	* Find the entry prior to the proposed starting address; if it's part
	829	* of an existing entry, this range is bogus.
	830	*/
	831
	832	if (vm_map_lookup_entry(map, start, &temp_entry))
	833	return (KERN_NO_SPACE);
	834
	835	prev_entry = temp_entry;
	836
	837	/*
	838	* Assert that the next entry doesn't overlap the end point.
	839	*/
	840
	841	if ((prev_entry->next != &map->header) &&
	842	(prev_entry->next->start < end))
	843	return (KERN_NO_SPACE);
	844
	845	protoeflags = 0;
	846
	847	if (cow & MAP_COPY_ON_WRITE)
	848	protoeflags \|= MAP_ENTRY_COW\|MAP_ENTRY_NEEDS_COPY;
	849
	850	if (cow & MAP_NOFAULT) {
	851	protoeflags \|= MAP_ENTRY_NOFAULT;
	852
	853	KASSERT(object == NULL,
	854	("vm_map_insert: paradoxical MAP_NOFAULT request"));
	855	}
	856	if (cow & MAP_DISABLE_SYNCER)
	857	protoeflags \|= MAP_ENTRY_NOSYNC;
	858	if (cow & MAP_DISABLE_COREDUMP)
	859	protoeflags \|= MAP_ENTRY_NOCOREDUMP;
	860	if (cow & MAP_IS_STACK)
	861	protoeflags \|= MAP_ENTRY_STACK;
	862
	863	if (object) {
	864	/*
	865	* When object is non-NULL, it could be shared with another
	866	* process. We have to set or clear OBJ_ONEMAPPING
	867	* appropriately.
	868	*/
	869	if ((object->ref_count > 1) \|\| (object->shadow_count != 0)) {
	870	vm_object_clear_flag(object, OBJ_ONEMAPPING);
	871	}
	872	}
	873	else if ((prev_entry != &map->header) &&
	874	(prev_entry->eflags == protoeflags) &&
	875	(prev_entry->end == start) &&
	876	(prev_entry->wired_count == 0) &&
	877	prev_entry->maptype == maptype &&
	878	((prev_entry->object.vm_object == NULL) \|\|
	879	vm_object_coalesce(prev_entry->object.vm_object,
	880	OFF_TO_IDX(prev_entry->offset),
	881	(vm_size_t)(prev_entry->end - prev_entry->start),
	882	(vm_size_t)(end - prev_entry->end)))) {
	883	/*
	884	* We were able to extend the object. Determine if we
	885	* can extend the previous map entry to include the
	886	* new range as well.
	887	*/
	888	if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
	889	(prev_entry->protection == prot) &&
	890	(prev_entry->max_protection == max)) {
	891	map->size += (end - prev_entry->end);
	892	prev_entry->end = end;
	893	vm_map_simplify_entry(map, prev_entry, countp);
	894	return (KERN_SUCCESS);
	895	}
	896
	897	/*
	898	* If we can extend the object but cannot extend the
	899	* map entry, we have to create a new map entry. We
	900	* must bump the ref count on the extended object to
	901	* account for it. object may be NULL.
	902	*/
	903	object = prev_entry->object.vm_object;
	904	offset = prev_entry->offset +
	905	(prev_entry->end - prev_entry->start);
	906	vm_object_reference(object);
	907	}
	908
	909	/*
	910	* NOTE: if conditionals fail, object can be NULL here. This occurs
	911	* in things like the buffer map where we manage kva but do not manage
	912	* backing objects.
	913	*/
	914
	915	/*
	916	* Create a new entry
	917	*/
	918
	919	new_entry = vm_map_entry_create(map, countp);
	920	new_entry->start = start;
	921	new_entry->end = end;
	922
	923	new_entry->maptype = maptype;
	924	new_entry->eflags = protoeflags;
	925	new_entry->object.vm_object = object;
	926	new_entry->offset = offset;
	927	new_entry->aux.master_pde = 0;
	928
	929	new_entry->inheritance = VM_INHERIT_DEFAULT;
	930	new_entry->protection = prot;
	931	new_entry->max_protection = max;
	932	new_entry->wired_count = 0;
	933
	934	/*
	935	* Insert the new entry into the list
	936	*/
	937
	938	vm_map_entry_link(map, prev_entry, new_entry);
	939	map->size += new_entry->end - new_entry->start;
	940
	941	/*
	942	* Update the free space hint. Entries cannot overlap.
	943	* An exact comparison is needed to avoid matching
	944	* against the map->header.
	945	*/
	946	if ((map->first_free == prev_entry) &&
	947	(prev_entry->end == new_entry->start)) {
	948	map->first_free = new_entry;
	949	}
	950
	951	#if 0
	952	/*
	953	* Temporarily removed to avoid MAP_STACK panic, due to
	954	* MAP_STACK being a huge hack. Will be added back in
	955	* when MAP_STACK (and the user stack mapping) is fixed.
	956	*/
	957	/*
	958	* It may be possible to simplify the entry
	959	*/
	960	vm_map_simplify_entry(map, new_entry, countp);
	961	#endif
	962
	963	/*
	964	* Try to pre-populate the page table. Mappings governed by virtual
	965	* page tables cannot be prepopulated without a lot of work, so
	966	* don't try.
	967	*/
	968	if ((cow & (MAP_PREFAULT\|MAP_PREFAULT_PARTIAL)) &&
	969	maptype != VM_MAPTYPE_VPAGETABLE) {
	970	pmap_object_init_pt(map->pmap, start, prot,
	971	object, OFF_TO_IDX(offset), end - start,
	972	cow & MAP_PREFAULT_PARTIAL);
	973	}
	974
	975	return (KERN_SUCCESS);
	976	}
	977
	978	/*
	979	* Find sufficient space for `length' bytes in the given map, starting at
	980	* `start'. The map must be locked. Returns 0 on success, 1 on no space.
	981	*
	982	* This function will return an arbitrarily aligned pointer. If no
	983	* particular alignment is required you should pass align as 1. Note that
	984	* the map may return PAGE_SIZE aligned pointers if all the lengths used in
	985	* the map are a multiple of PAGE_SIZE, even if you pass a smaller align
	986	* argument.
	987	*
	988	* 'align' should be a power of 2 but is not required to be.
	989	*/
	990	int
	991	vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
	992	vm_size_t align, int flags, vm_offset_t *addr)
	993	{
	994	vm_map_entry_t entry, next;
	995	vm_offset_t end;
	996	vm_offset_t align_mask;
	997
	998	if (start < map->min_offset)
	999	start = map->min_offset;
	1000	if (start > map->max_offset)
	1001	return (1);
	1002
	1003	/*
	1004	* If the alignment is not a power of 2 we will have to use
	1005	* a mod/division, set align_mask to a special value.
	1006	*/
	1007	if ((align \| (align - 1)) + 1 != (align << 1))
	1008	align_mask = (vm_offset_t)-1;
	1009	else
	1010	align_mask = align - 1;
	1011
	1012	retry:
	1013	/*
	1014	* Look for the first possible address; if there's already something
	1015	* at this address, we have to start after it.
	1016	*/
	1017	if (start == map->min_offset) {
	1018	if ((entry = map->first_free) != &map->header)
	1019	start = entry->end;
	1020	} else {
	1021	vm_map_entry_t tmp;
	1022
	1023	if (vm_map_lookup_entry(map, start, &tmp))
	1024	start = tmp->end;
	1025	entry = tmp;
	1026	}
	1027
	1028	/*
	1029	* Look through the rest of the map, trying to fit a new region in the
	1030	* gap between existing regions, or after the very last region.
	1031	*/
	1032	for (;; start = (entry = next)->end) {
	1033	/*
	1034	* Adjust the proposed start by the requested alignment,
	1035	* be sure that we didn't wrap the address.
	1036	*/
	1037	if (align_mask == (vm_offset_t)-1)
	1038	end = ((start + align - 1) / align) * align;
	1039	else
	1040	end = (start + align_mask) & ~align_mask;
	1041	if (end < start)
	1042	return (1);
	1043	start = end;
	1044	/*
	1045	* Find the end of the proposed new region. Be sure we didn't
	1046	* go beyond the end of the map, or wrap around the address.
	1047	* Then check to see if this is the last entry or if the
	1048	* proposed end fits in the gap between this and the next
	1049	* entry.
	1050	*/
	1051	end = start + length;
	1052	if (end > map->max_offset \|\| end < start)
	1053	return (1);
	1054	next = entry->next;
	1055
	1056	/*
	1057	* If the next entry's start address is beyond the desired
	1058	* end address we may have found a good entry.
	1059	*
	1060	* If the next entry is a stack mapping we do not map into
	1061	* the stack's reserved space.
	1062	*
	1063	* XXX continue to allow mapping into the stack's reserved
	1064	* space if doing a MAP_STACK mapping inside a MAP_STACK
	1065	* mapping, for backwards compatibility. But the caller
	1066	* really should use MAP_STACK \| MAP_TRYFIXED if they
	1067	* want to do that.
	1068	*/
	1069	if (next == &map->header)
	1070	break;
	1071	if (next->start >= end) {
	1072	if ((next->eflags & MAP_ENTRY_STACK) == 0)
	1073	break;
	1074	if (flags & MAP_STACK)
	1075	break;
	1076	if (next->start - next->aux.avail_ssize >= end)
	1077	break;
	1078	}
	1079	}
	1080	map->hint = entry;
	1081	if (map == &kernel_map) {
	1082	vm_offset_t ksize;
	1083	if ((ksize = round_page(start + length)) > kernel_vm_end) {
	1084	pmap_growkernel(ksize);
	1085	goto retry;
	1086	}
	1087	}
	1088	*addr = start;
	1089	return (0);
	1090	}
	1091
	1092	/*
	1093	* vm_map_find finds an unallocated region in the target address
	1094	* map with the given length. The search is defined to be
	1095	* first-fit from the specified address; the region found is
	1096	* returned in the same parameter.
	1097	*
	1098	* If object is non-NULL, ref count must be bumped by caller
	1099	* prior to making call to account for the new entry.
	1100	*/
	1101	int
	1102	vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
	1103	vm_offset_t *addr, vm_size_t length, vm_size_t align,
	1104	boolean_t fitit,
	1105	vm_maptype_t maptype,
	1106	vm_prot_t prot, vm_prot_t max,
	1107	int cow)
	1108	{
	1109	vm_offset_t start;
	1110	int result;
	1111	int count;
	1112
	1113	start = *addr;
	1114
	1115	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1116	vm_map_lock(map);
	1117	if (fitit) {
	1118	if (vm_map_findspace(map, start, length, align, 0, addr)) {
	1119	vm_map_unlock(map);
	1120	vm_map_entry_release(count);
	1121	return (KERN_NO_SPACE);
	1122	}
	1123	start = *addr;
	1124	}
	1125	result = vm_map_insert(map, &count, object, offset,
	1126	start, start + length,
	1127	maptype,
	1128	prot, max,
	1129	cow);
	1130	vm_map_unlock(map);
	1131	vm_map_entry_release(count);
	1132
	1133	return (result);
	1134	}
	1135
	1136	/*
	1137	* vm_map_simplify_entry:
	1138	*
	1139	* Simplify the given map entry by merging with either neighbor. This
	1140	* routine also has the ability to merge with both neighbors.
	1141	*
	1142	* The map must be locked.
	1143	*
	1144	* This routine guarentees that the passed entry remains valid (though
	1145	* possibly extended). When merging, this routine may delete one or
	1146	* both neighbors. No action is taken on entries which have their
	1147	* in-transition flag set.
	1148	*/
	1149	void
	1150	vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
	1151	{
	1152	vm_map_entry_t next, prev;
	1153	vm_size_t prevsize, esize;
	1154
	1155	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
	1156	++mycpu->gd_cnt.v_intrans_coll;
	1157	return;
	1158	}
	1159
	1160	if (entry->maptype == VM_MAPTYPE_SUBMAP)
	1161	return;
	1162
	1163	prev = entry->prev;
	1164	if (prev != &map->header) {
	1165	prevsize = prev->end - prev->start;
	1166	if ( (prev->end == entry->start) &&
	1167	(prev->maptype == entry->maptype) &&
	1168	(prev->object.vm_object == entry->object.vm_object) &&
	1169	(!prev->object.vm_object \|\|
	1170	(prev->offset + prevsize == entry->offset)) &&
	1171	(prev->eflags == entry->eflags) &&
	1172	(prev->protection == entry->protection) &&
	1173	(prev->max_protection == entry->max_protection) &&
	1174	(prev->inheritance == entry->inheritance) &&
	1175	(prev->wired_count == entry->wired_count)) {
	1176	if (map->first_free == prev)
	1177	map->first_free = entry;
	1178	if (map->hint == prev)
	1179	map->hint = entry;
	1180	vm_map_entry_unlink(map, prev);
	1181	entry->start = prev->start;
	1182	entry->offset = prev->offset;
	1183	if (prev->object.vm_object)
	1184	vm_object_deallocate(prev->object.vm_object);
	1185	vm_map_entry_dispose(map, prev, countp);
	1186	}
	1187	}
	1188
	1189	next = entry->next;
	1190	if (next != &map->header) {
	1191	esize = entry->end - entry->start;
	1192	if ((entry->end == next->start) &&
	1193	(next->maptype == entry->maptype) &&
	1194	(next->object.vm_object == entry->object.vm_object) &&
	1195	(!entry->object.vm_object \|\|
	1196	(entry->offset + esize == next->offset)) &&
	1197	(next->eflags == entry->eflags) &&
	1198	(next->protection == entry->protection) &&
	1199	(next->max_protection == entry->max_protection) &&
	1200	(next->inheritance == entry->inheritance) &&
	1201	(next->wired_count == entry->wired_count)) {
	1202	if (map->first_free == next)
	1203	map->first_free = entry;
	1204	if (map->hint == next)
	1205	map->hint = entry;
	1206	vm_map_entry_unlink(map, next);
	1207	entry->end = next->end;
	1208	if (next->object.vm_object)
	1209	vm_object_deallocate(next->object.vm_object);
	1210	vm_map_entry_dispose(map, next, countp);
	1211	}
	1212	}
	1213	}
	1214	/*
	1215	* vm_map_clip_start: [ internal use only ]
	1216	*
	1217	* Asserts that the given entry begins at or after
	1218	* the specified address; if necessary,
	1219	* it splits the entry into two.
	1220	*/
	1221	#define vm_map_clip_start(map, entry, startaddr, countp) \
	1222	{ \
	1223	if (startaddr > entry->start) \
	1224	_vm_map_clip_start(map, entry, startaddr, countp); \
	1225	}
	1226
	1227	/*
	1228	* This routine is called only when it is known that
	1229	* the entry must be split.
	1230	*/
	1231	static void
	1232	_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start, int *countp)
	1233	{
	1234	vm_map_entry_t new_entry;
	1235
	1236	/*
	1237	* Split off the front portion -- note that we must insert the new
	1238	* entry BEFORE this one, so that this entry has the specified
	1239	* starting address.
	1240	*/
	1241
	1242	vm_map_simplify_entry(map, entry, countp);
	1243
	1244	/*
	1245	* If there is no object backing this entry, we might as well create
	1246	* one now. If we defer it, an object can get created after the map
	1247	* is clipped, and individual objects will be created for the split-up
	1248	* map. This is a bit of a hack, but is also about the best place to
	1249	* put this improvement.
	1250	*/
	1251	if (entry->object.vm_object == NULL && !map->system_map) {
	1252	vm_map_entry_allocate_object(entry);
	1253	}
	1254
	1255	new_entry = vm_map_entry_create(map, countp);
	1256	new_entry = entry;
	1257
	1258	new_entry->end = start;
	1259	entry->offset += (start - entry->start);
	1260	entry->start = start;
	1261
	1262	vm_map_entry_link(map, entry->prev, new_entry);
	1263
	1264	switch(entry->maptype) {
	1265	case VM_MAPTYPE_NORMAL:
	1266	case VM_MAPTYPE_VPAGETABLE:
	1267	vm_object_reference(new_entry->object.vm_object);
	1268	break;
	1269	default:
	1270	break;
	1271	}
	1272	}
	1273
	1274	/*
	1275	* vm_map_clip_end: [ internal use only ]
	1276	*
	1277	* Asserts that the given entry ends at or before
	1278	* the specified address; if necessary,
	1279	* it splits the entry into two.
	1280	*/
	1281
	1282	#define vm_map_clip_end(map, entry, endaddr, countp) \
	1283	{ \
	1284	if (endaddr < entry->end) \
	1285	_vm_map_clip_end(map, entry, endaddr, countp); \
	1286	}
	1287
	1288	/*
	1289	* This routine is called only when it is known that
	1290	* the entry must be split.
	1291	*/
	1292	static void
	1293	_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end, int *countp)
	1294	{
	1295	vm_map_entry_t new_entry;
	1296
	1297	/*
	1298	* If there is no object backing this entry, we might as well create
	1299	* one now. If we defer it, an object can get created after the map
	1300	* is clipped, and individual objects will be created for the split-up
	1301	* map. This is a bit of a hack, but is also about the best place to
	1302	* put this improvement.
	1303	*/
	1304
	1305	if (entry->object.vm_object == NULL && !map->system_map) {
	1306	vm_map_entry_allocate_object(entry);
	1307	}
	1308
	1309	/*
	1310	* Create a new entry and insert it AFTER the specified entry
	1311	*/
	1312
	1313	new_entry = vm_map_entry_create(map, countp);
	1314	new_entry = entry;
	1315
	1316	new_entry->start = entry->end = end;
	1317	new_entry->offset += (end - entry->start);
	1318
	1319	vm_map_entry_link(map, entry, new_entry);
	1320
	1321	switch(entry->maptype) {
	1322	case VM_MAPTYPE_NORMAL:
	1323	case VM_MAPTYPE_VPAGETABLE:
	1324	vm_object_reference(new_entry->object.vm_object);
	1325	break;
	1326	default:
	1327	break;
	1328	}
	1329	}
	1330
	1331	/*
	1332	* VM_MAP_RANGE_CHECK: [ internal use only ]
	1333	*
	1334	* Asserts that the starting and ending region
	1335	* addresses fall within the valid range of the map.
	1336	*/
	1337	#define VM_MAP_RANGE_CHECK(map, start, end) \
	1338	{ \
	1339	if (start < vm_map_min(map)) \
	1340	start = vm_map_min(map); \
	1341	if (end > vm_map_max(map)) \
	1342	end = vm_map_max(map); \
	1343	if (start > end) \
	1344	start = end; \
	1345	}
	1346
	1347	/*
	1348	* vm_map_transition_wait: [ kernel use only ]
	1349	*
	1350	* Used to block when an in-transition collison occurs. The map
	1351	* is unlocked for the sleep and relocked before the return.
	1352	*/
	1353	static
	1354	void
	1355	vm_map_transition_wait(vm_map_t map)
	1356	{
	1357	vm_map_unlock(map);
	1358	tsleep(map, 0, "vment", 0);
	1359	vm_map_lock(map);
	1360	}
	1361
	1362	/*
	1363	* CLIP_CHECK_BACK
	1364	* CLIP_CHECK_FWD
	1365	*
	1366	* When we do blocking operations with the map lock held it is
	1367	* possible that a clip might have occured on our in-transit entry,
	1368	* requiring an adjustment to the entry in our loop. These macros
	1369	* help the pageable and clip_range code deal with the case. The
	1370	* conditional costs virtually nothing if no clipping has occured.
	1371	*/
	1372
	1373	#define CLIP_CHECK_BACK(entry, save_start) \
	1374	do { \
	1375	while (entry->start != save_start) { \
	1376	entry = entry->prev; \
	1377	KASSERT(entry != &map->header, ("bad entry clip")); \
	1378	} \
	1379	} while(0)
	1380
	1381	#define CLIP_CHECK_FWD(entry, save_end) \
	1382	do { \
	1383	while (entry->end != save_end) { \
	1384	entry = entry->next; \
	1385	KASSERT(entry != &map->header, ("bad entry clip")); \
	1386	} \
	1387	} while(0)
	1388
	1389
	1390	/*
	1391	* vm_map_clip_range: [ kernel use only ]
	1392	*
	1393	* Clip the specified range and return the base entry. The
	1394	* range may cover several entries starting at the returned base
	1395	* and the first and last entry in the covering sequence will be
	1396	* properly clipped to the requested start and end address.
	1397	*
	1398	* If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
	1399	* flag.
	1400	*
	1401	* The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
	1402	* covered by the requested range.
	1403	*
	1404	* The map must be exclusively locked on entry and will remain locked
	1405	* on return. If no range exists or the range contains holes and you
	1406	* specified that no holes were allowed, NULL will be returned. This
	1407	* routine may temporarily unlock the map in order avoid a deadlock when
	1408	* sleeping.
	1409	*/
	1410	static
	1411	vm_map_entry_t
	1412	vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
	1413	int *countp, int flags)
	1414	{
	1415	vm_map_entry_t start_entry;
	1416	vm_map_entry_t entry;
	1417
	1418	/*
	1419	* Locate the entry and effect initial clipping. The in-transition
	1420	* case does not occur very often so do not try to optimize it.
	1421	*/
	1422	again:
	1423	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
	1424	return (NULL);
	1425	entry = start_entry;
	1426	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
	1427	entry->eflags \|= MAP_ENTRY_NEEDS_WAKEUP;
	1428	++mycpu->gd_cnt.v_intrans_coll;
	1429	++mycpu->gd_cnt.v_intrans_wait;
	1430	vm_map_transition_wait(map);
	1431	/*
	1432	* entry and/or start_entry may have been clipped while
	1433	* we slept, or may have gone away entirely. We have
	1434	* to restart from the lookup.
	1435	*/
	1436	goto again;
	1437	}
	1438	/*
	1439	* Since we hold an exclusive map lock we do not have to restart
	1440	* after clipping, even though clipping may block in zalloc.
	1441	*/
	1442	vm_map_clip_start(map, entry, start, countp);
	1443	vm_map_clip_end(map, entry, end, countp);
	1444	entry->eflags \|= MAP_ENTRY_IN_TRANSITION;
	1445
	1446	/*
	1447	* Scan entries covered by the range. When working on the next
	1448	* entry a restart need only re-loop on the current entry which
	1449	* we have already locked, since 'next' may have changed. Also,
	1450	* even though entry is safe, it may have been clipped so we
	1451	* have to iterate forwards through the clip after sleeping.
	1452	*/
	1453	while (entry->next != &map->header && entry->next->start < end) {
	1454	vm_map_entry_t next = entry->next;
	1455
	1456	if (flags & MAP_CLIP_NO_HOLES) {
	1457	if (next->start > entry->end) {
	1458	vm_map_unclip_range(map, start_entry,
	1459	start, entry->end, countp, flags);
	1460	return(NULL);
	1461	}
	1462	}
	1463
	1464	if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
	1465	vm_offset_t save_end = entry->end;
	1466	next->eflags \|= MAP_ENTRY_NEEDS_WAKEUP;
	1467	++mycpu->gd_cnt.v_intrans_coll;
	1468	++mycpu->gd_cnt.v_intrans_wait;
	1469	vm_map_transition_wait(map);
	1470
	1471	/*
	1472	* clips might have occured while we blocked.
	1473	*/
	1474	CLIP_CHECK_FWD(entry, save_end);
	1475	CLIP_CHECK_BACK(start_entry, start);
	1476	continue;
	1477	}
	1478	/*
	1479	* No restart necessary even though clip_end may block, we
	1480	* are holding the map lock.
	1481	*/
	1482	vm_map_clip_end(map, next, end, countp);
	1483	next->eflags \|= MAP_ENTRY_IN_TRANSITION;
	1484	entry = next;
	1485	}
	1486	if (flags & MAP_CLIP_NO_HOLES) {
	1487	if (entry->end != end) {
	1488	vm_map_unclip_range(map, start_entry,
	1489	start, entry->end, countp, flags);
	1490	return(NULL);
	1491	}
	1492	}
	1493	return(start_entry);
	1494	}
	1495
	1496	/*
	1497	* vm_map_unclip_range: [ kernel use only ]
	1498	*
	1499	* Undo the effect of vm_map_clip_range(). You should pass the same
	1500	* flags and the same range that you passed to vm_map_clip_range().
	1501	* This code will clear the in-transition flag on the entries and
	1502	* wake up anyone waiting. This code will also simplify the sequence
	1503	* and attempt to merge it with entries before and after the sequence.
	1504	*
	1505	* The map must be locked on entry and will remain locked on return.
	1506	*
	1507	* Note that you should also pass the start_entry returned by
	1508	* vm_map_clip_range(). However, if you block between the two calls
	1509	* with the map unlocked please be aware that the start_entry may
	1510	* have been clipped and you may need to scan it backwards to find
	1511	* the entry corresponding with the original start address. You are
	1512	* responsible for this, vm_map_unclip_range() expects the correct
	1513	* start_entry to be passed to it and will KASSERT otherwise.
	1514	*/
	1515	static
	1516	void
	1517	vm_map_unclip_range(
	1518	vm_map_t map,
	1519	vm_map_entry_t start_entry,
	1520	vm_offset_t start,
	1521	vm_offset_t end,
	1522	int *countp,
	1523	int flags)
	1524	{
	1525	vm_map_entry_t entry;
	1526
	1527	entry = start_entry;
	1528
	1529	KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
	1530	while (entry != &map->header && entry->start < end) {
	1531	KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("in-transition flag not set during unclip on: %p", entry));
	1532	KASSERT(entry->end <= end, ("unclip_range: tail wasn't clipped"));
	1533	entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
	1534	if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
	1535	entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
	1536	wakeup(map);
	1537	}
	1538	entry = entry->next;
	1539	}
	1540
	1541	/*
	1542	* Simplification does not block so there is no restart case.
	1543	*/
	1544	entry = start_entry;
	1545	while (entry != &map->header && entry->start < end) {
	1546	vm_map_simplify_entry(map, entry, countp);
	1547	entry = entry->next;
	1548	}
	1549	}
	1550
	1551	/*
	1552	* vm_map_submap: [ kernel use only ]
	1553	*
	1554	* Mark the given range as handled by a subordinate map.
	1555	*
	1556	* This range must have been created with vm_map_find,
	1557	* and no other operations may have been performed on this
	1558	* range prior to calling vm_map_submap.
	1559	*
	1560	* Only a limited number of operations can be performed
	1561	* within this rage after calling vm_map_submap:
	1562	* vm_fault
	1563	* [Don't try vm_map_copy!]
	1564	*
	1565	* To remove a submapping, one must first remove the
	1566	* range from the superior map, and then destroy the
	1567	* submap (if desired). [Better yet, don't try it.]
	1568	*/
	1569	int
	1570	vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
	1571	{
	1572	vm_map_entry_t entry;
	1573	int result = KERN_INVALID_ARGUMENT;
	1574	int count;
	1575
	1576	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1577	vm_map_lock(map);
	1578
	1579	VM_MAP_RANGE_CHECK(map, start, end);
	1580
	1581	if (vm_map_lookup_entry(map, start, &entry)) {
	1582	vm_map_clip_start(map, entry, start, &count);
	1583	} else {
	1584	entry = entry->next;
	1585	}
	1586
	1587	vm_map_clip_end(map, entry, end, &count);
	1588
	1589	if ((entry->start == start) && (entry->end == end) &&
	1590	((entry->eflags & MAP_ENTRY_COW) == 0) &&
	1591	(entry->object.vm_object == NULL)) {
	1592	entry->object.sub_map = submap;
	1593	entry->maptype = VM_MAPTYPE_SUBMAP;
	1594	result = KERN_SUCCESS;
	1595	}
	1596	vm_map_unlock(map);
	1597	vm_map_entry_release(count);
	1598
	1599	return (result);
	1600	}
	1601
	1602	/*
	1603	* vm_map_protect:
	1604	*
	1605	* Sets the protection of the specified address region in the target map.
	1606	* If "set_max" is specified, the maximum protection is to be set;
	1607	* otherwise, only the current protection is affected.
	1608	*
	1609	* The protection is not applicable to submaps, but is applicable to normal
	1610	* maps and maps governed by virtual page tables. For example, when operating
	1611	* on a virtual page table our protection basically controls how COW occurs
	1612	* on the backing object, whereas the virtual page table abstraction itself
	1613	* is an abstraction for userland.
	1614	*/
	1615	int
	1616	vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
	1617	vm_prot_t new_prot, boolean_t set_max)
	1618	{
	1619	vm_map_entry_t current;
	1620	vm_map_entry_t entry;
	1621	int count;
	1622
	1623	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1624	vm_map_lock(map);
	1625
	1626	VM_MAP_RANGE_CHECK(map, start, end);
	1627
	1628	if (vm_map_lookup_entry(map, start, &entry)) {
	1629	vm_map_clip_start(map, entry, start, &count);
	1630	} else {
	1631	entry = entry->next;
	1632	}
	1633
	1634	/*
	1635	* Make a first pass to check for protection violations.
	1636	*/
	1637	current = entry;
	1638	while ((current != &map->header) && (current->start < end)) {
	1639	if (current->maptype == VM_MAPTYPE_SUBMAP) {
	1640	vm_map_unlock(map);
	1641	vm_map_entry_release(count);
	1642	return (KERN_INVALID_ARGUMENT);
	1643	}
	1644	if ((new_prot & current->max_protection) != new_prot) {
	1645	vm_map_unlock(map);
	1646	vm_map_entry_release(count);
	1647	return (KERN_PROTECTION_FAILURE);
	1648	}
	1649	current = current->next;
	1650	}
	1651
	1652	/*
	1653	* Go back and fix up protections. [Note that clipping is not
	1654	* necessary the second time.]
	1655	*/
	1656	current = entry;
	1657
	1658	while ((current != &map->header) && (current->start < end)) {
	1659	vm_prot_t old_prot;
	1660
	1661	vm_map_clip_end(map, current, end, &count);
	1662
	1663	old_prot = current->protection;
	1664	if (set_max) {
	1665	current->protection =
	1666	(current->max_protection = new_prot) &
	1667	old_prot;
	1668	} else {
	1669	current->protection = new_prot;
	1670	}
	1671
	1672	/*
	1673	* Update physical map if necessary. Worry about copy-on-write
	1674	* here -- CHECK THIS XXX
	1675	*/
	1676
	1677	if (current->protection != old_prot) {
	1678	#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
	1679	VM_PROT_ALL)
	1680
	1681	pmap_protect(map->pmap, current->start,
	1682	current->end,
	1683	current->protection & MASK(current));
	1684	#undef MASK
	1685	}
	1686
	1687	vm_map_simplify_entry(map, current, &count);
	1688
	1689	current = current->next;
	1690	}
	1691
	1692	vm_map_unlock(map);
	1693	vm_map_entry_release(count);
	1694	return (KERN_SUCCESS);
	1695	}
	1696
	1697	/*
	1698	* vm_map_madvise:
	1699	*
	1700	* This routine traverses a processes map handling the madvise
	1701	* system call. Advisories are classified as either those effecting
	1702	* the vm_map_entry structure, or those effecting the underlying
	1703	* objects.
	1704	*
	1705	* The <value> argument is used for extended madvise calls.
	1706	*/
	1707	int
	1708	vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
	1709	int behav, off_t value)
	1710	{
	1711	vm_map_entry_t current, entry;
	1712	int modify_map = 0;
	1713	int error = 0;
	1714	int count;
	1715
	1716	/*
	1717	* Some madvise calls directly modify the vm_map_entry, in which case
	1718	* we need to use an exclusive lock on the map and we need to perform
	1719	* various clipping operations. Otherwise we only need a read-lock
	1720	* on the map.
	1721	*/
	1722
	1723	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1724
	1725	switch(behav) {
	1726	case MADV_NORMAL:
	1727	case MADV_SEQUENTIAL:
	1728	case MADV_RANDOM:
	1729	case MADV_NOSYNC:
	1730	case MADV_AUTOSYNC:
	1731	case MADV_NOCORE:
	1732	case MADV_CORE:
	1733	case MADV_SETMAP:
	1734	case MADV_INVAL:
	1735	modify_map = 1;
	1736	vm_map_lock(map);
	1737	break;
	1738	case MADV_WILLNEED:
	1739	case MADV_DONTNEED:
	1740	case MADV_FREE:
	1741	vm_map_lock_read(map);
	1742	break;
	1743	default:
	1744	vm_map_entry_release(count);
	1745	return (EINVAL);
	1746	}
	1747
	1748	/*
	1749	* Locate starting entry and clip if necessary.
	1750	*/
	1751
	1752	VM_MAP_RANGE_CHECK(map, start, end);
	1753
	1754	if (vm_map_lookup_entry(map, start, &entry)) {
	1755	if (modify_map)
	1756	vm_map_clip_start(map, entry, start, &count);
	1757	} else {
	1758	entry = entry->next;
	1759	}
	1760
	1761	if (modify_map) {
	1762	/*
	1763	* madvise behaviors that are implemented in the vm_map_entry.
	1764	*
	1765	* We clip the vm_map_entry so that behavioral changes are
	1766	* limited to the specified address range.
	1767	*/
	1768	for (current = entry;
	1769	(current != &map->header) && (current->start < end);
	1770	current = current->next
	1771	) {
	1772	if (current->maptype == VM_MAPTYPE_SUBMAP)
	1773	continue;
	1774
	1775	vm_map_clip_end(map, current, end, &count);
	1776
	1777	switch (behav) {
	1778	case MADV_NORMAL:
	1779	vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
	1780	break;
	1781	case MADV_SEQUENTIAL:
	1782	vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
	1783	break;
	1784	case MADV_RANDOM:
	1785	vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
	1786	break;
	1787	case MADV_NOSYNC:
	1788	current->eflags \|= MAP_ENTRY_NOSYNC;
	1789	break;
	1790	case MADV_AUTOSYNC:
	1791	current->eflags &= ~MAP_ENTRY_NOSYNC;
	1792	break;
	1793	case MADV_NOCORE:
	1794	current->eflags \|= MAP_ENTRY_NOCOREDUMP;
	1795	break;
	1796	case MADV_CORE:
	1797	current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
	1798	break;
	1799	case MADV_INVAL:
	1800	/*
	1801	* Invalidate the related pmap entries, used
	1802	* to flush portions of the real kernel's
	1803	* pmap when the caller has removed or
	1804	* modified existing mappings in a virtual
	1805	* page table.
	1806	*/
	1807	pmap_remove(map->pmap,
	1808	current->start, current->end);
	1809	break;
	1810	case MADV_SETMAP:
	1811	/*
	1812	* Set the page directory page for a map
	1813	* governed by a virtual page table. Mark
	1814	* the entry as being governed by a virtual
	1815	* page table if it is not.
	1816	*
	1817	* XXX the page directory page is stored
	1818	* in the avail_ssize field if the map_entry.
	1819	*
	1820	* XXX the map simplification code does not
	1821	* compare this field so weird things may
	1822	* happen if you do not apply this function
	1823	* to the entire mapping governed by the
	1824	* virtual page table.
	1825	*/
	1826	if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
	1827	error = EINVAL;
	1828	break;
	1829	}
	1830	current->aux.master_pde = value;
	1831	pmap_remove(map->pmap,
	1832	current->start, current->end);
	1833	break;
	1834	default:
	1835	error = EINVAL;
	1836	break;
	1837	}
	1838	vm_map_simplify_entry(map, current, &count);
	1839	}
	1840	vm_map_unlock(map);
	1841	} else {
	1842	vm_pindex_t pindex;
	1843	int count;
	1844
	1845	/*
	1846	* madvise behaviors that are implemented in the underlying
	1847	* vm_object.
	1848	*
	1849	* Since we don't clip the vm_map_entry, we have to clip
	1850	* the vm_object pindex and count.
	1851	*
	1852	* NOTE! We currently do not support these functions on
	1853	* virtual page tables.
	1854	*/
	1855	for (current = entry;
	1856	(current != &map->header) && (current->start < end);
	1857	current = current->next
	1858	) {
	1859	vm_offset_t useStart;
	1860
	1861	if (current->maptype != VM_MAPTYPE_NORMAL)
	1862	continue;
	1863
	1864	pindex = OFF_TO_IDX(current->offset);
	1865	count = atop(current->end - current->start);
	1866	useStart = current->start;
	1867
	1868	if (current->start < start) {
	1869	pindex += atop(start - current->start);
	1870	count -= atop(start - current->start);
	1871	useStart = start;
	1872	}
	1873	if (current->end > end)
	1874	count -= atop(current->end - end);
	1875
	1876	if (count <= 0)
	1877	continue;
	1878
	1879	vm_object_madvise(current->object.vm_object,
	1880	pindex, count, behav);
	1881
	1882	/*
	1883	* Try to populate the page table. Mappings governed
	1884	* by virtual page tables cannot be pre-populated
	1885	* without a lot of work so don't try.
	1886	*/
	1887	if (behav == MADV_WILLNEED &&
	1888	current->maptype != VM_MAPTYPE_VPAGETABLE) {
	1889	pmap_object_init_pt(
	1890	map->pmap,
	1891	useStart,
	1892	current->protection,
	1893	current->object.vm_object,
	1894	pindex,
	1895	(count << PAGE_SHIFT),
	1896	MAP_PREFAULT_MADVISE
	1897	);
	1898	}
	1899	}
	1900	vm_map_unlock_read(map);
	1901	}
	1902	vm_map_entry_release(count);
	1903	return(error);
	1904	}
	1905
	1906
	1907	/*
	1908	* vm_map_inherit:
	1909	*
	1910	* Sets the inheritance of the specified address
	1911	* range in the target map. Inheritance
	1912	* affects how the map will be shared with
	1913	* child maps at the time of vm_map_fork.
	1914	*/
	1915	int
	1916	vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
	1917	vm_inherit_t new_inheritance)
	1918	{
	1919	vm_map_entry_t entry;
	1920	vm_map_entry_t temp_entry;
	1921	int count;
	1922
	1923	switch (new_inheritance) {
	1924	case VM_INHERIT_NONE:
	1925	case VM_INHERIT_COPY:
	1926	case VM_INHERIT_SHARE:
	1927	break;
	1928	default:
	1929	return (KERN_INVALID_ARGUMENT);
	1930	}
	1931
	1932	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1933	vm_map_lock(map);
	1934
	1935	VM_MAP_RANGE_CHECK(map, start, end);
	1936
	1937	if (vm_map_lookup_entry(map, start, &temp_entry)) {
	1938	entry = temp_entry;
	1939	vm_map_clip_start(map, entry, start, &count);
	1940	} else
	1941	entry = temp_entry->next;
	1942
	1943	while ((entry != &map->header) && (entry->start < end)) {
	1944	vm_map_clip_end(map, entry, end, &count);
	1945
	1946	entry->inheritance = new_inheritance;
	1947
	1948	vm_map_simplify_entry(map, entry, &count);
	1949
	1950	entry = entry->next;
	1951	}
	1952	vm_map_unlock(map);
	1953	vm_map_entry_release(count);
	1954	return (KERN_SUCCESS);
	1955	}
	1956
	1957	/*
	1958	* Implement the semantics of mlock
	1959	*/
	1960	int
	1961	vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
	1962	boolean_t new_pageable)
	1963	{
	1964	vm_map_entry_t entry;
	1965	vm_map_entry_t start_entry;
	1966	vm_offset_t end;
	1967	int rv = KERN_SUCCESS;
	1968	int count;
	1969
	1970	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	1971	vm_map_lock(map);
	1972	VM_MAP_RANGE_CHECK(map, start, real_end);
	1973	end = real_end;
	1974
	1975	start_entry = vm_map_clip_range(map, start, end, &count, MAP_CLIP_NO_HOLES);
	1976	if (start_entry == NULL) {
	1977	vm_map_unlock(map);
	1978	vm_map_entry_release(count);
	1979	return (KERN_INVALID_ADDRESS);
	1980	}
	1981
	1982	if (new_pageable == 0) {
	1983	entry = start_entry;
	1984	while ((entry != &map->header) && (entry->start < end)) {
	1985	vm_offset_t save_start;
	1986	vm_offset_t save_end;
	1987
	1988	/*
	1989	* Already user wired or hard wired (trivial cases)
	1990	*/
	1991	if (entry->eflags & MAP_ENTRY_USER_WIRED) {
	1992	entry = entry->next;
	1993	continue;
	1994	}
	1995	if (entry->wired_count != 0) {
	1996	entry->wired_count++;
	1997	entry->eflags \|= MAP_ENTRY_USER_WIRED;
	1998	entry = entry->next;
	1999	continue;
	2000	}
	2001
	2002	/*
	2003	* A new wiring requires instantiation of appropriate
	2004	* management structures and the faulting in of the
	2005	* page.
	2006	*/
	2007	if (entry->maptype != VM_MAPTYPE_SUBMAP) {
	2008	int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
	2009	if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) {
	2010	vm_map_entry_shadow(entry);
	2011	} else if (entry->object.vm_object == NULL &&
	2012	!map->system_map) {
	2013	vm_map_entry_allocate_object(entry);
	2014	}
	2015	}
	2016	entry->wired_count++;
	2017	entry->eflags \|= MAP_ENTRY_USER_WIRED;
	2018
	2019	/*
	2020	* Now fault in the area. Note that vm_fault_wire()
	2021	* may release the map lock temporarily, it will be
	2022	* relocked on return. The in-transition
	2023	* flag protects the entries.
	2024	*/
	2025	save_start = entry->start;
	2026	save_end = entry->end;
	2027	rv = vm_fault_wire(map, entry, TRUE);
	2028	if (rv) {
	2029	CLIP_CHECK_BACK(entry, save_start);
	2030	for (;;) {
	2031	KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
	2032	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	2033	entry->wired_count = 0;
	2034	if (entry->end == save_end)
	2035	break;
	2036	entry = entry->next;
	2037	KASSERT(entry != &map->header, ("bad entry clip during backout"));
	2038	}
	2039	end = save_start; /* unwire the rest */
	2040	break;
	2041	}
	2042	/*
	2043	* note that even though the entry might have been
	2044	* clipped, the USER_WIRED flag we set prevents
	2045	* duplication so we do not have to do a
	2046	* clip check.
	2047	*/
	2048	entry = entry->next;
	2049	}
	2050
	2051	/*
	2052	* If we failed fall through to the unwiring section to
	2053	* unwire what we had wired so far. 'end' has already
	2054	* been adjusted.
	2055	*/
	2056	if (rv)
	2057	new_pageable = 1;
	2058
	2059	/*
	2060	* start_entry might have been clipped if we unlocked the
	2061	* map and blocked. No matter how clipped it has gotten
	2062	* there should be a fragment that is on our start boundary.
	2063	*/
	2064	CLIP_CHECK_BACK(start_entry, start);
	2065	}
	2066
	2067	/*
	2068	* Deal with the unwiring case.
	2069	*/
	2070	if (new_pageable) {
	2071	/*
	2072	* This is the unwiring case. We must first ensure that the
	2073	* range to be unwired is really wired down. We know there
	2074	* are no holes.
	2075	*/
	2076	entry = start_entry;
	2077	while ((entry != &map->header) && (entry->start < end)) {
	2078	if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
	2079	rv = KERN_INVALID_ARGUMENT;
	2080	goto done;
	2081	}
	2082	KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
	2083	entry = entry->next;
	2084	}
	2085
	2086	/*
	2087	* Now decrement the wiring count for each region. If a region
	2088	* becomes completely unwired, unwire its physical pages and
	2089	* mappings.
	2090	*/
	2091	/*
	2092	* The map entries are processed in a loop, checking to
	2093	* make sure the entry is wired and asserting it has a wired
	2094	* count. However, another loop was inserted more-or-less in
	2095	* the middle of the unwiring path. This loop picks up the
	2096	* "entry" loop variable from the first loop without first
	2097	* setting it to start_entry. Naturally, the second loop
	2098	* is never entered and the pages backing the entries are
	2099	* never unwired. This can lead to a leak of wired pages.
	2100	*/
	2101	entry = start_entry;
	2102	while ((entry != &map->header) && (entry->start < end)) {
	2103	KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
	2104	("expected USER_WIRED on entry %p", entry));
	2105	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	2106	entry->wired_count--;
	2107	if (entry->wired_count == 0)
	2108	vm_fault_unwire(map, entry);
	2109	entry = entry->next;
	2110	}
	2111	}
	2112	done:
	2113	vm_map_unclip_range(map, start_entry, start, real_end, &count,
	2114	MAP_CLIP_NO_HOLES);
	2115	map->timestamp++;
	2116	vm_map_unlock(map);
	2117	vm_map_entry_release(count);
	2118	return (rv);
	2119	}
	2120
	2121	/*
	2122	* vm_map_wire:
	2123	*
	2124	* Sets the pageability of the specified address
	2125	* range in the target map. Regions specified
	2126	* as not pageable require locked-down physical
	2127	* memory and physical page maps.
	2128	*
	2129	* The map must not be locked, but a reference
	2130	* must remain to the map throughout the call.
	2131	*
	2132	* This function may be called via the zalloc path and must properly
	2133	* reserve map entries for kernel_map.
		*
		* Arguments:
		*	map		map whose range is wired or unwired
		*	start, real_end	address range to operate on
		*	kmflags		KM_PAGEABLE selects unwiring (wiring when it
		*			is clear); KM_KRESERVE selects the
		*			kreserve/krelease variants for the map
		*			entry reservation
		*
		* Returns KERN_SUCCESS, KERN_INVALID_ADDRESS when
		* vm_map_clip_range() yields no entry, KERN_INVALID_ARGUMENT when
		* asked to unwire an entry whose wired_count is 0, or the non-zero
		* status of a failed vm_fault_wire() (after backing out the wiring
		* done so far).
	2134	*/
	2135	int
	2136	vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
	2137	{
	2138	vm_map_entry_t entry;
	2139	vm_map_entry_t start_entry;
	2140	vm_offset_t end;
	2141	int rv = KERN_SUCCESS;
	2142	int count;
	2143
	2144	if (kmflags & KM_KRESERVE)
	2145	count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
	2146	else
	2147	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	2148	vm_map_lock(map);
	2149	VM_MAP_RANGE_CHECK(map, start, real_end);
		/* 'end' may be pulled back on failure; real_end is kept for the unclip */
	2150	end = real_end;
	2151
	2152	start_entry = vm_map_clip_range(map, start, end, &count, MAP_CLIP_NO_HOLES);
	2153	if (start_entry == NULL) {
		/* the 'failure' label is past the unlock, so unlock here */
	2154	vm_map_unlock(map);
	2155	rv = KERN_INVALID_ADDRESS;
	2156	goto failure;
	2157	}
	2158	if ((kmflags & KM_PAGEABLE) == 0) {
	2159	/*
	2160	* Wiring.
	2161	*
	2162	* 1. Holding the write lock, we create any shadow or zero-fill
	2163	* objects that need to be created. Then we clip each map
	2164	* entry to the region to be wired and increment its wiring
	2165	* count. We create objects before clipping the map entries
	2166	* to avoid object proliferation.
	2167	*
	2168	* 2. We downgrade to a read lock, and call vm_fault_wire to
	2169	* fault in the pages for any newly wired area (wired_count is
	2170	* 1).
	2171	*
	2172	* Downgrading to a read lock for vm_fault_wire avoids a
	2173	* possible deadlock with another process that may have faulted
	2174	* on one of the pages to be wired (it would mark the page busy,
	2175	* blocking us, then in turn block on the map lock that we
	2176	* hold). Because of problems in the recursive lock package,
	2177	* we cannot upgrade to a write lock in vm_map_lookup. Thus,
	2178	* any actions that require the write lock must be done
	2179	* beforehand. Because we keep the read lock on the map, the
	2180	* copy-on-write status of the entries we modify here cannot
	2181	* change.
	2182	*/
	2183
		/*
		* Pass 1: bump wired_count on every entry in the range,
		* creating backing objects for entries that are newly wired.
		*/
	2184	entry = start_entry;
	2185	while ((entry != &map->header) && (entry->start < end)) {
	2186	/*
	2187	* Trivial case if the entry is already wired
	2188	*/
	2189	if (entry->wired_count) {
	2190	entry->wired_count++;
	2191	entry = entry->next;
	2192	continue;
	2193	}
	2194
	2195	/*
	2196	* The entry is being newly wired, we have to setup
	2197	* appropriate management structures. A shadow
	2198	* object is required for a copy-on-write region,
	2199	* or a normal object for a zero-fill region. We
	2200	* do not have to do this for entries that point to sub
	2201	* maps because we won't hold the lock on the sub map.
	2202	*/
	2203	if (entry->maptype != VM_MAPTYPE_SUBMAP) {
	2204	int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
	2205	if (copyflag &&
	2206	((entry->protection & VM_PROT_WRITE) != 0)) {
	2207	vm_map_entry_shadow(entry);
	2208	} else if (entry->object.vm_object == NULL &&
	2209	!map->system_map) {
	2210	vm_map_entry_allocate_object(entry);
	2211	}
	2212	}
	2213
	2214	entry->wired_count++;
	2215	entry = entry->next;
	2216	}
	2217
	2218	/*
	2219	* Pass 2.
	2220	*/
	2221
	2222	/*
	2223	* HACK HACK HACK HACK
	2224	*
	2225	* Unlock the map to avoid deadlocks. The in-transit flag
	2226	* protects us from most changes but note that
	2227	* clipping may still occur. To prevent clipping from
	2228	* occurring after the unlock, except for when we are
	2229	* blocking in vm_fault_wire, we must run in a critical
	2230	* section, otherwise our accesses to entry->start and
	2231	* entry->end could be corrupted. We have to enter the
	2232	* critical section prior to unlocking so start_entry does
	2233	* not change out from under us at the very beginning of the
	2234	* loop.
	2235	*
	2236	* HACK HACK HACK HACK
	2237	*/
	2238
	2239	crit_enter();
	2240
	2241	entry = start_entry;
	2242	while (entry != &map->header && entry->start < end) {
	2243	/*
	2244	* If vm_fault_wire fails for any page we need to undo
	2245	* what has been done. We decrement the wiring count
	2246	* for those pages which have not yet been wired (now)
	2247	* and unwire those that have (later).
	2248	*/
	2249	vm_offset_t save_start = entry->start;
	2250	vm_offset_t save_end = entry->end;
	2251
		/* only entries that pass 1 took from 0 to 1 are faulted in */
	2252	if (entry->wired_count == 1)
	2253	rv = vm_fault_wire(map, entry, FALSE);
	2254	if (rv) {
		/*
		* Back out the failing entry: return to the fragment that
		* starts at save_start and zero wired_count on every
		* fragment up to save_end.
		*/
	2255	CLIP_CHECK_BACK(entry, save_start);
	2256	for (;;) {
	2257	KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
	2258	entry->wired_count = 0;
	2259	if (entry->end == save_end)
	2260	break;
	2261	entry = entry->next;
	2262	KASSERT(entry != &map->header, ("bad entry clip during backout"));
	2263	}
		/*
		* NOTE(review): only entries below save_start are handed to
		* the unwiring code below; entries past save_end appear to
		* keep the count added in pass 1 unless something outside
		* this function drops it -- confirm.
		*/
	2264	end = save_start;
	2265	break;
	2266	}
	2267	CLIP_CHECK_FWD(entry, save_end);
	2268	entry = entry->next;
	2269	}
	2270	crit_exit();
	2271
	2272	/*
	2273	* If a failure occurred undo everything by falling through
	2274	* to the unwiring code. 'end' has already been adjusted
	2275	* appropriately.
	2276	*/
	2277	if (rv)
	2278	kmflags \|= KM_PAGEABLE;
	2279
	2280	/*
	2281	* start_entry is still IN_TRANSITION but may have been
	2282	* clipped since vm_fault_wire() unlocks and relocks the
	2283	* map. No matter how clipped it has gotten there should
	2284	* be a fragment that is on our start boundary.
	2285	*/
	2286	CLIP_CHECK_BACK(start_entry, start);
	2287	}
	2288
	2289	if (kmflags & KM_PAGEABLE) {
	2290	/*
	2291	* This is the unwiring case. We must first ensure that the
	2292	* range to be unwired is really wired down. We know there
	2293	* are no holes.
	2294	*/
	2295	entry = start_entry;
	2296	while ((entry != &map->header) && (entry->start < end)) {
	2297	if (entry->wired_count == 0) {
	2298	rv = KERN_INVALID_ARGUMENT;
	2299	goto done;
	2300	}
	2301	entry = entry->next;
	2302	}
	2303
	2304	/*
	2305	* Now decrement the wiring count for each region. If a region
	2306	* becomes completely unwired, unwire its physical pages and
	2307	* mappings.
	2308	*/
	2309	entry = start_entry;
	2310	while ((entry != &map->header) && (entry->start < end)) {
	2311	entry->wired_count--;
	2312	if (entry->wired_count == 0)
	2313	vm_fault_unwire(map, entry);
	2314	entry = entry->next;
	2315	}
	2316	}
	2317	done:
		/* the unclip always covers the full original range [start, real_end) */
	2318	vm_map_unclip_range(map, start_entry, start, real_end, &count,
	2319	MAP_CLIP_NO_HOLES);
	2320	map->timestamp++;
	2321	vm_map_unlock(map);
	2322	failure:
		/* release the reservation with the variant matching the one taken above */
	2323	if (kmflags & KM_KRESERVE)
	2324	vm_map_entry_krelease(count);
	2325	else
	2326	vm_map_entry_release(count);
	2327	return (rv);
	2328	}
	2329
	2330	/*
	2331	* vm_map_set_wired_quick()
	2332	*
	2333	* Mark a newly allocated address range as wired but do not fault in
	2334	* the pages. The caller is expected to load the pages into the object.
	2335	*
	2336	* The map must be locked on entry and will remain locked on return.
	2337	*/
	2338	void
	2339	vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size, int *countp)
	2340	{
	2341	vm_map_entry_t scan;
	2342	vm_map_entry_t entry;
	2343
	2344	entry = vm_map_clip_range(map, addr, addr + size, countp, MAP_CLIP_NO_HOLES);
	2345	for (scan = entry; scan != &map->header && scan->start < addr + size; scan = scan->next) {
	2346	KKASSERT(entry->wired_count == 0);
	2347	entry->wired_count = 1;
	2348	}
	2349	vm_map_unclip_range(map, entry, addr, addr + size, countp, MAP_CLIP_NO_HOLES);
	2350	}
	2351
	2352	/*
	2353	* vm_map_clean
	2354	*
	2355	* Push any dirty cached pages in the address range to their pager.
	2356	* If syncio is TRUE, dirty pages are written synchronously.
	2357	* If invalidate is TRUE, any cached pages are freed as well.
	2358	*
	2359	* This routine is called by sys_msync()
	2360	*
	2361	* Returns an error if any part of the specified range is not mapped.
		*
		* The map is read-locked for the duration of the call.
		*
		* Returns KERN_INVALID_ADDRESS when start is not mapped or the
		* range contains a hole, KERN_INVALID_ARGUMENT when an entry in
		* the range is a submap, and KERN_SUCCESS otherwise.
	2362	*/
	2363	int
	2364	vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
	2365	boolean_t syncio, boolean_t invalidate)
	2366	{
	2367	vm_map_entry_t current;
	2368	vm_map_entry_t entry;
	2369	vm_size_t size;
	2370	vm_object_t object;
	2371	vm_ooffset_t offset;
	2372
	2373	vm_map_lock_read(map);
	2374	VM_MAP_RANGE_CHECK(map, start, end);
	2375	if (!vm_map_lookup_entry(map, start, &entry)) {
	2376	vm_map_unlock_read(map);
	2377	return (KERN_INVALID_ADDRESS);
	2378	}
	2379	/*
	2380	* Make a first pass to check for holes.
	2381	*/
	2382	for (current = entry; current->start < end; current = current->next) {
	2383	if (current->maptype == VM_MAPTYPE_SUBMAP) {
	2384	vm_map_unlock_read(map);
	2385	return (KERN_INVALID_ARGUMENT);
	2386	}
		/*
		* The range extends past this entry, so the next entry must
		* exist and begin exactly where this one ends.
		*/
	2387	if (end > current->end &&
	2388	(current->next == &map->header \|\|
	2389	current->end != current->next->start)) {
	2390	vm_map_unlock_read(map);
	2391	return (KERN_INVALID_ADDRESS);
	2392	}
	2393	}
	2394
	2395	if (invalidate)
	2396	pmap_remove(vm_map_pmap(map), start, end);
	2397	/*
	2398	* Make a second pass, cleaning/uncaching pages from the indicated
	2399	* objects as we go.
	2400	*/
	2401	for (current = entry; current->start < end; current = current->next) {
		/*
		* offset/size describe the part of [start, end) covered by
		* this entry; start is advanced by size at the bottom of
		* the loop.
		*/
	2402	offset = current->offset + (start - current->start);
	2403	size = (end <= current->end ? end : current->end) - start;
		/*
		* NOTE(review): the first pass returns KERN_INVALID_ARGUMENT
		* for any submap entry in the range, so this branch looks
		* unreachable -- confirm before relying on it.
		*/
	2404	if (current->maptype == VM_MAPTYPE_SUBMAP) {
	2405	vm_map_t smap;
	2406	vm_map_entry_t tentry;
	2407	vm_size_t tsize;
	2408
	2409	smap = current->object.sub_map;
	2410	vm_map_lock_read(smap);
	2411	vm_map_lookup_entry(smap, offset, &tentry);
	2412	tsize = tentry->end - offset;
	2413	if (tsize < size)
	2414	size = tsize;
	2415	object = tentry->object.vm_object;
	2416	offset = tentry->offset + (offset - tentry->start);
	2417	vm_map_unlock_read(smap);
	2418	} else {
	2419	object = current->object.vm_object;
	2420	}
	2421	/*
	2422	* Note that there is absolutely no sense in writing out
	2423	* anonymous objects, so we track down the vnode object
	2424	* to write out.
	2425	* We invalidate (remove) all pages from the address space
	2426	* anyway, for semantic correctness.
	2427	*
	2428	* note: certain anonymous maps, such as MAP_NOSYNC maps,
	2429	* may start out with a NULL object.
	2430	*/
		/*
		* Walk to the bottom of the backing chain, translating offset
		* at each step and trimming size to the backing object's size.
		*/
	2431	while (object && object->backing_object) {
	2432	offset += object->backing_object_offset;
	2433	object = object->backing_object;
	2434	if (object->size < OFF_TO_IDX( offset + size))
	2435	size = IDX_TO_OFF(object->size) - offset;
	2436	}
	2437	if (object && (object->type == OBJT_VNODE) &&
	2438	(current->protection & VM_PROT_WRITE) &&
	2439	(object->flags & OBJ_NOMSYNC) == 0) {
	2440	/*
	2441	* Flush pages if writing is allowed, invalidate them
	2442	* if invalidation requested. Pages undergoing I/O
	2443	* will be ignored by vm_object_page_remove().
	2444	*
	2445	* We cannot lock the vnode and then wait for paging
	2446	* to complete without deadlocking against vm_fault.
	2447	* Instead we simply call vm_object_page_remove() and
	2448	* allow it to block internally on a page-by-page
	2449	* basis when it encounters pages undergoing async
	2450	* I/O.
	2451	*/
	2452	int flags;
	2453
		/* hold an object reference and the vnode lock across the clean */
	2454	vm_object_reference(object);
	2455	vn_lock(object->handle, LK_EXCLUSIVE \| LK_RETRY);
		/* an invalidate request forces OBJPC_SYNC even when syncio is FALSE */
	2456	flags = (syncio \|\| invalidate) ? OBJPC_SYNC : 0;
	2457	flags \|= invalidate ? OBJPC_INVAL : 0;
	2458
	2459	/*
	2460	* When operating on a virtual page table just
	2461	* flush the whole object. XXX we probably ought
	2462	* to
	2463	*/
	2464	switch(current->maptype) {
	2465	case VM_MAPTYPE_NORMAL:
	2466	vm_object_page_clean(object,
	2467	OFF_TO_IDX(offset),
	2468	OFF_TO_IDX(offset + size + PAGE_MASK),
	2469	flags);
	2470	break;
	2471	case VM_MAPTYPE_VPAGETABLE:
	2472	vm_object_page_clean(object, 0, 0, flags);
	2473	break;
	2474	}
	2475	vn_unlock(((struct vnode *)object->handle));
	2476	vm_object_deallocate(object);
	2477	}
		/*
		* Page removal on invalidate: clean_only is FALSE for device
		* objects and TRUE for vnode objects.
		*/
	2478	if (object && invalidate &&
	2479	((object->type == OBJT_VNODE) \|\|
	2480	(object->type == OBJT_DEVICE))) {
	2481	int clean_only =
	2482	(object->type == OBJT_DEVICE) ? FALSE : TRUE;
	2483	vm_object_reference(object);
	2484	switch(current->maptype) {
	2485	case VM_MAPTYPE_NORMAL:
	2486	vm_object_page_remove(object,
	2487	OFF_TO_IDX(offset),
	2488	OFF_TO_IDX(offset + size + PAGE_MASK),
	2489	clean_only);
	2490	break;
	2491	case VM_MAPTYPE_VPAGETABLE:
	2492	vm_object_page_remove(object, 0, 0, clean_only);
	2493	break;
	2494	}
	2495	vm_object_deallocate(object);
	2496	}
	2497	start += size;
	2498	}
	2499
	2500	vm_map_unlock_read(map);
	2501	return (KERN_SUCCESS);
	2502	}
	2503
	2504	/*
	2505	* vm_map_entry_unwire: [ internal use only ]
	2506	*
	2507	* Make the region specified by this entry pageable.
	2508	*
	2509	* The map in question should be locked.
	2510	* [This is the reason for this routine's existence.]
	2511	*/
	2512	static void
	2513	vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
	2514	{
	2515	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	2516	entry->wired_count = 0;
	2517	vm_fault_unwire(map, entry);
	2518	}
	2519
	2520	/*
	2521	* vm_map_entry_delete: [ internal use only ]
	2522	*
	2523	* Deallocate the given entry from the target map.
	2524	*/
	2525	static void
	2526	vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
	2527	{
	2528	vm_map_entry_unlink(map, entry);
	2529	map->size -= entry->end - entry->start;
	2530
	2531	switch(entry->maptype) {
	2532	case VM_MAPTYPE_NORMAL:
	2533	case VM_MAPTYPE_VPAGETABLE:
	2534	vm_object_deallocate(entry->object.vm_object);
	2535	break;
	2536	default:
	2537	break;
	2538	}
	2539
	2540	vm_map_entry_dispose(map, entry, countp);
	2541	}
	2542
	2543	/*
	2544	* vm_map_delete: [ internal use only ]
	2545	*
	2546	* Deallocates the given address range from the target
	2547	* map.
		*
		* Arguments:
		*	map		map to remove the range from; vm_map_remove()
		*			calls this with the map locked
		*	start, end	address range to deallocate
		*	countp		map entry reservation count, passed through
		*			to the clip and delete helpers
		*
		* Every path through this function returns KERN_SUCCESS.
	2548	*/
	2549	int
	2550	vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
	2551	{
	2552	vm_object_t object;
	2553	vm_map_entry_t entry;
	2554	vm_map_entry_t first_entry;
	2555
	2556	again:
	2557	/*
	2558	* Find the start of the region, and clip it. Set entry to point
	2559	* at the first record containing the requested address or, if no
	2560	* such record exists, the next record with a greater address. The
	2561	* loop will run from this point until a record beyond the termination
	2562	* address is encountered.
	2563	*
	2564	* map->hint must be adjusted to not point to anything we delete,
	2565	* so set it to the entry prior to the one being deleted.
	2566	*
	2567	* GGG see other GGG comment.
	2568	*/
	2569	if (vm_map_lookup_entry(map, start, &first_entry)) {
	2570	entry = first_entry;
	2571	vm_map_clip_start(map, entry, start, countp);
	2572	map->hint = entry->prev; /* possible problem XXX */
	2573	} else {
	2574	map->hint = first_entry; /* possible problem XXX */
	2575	entry = first_entry->next;
	2576	}
	2577
	2578	/*
	2579	* If a hole opens up prior to the current first_free then
	2580	* adjust first_free. As with map->hint, map->first_free
	2581	* cannot be left set to anything we might delete.
	2582	*/
	2583	if (entry == &map->header) {
	2584	map->first_free = &map->header;
	2585	} else if (map->first_free->start >= start) {
	2586	map->first_free = entry->prev;
	2587	}
	2588
	2589	/*
	2590	* Step through all entries in this region
	2591	*/
	2592
	2593	while ((entry != &map->header) && (entry->start < end)) {
	2594	vm_map_entry_t next;
	2595	vm_offset_t s, e;
		/* note: 'count' here is a page-index count, unrelated to countp */
	2596	vm_pindex_t offidxstart, offidxend, count;
	2597
	2598	/*
	2599	* If we hit an in-transition entry we have to sleep and
	2600	* retry. It's easier (and not really slower) to just retry
	2601	* since this case occurs so rarely and the hint is already
	2602	* pointing at the right place. We have to reset the
	2603	* start offset so as not to accidentally delete an entry
	2604	* another process just created in vacated space.
	2605	*/
	2606	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
	2607	entry->eflags \|= MAP_ENTRY_NEEDS_WAKEUP;
	2608	start = entry->start;
	2609	++mycpu->gd_cnt.v_intrans_coll;
	2610	++mycpu->gd_cnt.v_intrans_wait;
	2611	vm_map_transition_wait(map);
	2612	goto again;
	2613	}
	2614	vm_map_clip_end(map, entry, end, countp);
	2615
		/*
		* Snapshot everything needed from the entry, including its
		* successor, before the entry is deleted below.
		*/
	2616	s = entry->start;
	2617	e = entry->end;
	2618	next = entry->next;
	2619
	2620	offidxstart = OFF_TO_IDX(entry->offset);
	2621	count = OFF_TO_IDX(e - s);
	2622	object = entry->object.vm_object;
	2623
	2624	/*
	2625	* Unwire before removing addresses from the pmap; otherwise,
	2626	* unwiring will put the entries back in the pmap.
	2627	*/
	2628	if (entry->wired_count != 0)
	2629	vm_map_entry_unwire(map, entry);
	2630
	2631	offidxend = offidxstart + count;
	2632
		/*
		* kernel_object pages are removed from the object directly.
		* For other objects the pmap range is removed, and pages and
		* swap are freed only for anonymous (default/swap) objects
		* flagged OBJ_ONEMAPPING without OBJ_NOSPLIT whose ref_count
		* is not 1.
		*/
	2633	if (object == &kernel_object) {
	2634	vm_object_page_remove(object, offidxstart, offidxend, FALSE);
	2635	} else {
	2636	pmap_remove(map->pmap, s, e);
	2637	if (object != NULL &&
	2638	object->ref_count != 1 &&
	2639	(object->flags & (OBJ_NOSPLIT\|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING &&
	2640	(object->type == OBJT_DEFAULT \|\| object->type == OBJT_SWAP)) {
	2641	vm_object_collapse(object);
	2642	vm_object_page_remove(object, offidxstart, offidxend, FALSE);
	2643	if (object->type == OBJT_SWAP) {
	2644	swap_pager_freespace(object, offidxstart, count);
	2645	}
		/* the removed range covers the object's tail: shrink the object */
	2646	if (offidxend >= object->size &&
	2647	offidxstart < object->size) {
	2648	object->size = offidxstart;
	2649	}
	2650	}
	2651	}
	2652
	2653	/*
	2654	* Delete the entry (which may delete the object) only after
	2655	* removing all pmap entries pointing to its pages.
	2656	* (Otherwise, its page frames may be reallocated, and any
	2657	* modify bits will be set in the wrong object!)
	2658	*/
	2659	vm_map_entry_delete(map, entry, countp);
	2660	entry = next;
	2661	}
	2662	return (KERN_SUCCESS);
	2663	}
	2664
	2665	/*
	2666	* vm_map_remove:
	2667	*
	2668	* Remove the given address range from the target map.
	2669	* This is the exported form of vm_map_delete.
	2670	*/
	2671	int
	2672	vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
	2673	{
	2674	int result;
	2675	int count;
	2676
	2677	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	2678	vm_map_lock(map);
	2679	VM_MAP_RANGE_CHECK(map, start, end);
	2680	result = vm_map_delete(map, start, end, &count);
	2681	vm_map_unlock(map);
	2682	vm_map_entry_release(count);
	2683
	2684	return (result);
	2685	}
	2686
	2687	/*
	2688	* vm_map_check_protection:
	2689	*
	2690	* Assert that the target map allows the specified
	2691	* privilege on the entire address region given.
	2692	* The entire region must be allocated.
	2693	*/
	2694	boolean_t
	2695	vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
	2696	vm_prot_t protection)
	2697	{
	2698	vm_map_entry_t entry;
	2699	vm_map_entry_t tmp_entry;
	2700
	2701	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
	2702	return (FALSE);
	2703	}
	2704	entry = tmp_entry;
	2705
	2706	while (start < end) {
	2707	if (entry == &map->header) {
	2708	return (FALSE);
	2709	}
	2710	/*
	2711	* No holes allowed!
	2712	*/
	2713
	2714	if (start < entry->start) {
	2715	return (FALSE);
	2716	}
	2717	/*
	2718	* Check protection associated with entry.
	2719	*/
	2720
	2721	if ((entry->protection & protection) != protection) {
	2722	return (FALSE);
	2723	}
	2724	/* go to next entry */
	2725
	2726	start = entry->end;
	2727	entry = entry->next;
	2728	}
	2729	return (TRUE);
	2730	}
	2731
	2732	/*
	2733	* Split the pages in a map entry into a new object. This affords
	2734	* easier removal of unused pages, and keeps object inheritance from
	2735	* being a negative impact on memory usage.
		*
		* Nothing is done when the entry's object is not OBJT_DEFAULT or
		* OBJT_SWAP, when its ref_count is <= 1, or when the new object
		* cannot be allocated. Otherwise the entry ends up pointing at the
		* new object with offset 0 and vm_object_deallocate() is called on
		* the original object.
	2736	*/
	2737	static void
	2738	vm_map_split(vm_map_entry_t entry)
	2739	{
	2740	vm_page_t m;
	2741	vm_object_t orig_object, new_object, source;
	2742	vm_offset_t s, e;
	2743	vm_pindex_t offidxstart, offidxend, idx;
	2744	vm_size_t size;
	2745	vm_ooffset_t offset;
	2746
	2747	orig_object = entry->object.vm_object;
	2748	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
	2749	return;
	2750	if (orig_object->ref_count <= 1)
	2751	return;
	2752
	2753	offset = entry->offset;
	2754	s = entry->start;
	2755	e = entry->end;
	2756
		/* range of the original object that this entry maps, as page indices */
	2757	offidxstart = OFF_TO_IDX(offset);
	2758	offidxend = offidxstart + OFF_TO_IDX(e - s);
	2759	size = offidxend - offidxstart;
	2760
		/* the new object is allocated with the same pager type as the original */
	2761	switch(orig_object->type) {
	2762	case OBJT_DEFAULT:
	2763	new_object = default_pager_alloc(NULL, IDX_TO_OFF(size),
	2764	VM_PROT_ALL, 0);
	2765	break;
	2766	case OBJT_SWAP:
	2767	new_object = swap_pager_alloc(NULL, IDX_TO_OFF(size),
	2768	VM_PROT_ALL, 0);
	2769	break;
	2770	default:
	2771	/* not reached */
	2772	new_object = NULL;
	2773	KKASSERT(0);
	2774	}
	2775	if (new_object == NULL)
	2776	return;
	2777
		/*
		* If the original object has a backing object, make the new
		* object shadow it as well, at an offset adjusted by the
		* entry's starting page index.
		*/
	2778	source = orig_object->backing_object;
	2779	if (source != NULL) {
	2780	vm_object_reference(source); /* Referenced by new_object */
	2781	LIST_INSERT_HEAD(&source->shadow_head,
	2782	new_object, shadow_list);
	2783	vm_object_clear_flag(source, OBJ_ONEMAPPING);
	2784	new_object->backing_object_offset =
	2785	orig_object->backing_object_offset + IDX_TO_OFF(offidxstart);
	2786	new_object->backing_object = source;
	2787	source->shadow_count++;
	2788	source->generation++;
	2789	}
	2790
		/* move each resident page in the range over to new_object */
	2791	for (idx = 0; idx < size; idx++) {
		/* this declaration shadows the function-scope 'm' */
	2792	vm_page_t m;
	2793
	2794	/*
	2795	* A critical section is required to avoid a race between
	2796	* the lookup and an interrupt/unbusy/free and our busy
	2797	* check.
	2798	*/
	2799	crit_enter();
	2800	retry:
	2801	m = vm_page_lookup(orig_object, offidxstart + idx);
	2802	if (m == NULL) {
	2803	crit_exit();
	2804	continue;
	2805	}
	2806
	2807	/*
	2808	* We must wait for pending I/O to complete before we can
	2809	* rename the page.
	2810	*
	2811	* We do not have to VM_PROT_NONE the page as mappings should
	2812	* not be changed by this operation.
	2813	*/
	2814	if (vm_page_sleep_busy(m, TRUE, "spltwt"))
	2815	goto retry;
	2816	vm_page_busy(m);
	2817	vm_page_rename(m, new_object, idx);
	2818	/* page automatically made dirty by rename and cache handled */
		/*
		* NOTE(review): presumably vm_page_rename() drops the busy
		* state, hence the second vm_page_busy(); the page is woken
		* up in the final loop below -- confirm.
		*/
	2819	vm_page_busy(m);
	2820	crit_exit();
	2821	}
	2822
	2823	if (orig_object->type == OBJT_SWAP) {
	2824	vm_object_pip_add(orig_object, 1);
	2825	/*
	2826	* copy orig_object pages into new_object
	2827	* and destroy unneeded pages in
	2828	* shadow object.
	2829	*/
	2830	swap_pager_copy(orig_object, new_object, offidxstart, 0);
	2831	vm_object_pip_wakeup(orig_object);
	2832	}
	2833
	2834	/*
	2835	* Wakeup the pages we played with. No spl protection is needed
	2836	* for a simple wakeup.
	2837	*/
	2838	for (idx = 0; idx < size; idx++) {
	2839	m = vm_page_lookup(new_object, idx);
	2840	if (m)
	2841	vm_page_wakeup(m);
	2842	}
	2843
		/* switch the entry over to the new object */
	2844	entry->object.vm_object = new_object;
	2845	entry->offset = 0LL;
	2846	vm_object_deallocate(orig_object);
	2847	}
	2848
	2849	/*
	2850	* vm_map_copy_entry:
	2851	*
	2852	* Copies the contents of the source entry to the destination
	2853	* entry. The entries must be aligned properly.
	2854	*/
	2855	static void
	2856	vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
	2857	vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
	2858	{
	2859	vm_object_t src_object;
	2860
	2861	if (dst_entry->maptype == VM_MAPTYPE_SUBMAP)
	2862	return;
	2863	if (src_entry->maptype == VM_MAPTYPE_SUBMAP)
	2864	return;
	2865
	2866	if (src_entry->wired_count == 0) {
	2867	/*
	2868	* If the source entry is marked needs_copy, it is already
	2869	* write-protected.
	2870	*/
	2871	if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
	2872	pmap_protect(src_map->pmap,
	2873	src_entry->start,
	2874	src_entry->end,
	2875	src_entry->protection & ~VM_PROT_WRITE);
	2876	}
	2877
	2878	/*
	2879	* Make a copy of the object.
	2880	*/
	2881	if ((src_object = src_entry->object.vm_object) != NULL) {
	2882	if ((src_object->handle == NULL) &&
	2883	(src_object->type == OBJT_DEFAULT \|\|
	2884	src_object->type == OBJT_SWAP)) {
	2885	vm_object_collapse(src_object);
	2886	if ((src_object->flags & (OBJ_NOSPLIT\|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
	2887	vm_map_split(src_entry);
	2888	src_object = src_entry->object.vm_object;
	2889	}
	2890	}
	2891
	2892	vm_object_reference(src_object);
	2893	vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
	2894	dst_entry->object.vm_object = src_object;
	2895	src_entry->eflags \|= (MAP_ENTRY_COW\|MAP_ENTRY_NEEDS_COPY);
	2896	dst_entry->eflags \|= (MAP_ENTRY_COW\|MAP_ENTRY_NEEDS_COPY);
	2897	dst_entry->offset = src_entry->offset;
	2898	} else {
	2899	dst_entry->object.vm_object = NULL;
	2900	dst_entry->offset = 0;
	2901	}
	2902
	2903	pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
	2904	dst_entry->end - dst_entry->start, src_entry->start);
	2905	} else {
	2906	/*
	2907	* Of course, wired down pages can't be set copy-on-write.
	2908	* Cause wired pages to be copied into the new map by
	2909	* simulating faults (the new pages are pageable)
	2910	*/
	2911	vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
	2912	}
	2913	}
	2914
	2915	/*
	2916	* vmspace_fork:
	2917	* Create a new process vmspace structure and vm_map
	2918	* based on those of an existing process. The new map
	2919	* is based on the old map, according to the inheritance
	2920	* values on the regions in that map.
	2921	*
	2922	* The source map must not be locked.
	2923	*/
	2924	struct vmspace *
	2925	vmspace_fork(struct vmspace *vm1)
	2926	{
	2927	struct vmspace *vm2;
	2928	vm_map_t old_map = &vm1->vm_map;
	2929	vm_map_t new_map;
	2930	vm_map_entry_t old_entry;
	2931	vm_map_entry_t new_entry;
	2932	vm_object_t object;
	2933	int count;
	2934
	2935	vm_map_lock(old_map);
	2936	old_map->infork = 1;
	2937
	2938	/*
	2939	* XXX Note: upcalls are not copied.
	2940	*/
	2941	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
	2942	bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
	2943	(caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
	2944	new_map = &vm2->vm_map; /* XXX */
	2945	new_map->timestamp = 1;
	2946
	2947	count = 0;
	2948	old_entry = old_map->header.next;
	2949	while (old_entry != &old_map->header) {
	2950	++count;
	2951	old_entry = old_entry->next;
	2952	}
	2953
	2954	count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
	2955
	2956	old_entry = old_map->header.next;
	2957	while (old_entry != &old_map->header) {
	2958	if (old_entry->maptype == VM_MAPTYPE_SUBMAP)
	2959	panic("vm_map_fork: encountered a submap");
	2960
	2961	switch (old_entry->inheritance) {
	2962	case VM_INHERIT_NONE:
	2963	break;
	2964
	2965	case VM_INHERIT_SHARE:
	2966	/*
	2967	* Clone the entry, creating the shared object if
	2968	* necessary.
	2969	*/
	2970	object = old_entry->object.vm_object;
	2971	if (object == NULL) {
	2972	vm_map_entry_allocate_object(old_entry);
	2973	object = old_entry->object.vm_object;
	2974	}
	2975
	2976	/*
	2977	* Add the reference before calling vm_map_entry_shadow
	2978	* to insure that a shadow object is created.
	2979	*/
	2980	vm_object_reference(object);
	2981	if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
	2982	vm_map_entry_shadow(old_entry);
	2983	/* Transfer the second reference too. */
	2984	vm_object_reference(
	2985	old_entry->object.vm_object);
	2986	vm_object_deallocate(object);
	2987	object = old_entry->object.vm_object;
	2988	}
	2989	vm_object_clear_flag(object, OBJ_ONEMAPPING);
	2990
	2991	/*
	2992	* Clone the entry, referencing the shared object.
	2993	*/
	2994	new_entry = vm_map_entry_create(new_map, &count);
	2995	new_entry = old_entry;
	2996	new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	2997	new_entry->wired_count = 0;
	2998
	2999	/*
	3000	* Insert the entry into the new map -- we know we're
	3001	* inserting at the end of the new map.
	3002	*/
	3003
	3004	vm_map_entry_link(new_map, new_map->header.prev,
	3005	new_entry);
	3006
	3007	/*
	3008	* Update the physical map
	3009	*/
	3010
	3011	pmap_copy(new_map->pmap, old_map->pmap,
	3012	new_entry->start,
	3013	(old_entry->end - old_entry->start),
	3014	old_entry->start);
	3015	break;
	3016
	3017	case VM_INHERIT_COPY:
	3018	/*
	3019	* Clone the entry and link into the map.
	3020	*/
	3021	new_entry = vm_map_entry_create(new_map, &count);
	3022	new_entry = old_entry;
	3023	new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
	3024	new_entry->wired_count = 0;
	3025	new_entry->object.vm_object = NULL;
	3026	vm_map_entry_link(new_map, new_map->header.prev,
	3027	new_entry);
	3028	vm_map_copy_entry(old_map, new_map, old_entry,
	3029	new_entry);
	3030	break;
	3031	}
	3032	old_entry = old_entry->next;
	3033	}
	3034
	3035	new_map->size = old_map->size;
	3036	old_map->infork = 0;
	3037	vm_map_unlock(old_map);
	3038	vm_map_entry_release(count);
	3039
	3040	return (vm2);
	3041	}
	3042
	3043	int
	3044	vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
	3045	int flags, vm_prot_t prot, vm_prot_t max, int cow)
	3046	{
	3047	vm_map_entry_t prev_entry;
	3048	vm_map_entry_t new_stack_entry;
	3049	vm_size_t init_ssize;
	3050	int rv;
	3051	int count;
	3052	vm_offset_t tmpaddr;
	3053
	3054	cow \|= MAP_IS_STACK;
	3055
	3056	if (max_ssize < sgrowsiz)
	3057	init_ssize = max_ssize;
	3058	else
	3059	init_ssize = sgrowsiz;
	3060
	3061	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	3062	vm_map_lock(map);
	3063
	3064	/*
	3065	* Find space for the mapping
	3066	*/
	3067	if ((flags & (MAP_FIXED \| MAP_TRYFIXED)) == 0) {
	3068	if (vm_map_findspace(map, addrbos, max_ssize, 1,
	3069	flags, &tmpaddr)) {
	3070	vm_map_unlock(map);
	3071	vm_map_entry_release(count);
	3072	return (KERN_NO_SPACE);
	3073	}
	3074	addrbos = tmpaddr;
	3075	}
	3076
	3077	/* If addr is already mapped, no go */
	3078	if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
	3079	vm_map_unlock(map);
	3080	vm_map_entry_release(count);
	3081	return (KERN_NO_SPACE);
	3082	}
	3083
	3084	#if 0
	3085	/* XXX already handled by kern_mmap() */
	3086	/* If we would blow our VMEM resource limit, no go */
	3087	if (map->size + init_ssize >
	3088	curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
	3089	vm_map_unlock(map);
	3090	vm_map_entry_release(count);
	3091	return (KERN_NO_SPACE);
	3092	}
	3093	#endif
	3094
	3095	/*
	3096	* If we can't accomodate max_ssize in the current mapping,
	3097	* no go. However, we need to be aware that subsequent user
	3098	* mappings might map into the space we have reserved for
	3099	* stack, and currently this space is not protected.
	3100	*
	3101	* Hopefully we will at least detect this condition
	3102	* when we try to grow the stack.
	3103	*/
	3104	if ((prev_entry->next != &map->header) &&
	3105	(prev_entry->next->start < addrbos + max_ssize)) {
	3106	vm_map_unlock(map);
	3107	vm_map_entry_release(count);
	3108	return (KERN_NO_SPACE);
	3109	}
	3110
	3111	/*
	3112	* We initially map a stack of only init_ssize. We will
	3113	* grow as needed later. Since this is to be a grow
	3114	* down stack, we map at the top of the range.
	3115	*
	3116	* Note: we would normally expect prot and max to be
	3117	* VM_PROT_ALL, and cow to be 0. Possibly we should
	3118	* eliminate these as input parameters, and just
	3119	* pass these values here in the insert call.
	3120	*/
	3121	rv = vm_map_insert(map, &count,
	3122	NULL, 0, addrbos + max_ssize - init_ssize,
	3123	addrbos + max_ssize,
	3124	VM_MAPTYPE_NORMAL,
	3125	prot, max,
	3126	cow);
	3127
	3128	/* Now set the avail_ssize amount */
	3129	if (rv == KERN_SUCCESS) {
	3130	if (prev_entry != &map->header)
	3131	vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
	3132	new_stack_entry = prev_entry->next;
	3133	if (new_stack_entry->end != addrbos + max_ssize \|\|
	3134	new_stack_entry->start != addrbos + max_ssize - init_ssize)
	3135	panic ("Bad entry start/end for new stack entry");
	3136	else
	3137	new_stack_entry->aux.avail_ssize = max_ssize - init_ssize;
	3138	}
	3139
	3140	vm_map_unlock(map);
	3141	vm_map_entry_release(count);
	3142	return (rv);
	3143	}
	3144
	3145	/* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the
	3146	* desired address is already mapped, or if we successfully grow
	3147	* the stack. Also returns KERN_SUCCESS if addr is outside the
	3148	* stack range (this is strange, but preserves compatibility with
	3149	* the grow function in vm_machdep.c).
	3150	*/
	3151	int
	3152	vm_map_growstack (struct proc *p, vm_offset_t addr)
	3153	{
	3154	vm_map_entry_t prev_entry;
	3155	vm_map_entry_t stack_entry;
	3156	vm_map_entry_t new_stack_entry;
	3157	struct vmspace *vm = p->p_vmspace;
	3158	vm_map_t map = &vm->vm_map;
	3159	vm_offset_t end;
	3160	int grow_amount;
	3161	int rv = KERN_SUCCESS;
	3162	int is_procstack;
	3163	int use_read_lock = 1;
	3164	int count;
	3165
	3166	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	3167	Retry:
	3168	if (use_read_lock)
	3169	vm_map_lock_read(map);
	3170	else
	3171	vm_map_lock(map);
	3172
	3173	/* If addr is already in the entry range, no need to grow.*/
	3174	if (vm_map_lookup_entry(map, addr, &prev_entry))
	3175	goto done;
	3176
	3177	if ((stack_entry = prev_entry->next) == &map->header)
	3178	goto done;
	3179	if (prev_entry == &map->header)
	3180	end = stack_entry->start - stack_entry->aux.avail_ssize;
	3181	else
	3182	end = prev_entry->end;
	3183
	3184	/*
	3185	* This next test mimics the old grow function in vm_machdep.c.
	3186	* It really doesn't quite make sense, but we do it anyway
	3187	* for compatibility.
	3188	*
	3189	* If not growable stack, return success. This signals the
	3190	* caller to proceed as he would normally with normal vm.
	3191	*/
	3192	if (stack_entry->aux.avail_ssize < 1 \|\|
	3193	addr >= stack_entry->start \|\|
	3194	addr < stack_entry->start - stack_entry->aux.avail_ssize) {
	3195	goto done;
	3196	}
	3197
	3198	/* Find the minimum grow amount */
	3199	grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
	3200	if (grow_amount > stack_entry->aux.avail_ssize) {
	3201	rv = KERN_NO_SPACE;
	3202	goto done;
	3203	}
	3204
	3205	/*
	3206	* If there is no longer enough space between the entries
	3207	* nogo, and adjust the available space. Note: this
	3208	* should only happen if the user has mapped into the
	3209	* stack area after the stack was created, and is
	3210	* probably an error.
	3211	*
	3212	* This also effectively destroys any guard page the user
	3213	* might have intended by limiting the stack size.
	3214	*/
	3215	if (grow_amount > stack_entry->start - end) {
	3216	if (use_read_lock && vm_map_lock_upgrade(map)) {
	3217	use_read_lock = 0;
	3218	goto Retry;
	3219	}
	3220	use_read_lock = 0;
	3221	stack_entry->aux.avail_ssize = stack_entry->start - end;
	3222	rv = KERN_NO_SPACE;
	3223	goto done;
	3224	}
	3225
	3226	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
	3227
	3228	/* If this is the main process stack, see if we're over the
	3229	* stack limit.
	3230	*/
	3231	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
	3232	p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
	3233	rv = KERN_NO_SPACE;
	3234	goto done;
	3235	}
	3236
	3237	/* Round up the grow amount modulo SGROWSIZ */
	3238	grow_amount = roundup (grow_amount, sgrowsiz);
	3239	if (grow_amount > stack_entry->aux.avail_ssize) {
	3240	grow_amount = stack_entry->aux.avail_ssize;
	3241	}
	3242	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
	3243	p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
	3244	grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
	3245	ctob(vm->vm_ssize);
	3246	}
	3247
	3248	/* If we would blow our VMEM resource limit, no go */
	3249	if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
	3250	rv = KERN_NO_SPACE;
	3251	goto done;
	3252	}
	3253
	3254	if (use_read_lock && vm_map_lock_upgrade(map)) {
	3255	use_read_lock = 0;
	3256	goto Retry;
	3257	}
	3258	use_read_lock = 0;
	3259
	3260	/* Get the preliminary new entry start value */
	3261	addr = stack_entry->start - grow_amount;
	3262
	3263	/* If this puts us into the previous entry, cut back our growth
	3264	* to the available space. Also, see the note above.
	3265	*/
	3266	if (addr < end) {
	3267	stack_entry->aux.avail_ssize = stack_entry->start - end;
	3268	addr = end;
	3269	}
	3270
	3271	rv = vm_map_insert(map, &count,
	3272	NULL, 0, addr, stack_entry->start,
	3273	VM_MAPTYPE_NORMAL,
	3274	VM_PROT_ALL, VM_PROT_ALL,
	3275	0);
	3276
	3277	/* Adjust the available stack space by the amount we grew. */
	3278	if (rv == KERN_SUCCESS) {
	3279	if (prev_entry != &map->header)
	3280	vm_map_clip_end(map, prev_entry, addr, &count);
	3281	new_stack_entry = prev_entry->next;
	3282	if (new_stack_entry->end != stack_entry->start \|\|
	3283	new_stack_entry->start != addr)
	3284	panic ("Bad stack grow start/end in new stack entry");
	3285	else {
	3286	new_stack_entry->aux.avail_ssize =
	3287	stack_entry->aux.avail_ssize -
	3288	(new_stack_entry->end - new_stack_entry->start);
	3289	if (is_procstack)
	3290	vm->vm_ssize += btoc(new_stack_entry->end -
	3291	new_stack_entry->start);
	3292	}
	3293	}
	3294
	3295	done:
	3296	if (use_read_lock)
	3297	vm_map_unlock_read(map);
	3298	else
	3299	vm_map_unlock(map);
	3300	vm_map_entry_release(count);
	3301	return (rv);
	3302	}
	3303
	3304	/*
	3305	* Unshare the specified VM space for exec. If other processes are
	3306	* mapped to it, then create a new one. The new vmspace is null.
	3307	*/
	3308	void
	3309	vmspace_exec(struct proc p, struct vmspace vmcopy)
	3310	{
	3311	struct vmspace *oldvmspace = p->p_vmspace;
	3312	struct vmspace *newvmspace;
	3313	vm_map_t map = &p->p_vmspace->vm_map;
	3314
	3315	/*
	3316	* If we are execing a resident vmspace we fork it, otherwise
	3317	* we create a new vmspace. Note that exitingcnt and upcalls
	3318	* are not copied to the new vmspace.
	3319	*/
	3320	if (vmcopy) {
	3321	newvmspace = vmspace_fork(vmcopy);
	3322	} else {
	3323	newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
	3324	bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
	3325	(caddr_t)&oldvmspace->vm_endcopy -
	3326	(caddr_t)&oldvmspace->vm_startcopy);
	3327	}
	3328
	3329	/*
	3330	* Finish initializing the vmspace before assigning it
	3331	* to the process. The vmspace will become the current vmspace
	3332	* if p == curproc.
	3333	*/
	3334	pmap_pinit2(vmspace_pmap(newvmspace));
	3335	pmap_replacevm(p, newvmspace, 0);
	3336	sysref_put(&oldvmspace->vm_sysref);
	3337	}
	3338
	3339	/*
	3340	* Unshare the specified VM space for forcing COW. This
	3341	* is called by rfork, for the (RFMEM\|RFPROC) == 0 case.
	3342	*
	3343	* The exitingcnt test is not strictly necessary but has been
	3344	* included for code sanity (to make the code a bit more deterministic).
	3345	*/
	3346
	3347	void
	3348	vmspace_unshare(struct proc *p)
	3349	{
	3350	struct vmspace *oldvmspace = p->p_vmspace;
	3351	struct vmspace *newvmspace;
	3352
	3353	if (oldvmspace->vm_sysref.refcnt == 1 && oldvmspace->vm_exitingcnt == 0)
	3354	return;
	3355	newvmspace = vmspace_fork(oldvmspace);
	3356	pmap_pinit2(vmspace_pmap(newvmspace));
	3357	pmap_replacevm(p, newvmspace, 0);
	3358	sysref_put(&oldvmspace->vm_sysref);
	3359	}
	3360
	3361	/*
	3362	* vm_map_lookup:
	3363	*
	3364	* Finds the VM object, offset, and
	3365	* protection for a given virtual address in the
	3366	* specified map, assuming a page fault of the
	3367	* type specified.
	3368	*
	3369	* Leaves the map in question locked for read; return
	3370	* values are guaranteed until a vm_map_lookup_done
	3371	* call is performed. Note that the map argument
	3372	* is in/out; the returned map must be used in
	3373	* the call to vm_map_lookup_done.
	3374	*
	3375	* A handle (out_entry) is returned for use in
	3376	* vm_map_lookup_done, to make that fast.
	3377	*
	3378	* If a lookup is requested with "write protection"
	3379	* specified, the map may be changed to perform virtual
	3380	* copying operations, although the data referenced will
	3381	* remain the same.
	3382	*/
	3383	int
	3384	vm_map_lookup(vm_map_t var_map, / IN/OUT */
	3385	vm_offset_t vaddr,
	3386	vm_prot_t fault_typea,
	3387	vm_map_entry_t out_entry, / OUT */
	3388	vm_object_t object, / OUT */
	3389	vm_pindex_t pindex, / OUT */
	3390	vm_prot_t out_prot, / OUT */
	3391	boolean_t wired) / OUT */
	3392	{
	3393	vm_map_entry_t entry;
	3394	vm_map_t map = *var_map;
	3395	vm_prot_t prot;
	3396	vm_prot_t fault_type = fault_typea;
	3397	int use_read_lock = 1;
	3398	int rv = KERN_SUCCESS;
	3399
	3400	RetryLookup:
	3401	if (use_read_lock)
	3402	vm_map_lock_read(map);
	3403	else
	3404	vm_map_lock(map);
	3405
	3406	/*
	3407	* If the map has an interesting hint, try it before calling full
	3408	* blown lookup routine.
	3409	*/
	3410	entry = map->hint;
	3411	*out_entry = entry;
	3412
	3413	if ((entry == &map->header) \|\|
	3414	(vaddr < entry->start) \|\| (vaddr >= entry->end)) {
	3415	vm_map_entry_t tmp_entry;
	3416
	3417	/*
	3418	* Entry was either not a valid hint, or the vaddr was not
	3419	* contained in the entry, so do a full lookup.
	3420	*/
	3421	if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
	3422	rv = KERN_INVALID_ADDRESS;
	3423	goto done;
	3424	}
	3425
	3426	entry = tmp_entry;
	3427	*out_entry = entry;
	3428	}
	3429
	3430	/*
	3431	* Handle submaps.
	3432	*/
	3433	if (entry->maptype == VM_MAPTYPE_SUBMAP) {
	3434	vm_map_t old_map = map;
	3435
	3436	*var_map = map = entry->object.sub_map;
	3437	if (use_read_lock)
	3438	vm_map_unlock_read(old_map);
	3439	else
	3440	vm_map_unlock(old_map);
	3441	use_read_lock = 1;
	3442	goto RetryLookup;
	3443	}
	3444
	3445	/*
	3446	* Check whether this task is allowed to have this page.
	3447	* Note the special case for MAP_ENTRY_COW
	3448	* pages with an override. This is to implement a forced
	3449	* COW for debuggers.
	3450	*/
	3451
	3452	if (fault_type & VM_PROT_OVERRIDE_WRITE)
	3453	prot = entry->max_protection;
	3454	else
	3455	prot = entry->protection;
	3456
	3457	fault_type &= (VM_PROT_READ\|VM_PROT_WRITE\|VM_PROT_EXECUTE);
	3458	if ((fault_type & prot) != fault_type) {
	3459	rv = KERN_PROTECTION_FAILURE;
	3460	goto done;
	3461	}
	3462
	3463	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
	3464	(entry->eflags & MAP_ENTRY_COW) &&
	3465	(fault_type & VM_PROT_WRITE) &&
	3466	(fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
	3467	rv = KERN_PROTECTION_FAILURE;
	3468	goto done;
	3469	}
	3470
	3471	/*
	3472	* If this page is not pageable, we have to get it for all possible
	3473	* accesses.
	3474	*/
	3475	*wired = (entry->wired_count != 0);
	3476	if (*wired)
	3477	prot = fault_type = entry->protection;
	3478
	3479	/*
	3480	* Virtual page tables may need to update the accessed (A) bit
	3481	* in a page table entry. Upgrade the fault to a write fault for
	3482	* that case if the map will support it. If the map does not support
	3483	* it the page table entry simply will not be updated.
	3484	*/
	3485	if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
	3486	if (prot & VM_PROT_WRITE)
	3487	fault_type \|= VM_PROT_WRITE;
	3488	}
	3489
	3490	/*
	3491	* If the entry was copy-on-write, we either ...
	3492	*/
	3493	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
	3494	/*
	3495	* If we want to write the page, we may as well handle that
	3496	* now since we've got the map locked.
	3497	*
	3498	* If we don't need to write the page, we just demote the
	3499	* permissions allowed.
	3500	*/
	3501
	3502	if (fault_type & VM_PROT_WRITE) {
	3503	/*
	3504	* Make a new object, and place it in the object
	3505	* chain. Note that no new references have appeared
	3506	* -- one just moved from the map to the new
	3507	* object.
	3508	*/
	3509
	3510	if (use_read_lock && vm_map_lock_upgrade(map)) {
	3511	use_read_lock = 0;
	3512	goto RetryLookup;
	3513	}
	3514	use_read_lock = 0;
	3515
	3516	vm_map_entry_shadow(entry);
	3517	} else {
	3518	/*
	3519	* We're attempting to read a copy-on-write page --
	3520	* don't allow writes.
	3521	*/
	3522
	3523	prot &= ~VM_PROT_WRITE;
	3524	}
	3525	}
	3526
	3527	/*
	3528	* Create an object if necessary.
	3529	*/
	3530	if (entry->object.vm_object == NULL &&
	3531	!map->system_map) {
	3532	if (use_read_lock && vm_map_lock_upgrade(map)) {
	3533	use_read_lock = 0;
	3534	goto RetryLookup;
	3535	}
	3536	use_read_lock = 0;
	3537	vm_map_entry_allocate_object(entry);
	3538	}
	3539
	3540	/*
	3541	* Return the object/offset from this entry. If the entry was
	3542	* copy-on-write or empty, it has been fixed up.
	3543	*/
	3544
	3545	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	3546	*object = entry->object.vm_object;
	3547
	3548	/*
	3549	* Return whether this is the only map sharing this data. On
	3550	* success we return with a read lock held on the map. On failure
	3551	* we return with the map unlocked.
	3552	*/
	3553	*out_prot = prot;
	3554	done:
	3555	if (rv == KERN_SUCCESS) {
	3556	if (use_read_lock == 0)
	3557	vm_map_lock_downgrade(map);
	3558	} else if (use_read_lock) {
	3559	vm_map_unlock_read(map);
	3560	} else {
	3561	vm_map_unlock(map);
	3562	}
	3563	return (rv);
	3564	}
	3565
	3566	/*
	3567	* vm_map_lookup_done:
	3568	*
	3569	* Releases locks acquired by a vm_map_lookup
	3570	* (according to the handle returned by that lookup).
	3571	*/
	3572
	3573	void
	3574	vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
	3575	{
	3576	/*
	3577	* Unlock the main-level map
	3578	*/
	3579	vm_map_unlock_read(map);
	3580	if (count)
	3581	vm_map_entry_release(count);
	3582	}
	3583
	3584	#include "opt_ddb.h"
	3585	#ifdef DDB
	3586	#include <sys/kernel.h>
	3587
	3588	#include <ddb/ddb.h>
	3589
	3590	/*
	3591	* vm_map_print: [ debug ]
	3592	*/
	3593	DB_SHOW_COMMAND(map, vm_map_print)
	3594	{
	3595	static int nlines;
	3596	/* XXX convert args. */
	3597	vm_map_t map = (vm_map_t)addr;
	3598	boolean_t full = have_addr;
	3599
	3600	vm_map_entry_t entry;
	3601
	3602	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
	3603	(void *)map,
	3604	(void *)map->pmap, map->nentries, map->timestamp);
	3605	nlines++;
	3606
	3607	if (!full && db_indent)
	3608	return;
	3609
	3610	db_indent += 2;
	3611	for (entry = map->header.next; entry != &map->header;
	3612	entry = entry->next) {
	3613	db_iprintf("map entry %p: start=%p, end=%p\n",
	3614	(void )entry, (void )entry->start, (void *)entry->end);
	3615	nlines++;
	3616	{
	3617	static char *inheritance_name[4] =
	3618	{"share", "copy", "none", "donate_copy"};
	3619
	3620	db_iprintf(" prot=%x/%x/%s",
	3621	entry->protection,
	3622	entry->max_protection,
	3623	inheritance_name[(int)(unsigned char)entry->inheritance]);
	3624	if (entry->wired_count != 0)
	3625	db_printf(", wired");
	3626	}
	3627	if (entry->maptype == VM_MAPTYPE_SUBMAP) {
	3628	/* XXX no %qd in kernel. Truncate entry->offset. */
	3629	db_printf(", share=%p, offset=0x%lx\n",
	3630	(void *)entry->object.sub_map,
	3631	(long)entry->offset);
	3632	nlines++;
	3633	if ((entry->prev == &map->header) \|\|
	3634	(entry->prev->object.sub_map !=
	3635	entry->object.sub_map)) {
	3636	db_indent += 2;
	3637	vm_map_print((db_expr_t)(intptr_t)
	3638	entry->object.sub_map,
	3639	full, 0, NULL);
	3640	db_indent -= 2;
	3641	}
	3642	} else {
	3643	/* XXX no %qd in kernel. Truncate entry->offset. */
	3644	db_printf(", object=%p, offset=0x%lx",
	3645	(void *)entry->object.vm_object,
	3646	(long)entry->offset);
	3647	if (entry->eflags & MAP_ENTRY_COW)
	3648	db_printf(", copy (%s)",
	3649	(entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
	3650	db_printf("\n");
	3651	nlines++;
	3652
	3653	if ((entry->prev == &map->header) \|\|
	3654	(entry->prev->object.vm_object !=
	3655	entry->object.vm_object)) {
	3656	db_indent += 2;
	3657	vm_object_print((db_expr_t)(intptr_t)
	3658	entry->object.vm_object,
	3659	full, 0, NULL);
	3660	nlines += 4;
	3661	db_indent -= 2;
	3662	}
	3663	}
	3664	}
	3665	db_indent -= 2;
	3666	if (db_indent == 0)
	3667	nlines = 0;
	3668	}
	3669
	3670
	3671	DB_SHOW_COMMAND(procvm, procvm)
	3672	{
	3673	struct proc *p;
	3674
	3675	if (have_addr) {
	3676	p = (struct proc *) addr;
	3677	} else {
	3678	p = curproc;
	3679	}
	3680
	3681	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
	3682	(void )p, (void )p->p_vmspace, (void *)&p->p_vmspace->vm_map,
	3683	(void *)vmspace_pmap(p->p_vmspace));
	3684
	3685	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
	3686	}
	3687
	3688	#endif /* DDB */