Replace the cache-point linear search algorithm for VM map entries with
1/*
2 * Copyright (c) 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
37 *
38 *
39 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40 * All rights reserved.
41 *
42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43 *
44 * Permission to use, copy, modify and distribute this software and
45 * its documentation is hereby granted, provided that both the copyright
46 * notice and this permission notice appear in all copies of the
47 * software, derivative works or modified versions, and any portions
48 * thereof, and that both notices appear in supporting documentation.
49 *
50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53 *
54 * Carnegie Mellon requests users of this software to return to
55 *
56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
57 * School of Computer Science
58 * Carnegie Mellon University
59 * Pittsburgh PA 15213-3890
60 *
61 * any improvements or extensions that they make and grant Carnegie the
62 * rights to redistribute these changes.
63 *
64 * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
65 * $DragonFly: src/sys/vm/vm_map.c,v 1.37 2005/01/20 18:00:38 dillon Exp $
66 */
67
68/*
69 * Virtual memory mapping module.
70 */
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/proc.h>
75#include <sys/lock.h>
76#include <sys/vmmeter.h>
77#include <sys/mman.h>
78#include <sys/vnode.h>
79#include <sys/resourcevar.h>
80#include <sys/shm.h>
81#include <sys/tree.h>
82
83#include <vm/vm.h>
84#include <vm/vm_param.h>
85#include <vm/pmap.h>
86#include <vm/vm_map.h>
87#include <vm/vm_page.h>
88#include <vm/vm_object.h>
89#include <vm/vm_pager.h>
90#include <vm/vm_kern.h>
91#include <vm/vm_extern.h>
92#include <vm/swap_pager.h>
93#include <vm/vm_zone.h>
94
95#include <sys/thread2.h>
96
97/*
98 * Virtual memory maps provide for the mapping, protection,
99 * and sharing of virtual memory objects. In addition,
100 * this module provides for an efficient virtual copy of
101 * memory from one map to another.
102 *
103 * Synchronization is required prior to most operations.
104 *
105 * Maps consist of an ordered doubly-linked list of simple
106 * entries; a single hint is used to speed up lookups.
107 *
108 * Since portions of maps are specified by start/end addresses,
109 * which may not align with existing map entries, all
110 * routines merely "clip" entries to these start/end values.
111 * [That is, an entry is split into two, bordering at a
112 * start or end value.] Note that these clippings may not
113 * always be necessary (as the two resulting entries are then
114 * not changed); however, the clipping is done for convenience.
115 *
116 * As mentioned above, virtual copy operations are performed
117 * by copying VM object references from one map to
118 * another, and then marking both regions as copy-on-write.
119 */
120
121/*
122 * vm_map_startup:
123 *
124 * Initialize the vm_map module. Must be called before
125 * any other vm_map routines.
126 *
127 * Map and entry structures are allocated from the general
128 * purpose memory pool with some exceptions:
129 *
130 * - The kernel map and kmem submap are allocated statically.
131 * - Kernel map entries are allocated out of a static pool.
132 *
133 * These restrictions are necessary since malloc() uses the
134 * maps and requires map entries.
135 */
136
137#define VMEPERCPU 2
138
139static struct vm_zone mapentzone_store, mapzone_store;
140static vm_zone_t mapentzone, mapzone, vmspace_zone;
141static struct vm_object mapentobj, mapobj;
142
143static struct vm_map_entry map_entry_init[MAX_MAPENT];
144static struct vm_map_entry cpu_map_entry_init[MAXCPU][VMEPERCPU];
145static struct vm_map map_init[MAX_KMAP];
146
147static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
148static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
149static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
150static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
151static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
152static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
153static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
154 vm_map_entry_t);
155static void vm_map_split (vm_map_entry_t);
156static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags);
157
158void
159vm_map_startup(void)
160{
161 mapzone = &mapzone_store;
162 zbootinit(mapzone, "MAP", sizeof (struct vm_map),
163 map_init, MAX_KMAP);
164 mapentzone = &mapentzone_store;
165 zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
166 map_entry_init, MAX_MAPENT);
167}
168
169/*
170 * Red black tree functions
171 */
172static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
173RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
174
175/* a->start is the address; it is the only field that has to be initialized */
176static int
177rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
178{
179 if (a->start < b->start)
180 return(-1);
181 else if (a->start > b->start)
182 return(1);
183 return(0);
184}
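/*
 * Because rb_vm_map_compare() keys only on ->start, an exact-start lookup
 * against the generated tree can be performed with a stack-local dummy
 * entry whose start field alone is initialized.  A minimal sketch, assuming
 * the RB_FIND accessor that RB_GENERATE() emits from <sys/tree.h>; the
 * helper below is hypothetical and guarded out, it is not used in this file.
 */
#if 0
static vm_map_entry_t
vm_map_rb_find_exact(vm_map_t map, vm_offset_t start)
{
	struct vm_map_entry dummy;

	dummy.start = start;	/* the only field the comparator examines */
	return (vm_map_rb_tree_RB_FIND(&map->rb_root, &dummy));
}
#endif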
185
186/*
187 * Allocate a vmspace structure, including a vm_map and pmap,
188 * and initialize those structures. The refcnt is set to 1.
189 * The remaining fields must be initialized by the caller.
190 */
191struct vmspace *
192vmspace_alloc(vm_offset_t min, vm_offset_t max)
193{
194 struct vmspace *vm;
195
196 vm = zalloc(vmspace_zone);
197 vm_map_init(&vm->vm_map, min, max);
198 pmap_pinit(vmspace_pmap(vm));
199 vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */
200 vm->vm_refcnt = 1;
201 vm->vm_shm = NULL;
202 vm->vm_exitingcnt = 0;
203 return (vm);
204}
205
206void
207vm_init2(void)
208{
209 zinitna(mapentzone, &mapentobj, NULL, 0, 0,
210 ZONE_USE_RESERVE | ZONE_SPECIAL, 1);
211 zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1);
212 vmspace_zone = zinit("VMSPACE", sizeof (struct vmspace), 0, 0, 3);
213 pmap_init2();
214 vm_object_init2();
215}
216
217static __inline void
218vmspace_dofree(struct vmspace *vm)
219{
220 int count;
221
222 /*
223 * Make sure any SysV shm is freed; it might not have been
224 * freed in exit1().
225 */
226 shmexit(vm);
227
228 KKASSERT(vm->vm_upcalls == NULL);
229
230 /*
231 * Lock the map, to wait out all other references to it.
232 * Delete all of the mappings and pages they hold, then call
233 * the pmap module to reclaim anything left.
234 */
235 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
236 vm_map_lock(&vm->vm_map);
237 vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
238 vm->vm_map.max_offset, &count);
239 vm_map_unlock(&vm->vm_map);
240 vm_map_entry_release(count);
241
242 pmap_release(vmspace_pmap(vm));
243 zfree(vmspace_zone, vm);
244}
245
246void
247vmspace_free(struct vmspace *vm)
248{
249 if (vm->vm_refcnt == 0)
250 panic("vmspace_free: attempt to free already freed vmspace");
251
252 if (--vm->vm_refcnt == 0 && vm->vm_exitingcnt == 0)
253 vmspace_dofree(vm);
254}
255
256void
257vmspace_exitfree(struct proc *p)
258{
259 struct vmspace *vm;
260
261 vm = p->p_vmspace;
262 p->p_vmspace = NULL;
263
264 /*
265 * cleanup by parent process wait()ing on exiting child. vm_refcnt
266 * may not be 0 (e.g. fork() and child exits without exec()ing).
267 * exitingcnt may increment above 0 and drop back down to zero
268 * several times while vm_refcnt is held non-zero. vm_refcnt
269 * may also increment above 0 and drop back down to zero several
270 * times while vm_exitingcnt is held non-zero.
271 *
272 * The last wait on the exiting child's vmspace will clean up
273 * the remainder of the vmspace.
274 */
275 if (--vm->vm_exitingcnt == 0 && vm->vm_refcnt == 0)
276 vmspace_dofree(vm);
277}
278
279/*
280 * vmspace_swap_count() - count the approximate swap usage in pages for a
281 * vmspace.
282 *
283 * Swap usage is determined by taking the proportional swap used by
284 * VM objects backing the VM map. To make up for fractional losses,
285 * if the VM object has any swap use at all the associated map entries
286 * count for at least 1 swap page.
287 */
288int
289vmspace_swap_count(struct vmspace *vmspace)
290{
291 vm_map_t map = &vmspace->vm_map;
292 vm_map_entry_t cur;
293 int count = 0;
294
295 for (cur = map->header.next; cur != &map->header; cur = cur->next) {
296 vm_object_t object;
297
298 if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
299 (object = cur->object.vm_object) != NULL &&
300 object->type == OBJT_SWAP
301 ) {
302 int n = (cur->end - cur->start) / PAGE_SIZE;
303
304 if (object->un_pager.swp.swp_bcount) {
305 count += object->un_pager.swp.swp_bcount *
306 SWAP_META_PAGES * n / object->size + 1;
307 }
308 }
309 }
310 return(count);
311}
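/*
 * Worked example of the proportional estimate above, with purely
 * illustrative numbers: an 8MB entry (n = 2048 pages) mapping part of a
 * 16MB object (object->size = 4096 pages) with swp_bcount = 10, and taking
 * SWAP_META_PAGES as 16 for the arithmetic, charges
 * 10 * 16 * 2048 / 4096 + 1 = 81 pages to this entry, i.e. roughly half of
 * the object's metered swap plus the one-page round-up.
 */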
312
313
314/*
315 * vm_map_create:
316 *
317 * Creates and returns a new empty VM map with
318 * the given physical map structure, and having
319 * the given lower and upper address bounds.
320 */
321vm_map_t
322vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
323{
324 vm_map_t result;
325
326 result = zalloc(mapzone);
327 vm_map_init(result, min, max);
328 result->pmap = pmap;
329 return (result);
330}
331
332/*
333 * Initialize an existing vm_map structure
334 * such as that in the vmspace structure.
335 * The pmap is set elsewhere.
336 */
337void
338vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max)
339{
340 map->header.next = map->header.prev = &map->header;
341 RB_INIT(&map->rb_root);
342 map->nentries = 0;
343 map->size = 0;
344 map->system_map = 0;
345 map->infork = 0;
346 map->min_offset = min;
347 map->max_offset = max;
348 map->first_free = &map->header;
349 map->hint = &map->header;
350 map->timestamp = 0;
351 lockinit(&map->lock, 0, "thrd_sleep", 0, LK_NOPAUSE);
352}
353
354/*
355 * vm_map_entry_reserve_cpu_init:
356 *
357 * Set an initial negative count so the first attempt to reserve
358 * space preloads a bunch of vm_map_entry's for this cpu. Also
359 * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
360 * map a new page for vm_map_entry structures. SMP systems are
361 * particularly sensitive.
362 *
363 * This routine is called in early boot so we cannot just call
364 * vm_map_entry_reserve().
365 *
366 * May be called for a gd other than mycpu, but may only be called
367 * during early boot.
368 */
369void
370vm_map_entry_reserve_cpu_init(globaldata_t gd)
371{
372 vm_map_entry_t entry;
373 int i;
374
375 gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2;
376 entry = &cpu_map_entry_init[gd->gd_cpuid][0];
377 for (i = 0; i < VMEPERCPU; ++i, ++entry) {
378 entry->next = gd->gd_vme_base;
379 gd->gd_vme_base = entry;
380 }
381}
382
383/*
384 * vm_map_entry_reserve:
385 *
386 * Reserves vm_map_entry structures so code later on can manipulate
387 * map_entry structures within a locked map without blocking trying
388 * to allocate a new vm_map_entry.
389 */
390int
391vm_map_entry_reserve(int count)
392{
393 struct globaldata *gd = mycpu;
394 vm_map_entry_t entry;
395
396 crit_enter();
397
398 /*
399 * Make sure we have enough structures in gd_vme_base to handle
400 * the reservation request.
401 */
402 while (gd->gd_vme_avail < count) {
403 entry = zalloc(mapentzone);
404 entry->next = gd->gd_vme_base;
405 gd->gd_vme_base = entry;
406 ++gd->gd_vme_avail;
407 }
408 gd->gd_vme_avail -= count;
409 crit_exit();
410 return(count);
411}
412
413/*
414 * vm_map_entry_release:
415 *
416 * Releases previously reserved vm_map_entry structures that were not
417 * used. If we have too much junk in our per-cpu cache clean some of
418 * it out.
419 */
420void
421vm_map_entry_release(int count)
422{
423 struct globaldata *gd = mycpu;
424 vm_map_entry_t entry;
425
426 crit_enter();
427 gd->gd_vme_avail += count;
428 while (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
429 entry = gd->gd_vme_base;
430 KKASSERT(entry != NULL);
431 gd->gd_vme_base = entry->next;
432 --gd->gd_vme_avail;
433 crit_exit();
434 zfree(mapentzone, entry);
435 crit_enter();
436 }
437 crit_exit();
438}
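/*
 * Usage sketch for the reserve/release pair above.  This is the pattern
 * the rest of this file follows (vmspace_dofree() is a real instance);
 * the function name and range below are hypothetical and the block is
 * guarded out since it is illustration only.
 */
#if 0
static void
example_remove_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int count;

	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	vm_map_delete(map, start, end, &count);	/* may consume reserved entries */
	vm_map_unlock(map);
	vm_map_entry_release(count);	/* return whatever was not used */
}
#endif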
439
440/*
441 * vm_map_entry_kreserve:
442 *
443 * Reserve map entry structures for use in kernel_map itself. These
444 * entries have *ALREADY* been reserved on a per-cpu basis when the map
445 * was inited. This function is used by zalloc() to avoid a recursion
446 * when zalloc() itself needs to allocate additional kernel memory.
447 *
448 * This function works like the normal reserve but does not load the
449 * vm_map_entry cache (because that would result in an infinite
450 * recursion). Note that gd_vme_avail may go negative. This is expected.
451 *
452 * Any caller of this function must be sure to renormalize after
453 * potentially eating entries to ensure that the reserve supply
454 * remains intact.
455 */
456int
457vm_map_entry_kreserve(int count)
458{
459 struct globaldata *gd = mycpu;
460
461 crit_enter();
462 gd->gd_vme_avail -= count;
463 crit_exit();
464 KASSERT(gd->gd_vme_base != NULL, ("no reserved entries left, gd_vme_avail = %d\n", gd->gd_vme_avail));
465 return(count);
466}
467
468/*
469 * vm_map_entry_krelease:
470 *
471 * Release previously reserved map entries for kernel_map. We do not
472 * attempt to clean up like the normal release function as this would
473 * cause an unnecessary (but probably not fatal) deep procedure call.
474 */
475void
476vm_map_entry_krelease(int count)
477{
478 struct globaldata *gd = mycpu;
479
480 crit_enter();
481 gd->gd_vme_avail += count;
482 crit_exit();
483}
484
485/*
486 * vm_map_entry_create: [ internal use only ]
487 *
488 * Allocates a VM map entry for insertion. No entry fields are filled
489 * in.
490 *
491 * This routine may be called from an interrupt thread but not a FAST
492 * interrupt. This routine may recurse the map lock.
493 */
494static vm_map_entry_t
495vm_map_entry_create(vm_map_t map, int *countp)
496{
497 struct globaldata *gd = mycpu;
498 vm_map_entry_t entry;
499
500 KKASSERT(*countp > 0);
501 --*countp;
502 crit_enter();
503 entry = gd->gd_vme_base;
504 KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
505 gd->gd_vme_base = entry->next;
506 crit_exit();
507 return(entry);
508}
509
510/*
511 * vm_map_entry_dispose: [ internal use only ]
512 *
513 * Dispose of a vm_map_entry that is no longer being referenced. This
514 * function may be called from an interrupt.
515 */
516static void
517vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
518{
519 struct globaldata *gd = mycpu;
520
521 KKASSERT(map->hint != entry);
522 KKASSERT(map->first_free != entry);
523
524 ++*countp;
525 crit_enter();
526 entry->next = gd->gd_vme_base;
527 gd->gd_vme_base = entry;
528 crit_exit();
529}
530
531
532/*
533 * vm_map_entry_{un,}link:
534 *
535 * Insert/remove entries from maps.
536 */
537static __inline void
538vm_map_entry_link(vm_map_t map,
539 vm_map_entry_t after_where,
540 vm_map_entry_t entry)
541{
542 map->nentries++;
543 entry->prev = after_where;
544 entry->next = after_where->next;
545 entry->next->prev = entry;
546 after_where->next = entry;
547 vm_map_rb_tree_RB_INSERT(&map->rb_root, entry);
548}
549
550static __inline void
551vm_map_entry_unlink(vm_map_t map,
552 vm_map_entry_t entry)
553{
554 vm_map_entry_t prev;
555 vm_map_entry_t next;
556
557 if (entry->eflags & MAP_ENTRY_IN_TRANSITION)
558 panic("vm_map_entry_unlink: attempt to mess with locked entry! %p", entry);
559 prev = entry->prev;
560 next = entry->next;
561 next->prev = prev;
562 prev->next = next;
563 vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
564 map->nentries--;
565}
566
567/*
568 * vm_map_lookup_entry: [ internal use only ]
569 *
570 * Finds the map entry containing (or
571 * immediately preceding) the specified address
572 * in the given map; the entry is returned
573 * in the "entry" parameter. The boolean
574 * result indicates whether the address is
575 * actually contained in the map.
576 */
577boolean_t
578vm_map_lookup_entry(vm_map_t map, vm_offset_t address,
579 vm_map_entry_t *entry /* OUT */)
580{
581 vm_map_entry_t tmp;
582 vm_map_entry_t last;
583
584#if 0
585 /*
586 * XXX TEMPORARILY DISABLED. For some reason our attempt to revive
587 * the hint code with the red-black lookup meets with system crashes
588 * and lockups. We do not yet know why.
589 *
590 * It is possible that the problem is related to the setting
591 * of the hint during map_entry deletion, in the code specified
592 * at the GGG comment later on in this file.
593 */
594 /*
595 * Quickly check the cached hint, there's a good chance of a match.
596 */
597 if (map->hint != &map->header) {
598 tmp = map->hint;
599 if (address >= tmp->start && address < tmp->end) {
600 *entry = tmp;
601 return(TRUE);
602 }
603 }
604#endif
605
606 /*
607 * Locate the record from the top of the tree. 'last' tracks the
608 * closest prior record and is returned if no match is found, which
609 * in binary tree terms means tracking the most recent right-branch
610 * taken. If there is no prior record, &map->header is returned.
611 */
612 last = &map->header;
613 tmp = RB_ROOT(&map->rb_root);
614
615 while (tmp) {
616 if (address >= tmp->start) {
617 if (address < tmp->end) {
618 *entry = tmp;
619 map->hint = tmp;
620 return(TRUE);
621 }
622 last = tmp;
623 tmp = RB_RIGHT(tmp, rb_entry);
624 } else {
625 tmp = RB_LEFT(tmp, rb_entry);
626 }
628 }
629 *entry = last;
630 return (FALSE);
631}
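/*
 * Callers normally use the boolean result to choose between clipping the
 * containing entry and starting at the entry's successor, as the routines
 * later in this file do.  A minimal sketch of that idiom (hypothetical
 * caller, guarded out; vm_map_clip_start() is the macro defined further
 * below).
 */
#if 0
static void
example_start_at(vm_map_t map, vm_offset_t start, int *countp)
{
	vm_map_entry_t entry;

	if (vm_map_lookup_entry(map, start, &entry)) {
		/* start lies inside 'entry', split it at 'start' */
		vm_map_clip_start(map, entry, start, countp);
	} else {
		/* 'entry' precedes 'start', the range begins at its successor */
		entry = entry->next;
	}
	/* ... operate on 'entry' and its successors ... */
}
#endif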
632
633/*
634 * vm_map_insert:
635 *
636 * Inserts the given whole VM object into the target
637 * map at the specified address range. The object's
638 * size should match that of the address range.
639 *
640 * Requires that the map be locked, and leaves it so. Requires that
641 * sufficient vm_map_entry structures have been reserved and tracks
642 * the use via countp.
643 *
644 * If object is non-NULL, ref count must be bumped by caller
645 * prior to making call to account for the new entry.
646 */
647int
648vm_map_insert(vm_map_t map, int *countp,
649 vm_object_t object, vm_ooffset_t offset,
650 vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
651 int cow)
652{
653 vm_map_entry_t new_entry;
654 vm_map_entry_t prev_entry;
655 vm_map_entry_t temp_entry;
656 vm_eflags_t protoeflags;
657
658 /*
659 * Check that the start and end points are not bogus.
660 */
661
662 if ((start < map->min_offset) || (end > map->max_offset) ||
663 (start >= end))
664 return (KERN_INVALID_ADDRESS);
665
666 /*
667 * Find the entry prior to the proposed starting address; if it's part
668 * of an existing entry, this range is bogus.
669 */
670
671 if (vm_map_lookup_entry(map, start, &temp_entry))
672 return (KERN_NO_SPACE);
673
674 prev_entry = temp_entry;
675
676 /*
677 * Assert that the next entry doesn't overlap the end point.
678 */
679
680 if ((prev_entry->next != &map->header) &&
681 (prev_entry->next->start < end))
682 return (KERN_NO_SPACE);
683
684 protoeflags = 0;
685
686 if (cow & MAP_COPY_ON_WRITE)
687 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
688
689 if (cow & MAP_NOFAULT) {
690 protoeflags |= MAP_ENTRY_NOFAULT;
691
692 KASSERT(object == NULL,
693 ("vm_map_insert: paradoxical MAP_NOFAULT request"));
694 }
695 if (cow & MAP_DISABLE_SYNCER)
696 protoeflags |= MAP_ENTRY_NOSYNC;
697 if (cow & MAP_DISABLE_COREDUMP)
698 protoeflags |= MAP_ENTRY_NOCOREDUMP;
699
700 if (object) {
701 /*
702 * When object is non-NULL, it could be shared with another
703 * process. We have to set or clear OBJ_ONEMAPPING
704 * appropriately.
705 */
706 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
707 vm_object_clear_flag(object, OBJ_ONEMAPPING);
708 }
709 }
710 else if ((prev_entry != &map->header) &&
711 (prev_entry->eflags == protoeflags) &&
712 (prev_entry->end == start) &&
713 (prev_entry->wired_count == 0) &&
714 ((prev_entry->object.vm_object == NULL) ||
715 vm_object_coalesce(prev_entry->object.vm_object,
716 OFF_TO_IDX(prev_entry->offset),
717 (vm_size_t)(prev_entry->end - prev_entry->start),
718 (vm_size_t)(end - prev_entry->end)))) {
719 /*
720 * We were able to extend the object. Determine if we
721 * can extend the previous map entry to include the
722 * new range as well.
723 */
724 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
725 (prev_entry->protection == prot) &&
726 (prev_entry->max_protection == max)) {
727 map->size += (end - prev_entry->end);
728 prev_entry->end = end;
729 vm_map_simplify_entry(map, prev_entry, countp);
730 return (KERN_SUCCESS);
731 }
732
733 /*
734 * If we can extend the object but cannot extend the
735 * map entry, we have to create a new map entry. We
736 * must bump the ref count on the extended object to
737 * account for it. object may be NULL.
738 */
739 object = prev_entry->object.vm_object;
740 offset = prev_entry->offset +
741 (prev_entry->end - prev_entry->start);
742 vm_object_reference(object);
743 }
744
745 /*
746 * NOTE: if conditionals fail, object can be NULL here. This occurs
747 * in things like the buffer map where we manage kva but do not manage
748 * backing objects.
749 */
750
751 /*
752 * Create a new entry
753 */
754
755 new_entry = vm_map_entry_create(map, countp);
756 new_entry->start = start;
757 new_entry->end = end;
758
759 new_entry->eflags = protoeflags;
760 new_entry->object.vm_object = object;
761 new_entry->offset = offset;
762 new_entry->avail_ssize = 0;
763
764 new_entry->inheritance = VM_INHERIT_DEFAULT;
765 new_entry->protection = prot;
766 new_entry->max_protection = max;
767 new_entry->wired_count = 0;
768
769 /*
770 * Insert the new entry into the list
771 */
772
773 vm_map_entry_link(map, prev_entry, new_entry);
774 map->size += new_entry->end - new_entry->start;
775
776 /*
777 * Update the free space hint
778 */
779 if ((map->first_free == prev_entry) &&
780 (prev_entry->end >= new_entry->start)) {
781 map->first_free = new_entry;
782 }
783
784#if 0
785 /*
786 * Temporarily removed to avoid MAP_STACK panic, due to
787 * MAP_STACK being a huge hack. Will be added back in
788 * when MAP_STACK (and the user stack mapping) is fixed.
789 */
790 /*
791 * It may be possible to simplify the entry
792 */
793 vm_map_simplify_entry(map, new_entry, countp);
794#endif
795
796 if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
797 pmap_object_init_pt(map->pmap, start, prot,
798 object, OFF_TO_IDX(offset), end - start,
799 cow & MAP_PREFAULT_PARTIAL);
800 }
801
802 return (KERN_SUCCESS);
803}
804
805/*
806 * Find sufficient space for `length' bytes in the given map, starting at
807 * `start'. The map must be locked. Returns 0 on success, 1 on no space.
808 *
809 * This function will return an arbitrarily aligned pointer. If no
810 * particular alignment is required you should pass align as 1. Note that
811 * the map may return PAGE_SIZE aligned pointers if all the lengths used in
812 * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
813 * argument.
814 *
815 * 'align' should be a power of 2 but is not required to be.
816 */
817int
818vm_map_findspace(
819 vm_map_t map,
820 vm_offset_t start,
821 vm_size_t length,
822 vm_offset_t align,
823 vm_offset_t *addr)
824{
825 vm_map_entry_t entry, next;
826 vm_offset_t end;
827 vm_offset_t align_mask;
828
829 if (start < map->min_offset)
830 start = map->min_offset;
831 if (start > map->max_offset)
832 return (1);
833
834 /*
835 * If the alignment is not a power of 2 we will have to use
836 * a mod/division, set align_mask to a special value.
837 */
838 if ((align | (align - 1)) + 1 != (align << 1))
839 align_mask = (vm_offset_t)-1;
840 else
841 align_mask = align - 1;
842
843retry:
844 /*
845 * Look for the first possible address; if there's already something
846 * at this address, we have to start after it.
847 */
848 if (start == map->min_offset) {
849 if ((entry = map->first_free) != &map->header)
850 start = entry->end;
851 } else {
852 vm_map_entry_t tmp;
853
854 if (vm_map_lookup_entry(map, start, &tmp))
855 start = tmp->end;
856 entry = tmp;
857 }
858
859 /*
860 * Look through the rest of the map, trying to fit a new region in the
861 * gap between existing regions, or after the very last region.
862 */
863 for (;; start = (entry = next)->end) {
864 /*
865 * Adjust the proposed start by the requested alignment,
866 * be sure that we didn't wrap the address.
867 */
868 if (align_mask == (vm_offset_t)-1)
869 end = ((start + align - 1) / align) * align;
870 else
871 end = (start + align_mask) & ~align_mask;
872 if (end < start)
873 return (1);
874 start = end;
875 /*
876 * Find the end of the proposed new region. Be sure we didn't
877 * go beyond the end of the map, or wrap around the address.
878 * Then check to see if this is the last entry or if the
879 * proposed end fits in the gap between this and the next
880 * entry.
881 */
882 end = start + length;
883 if (end > map->max_offset || end < start)
884 return (1);
885 next = entry->next;
886 if (next == &map->header || next->start >= end)
887 break;
888 }
889 map->hint = entry;
890 if (map == kernel_map) {
891 vm_offset_t ksize;
892 if ((ksize = round_page(start + length)) > kernel_vm_end) {
893 pmap_growkernel(ksize);
894 goto retry;
895 }
896 }
897 *addr = start;
898 return (0);
899}
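/*
 * Worked example of the alignment handling above.  For a power of 2 such
 * as align = 0x1000: (align | (align - 1)) + 1 == 0x1fff + 1 == 0x2000 ==
 * (align << 1), so the cheap mask path is used and a start of 0x12345
 * rounds to (0x12345 + 0xfff) & ~0xfff == 0x13000.  For a non-power of 2
 * such as align = 0x3000: (align | (align - 1)) + 1 == 0x4000 != 0x6000,
 * so align_mask is set to -1 and the divide path rounds 0x12345 up to
 * ((0x12345 + 0x2fff) / 0x3000) * 0x3000 == 0x15000.
 */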
900
901/*
902 * vm_map_find finds an unallocated region in the target address
903 * map with the given length. The search is defined to be
904 * first-fit from the specified address; the region found is
905 * returned in the same parameter.
906 *
907 * If object is non-NULL, ref count must be bumped by caller
908 * prior to making call to account for the new entry.
909 */
910int
911vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
912 vm_offset_t *addr, /* IN/OUT */
913 vm_size_t length, boolean_t find_space, vm_prot_t prot,
914 vm_prot_t max, int cow)
915{
916 vm_offset_t start;
917 int result;
918 int count;
919
920 start = *addr;
921
922 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
923 vm_map_lock(map);
924 if (find_space) {
925 if (vm_map_findspace(map, start, length, 1, addr)) {
926 vm_map_unlock(map);
927 vm_map_entry_release(count);
928 return (KERN_NO_SPACE);
929 }
930 start = *addr;
931 }
932 result = vm_map_insert(map, &count, object, offset,
933 start, start + length, prot, max, cow);
934 vm_map_unlock(map);
935 vm_map_entry_release(count);
936
937 return (result);
938}
939
940/*
941 * vm_map_simplify_entry:
942 *
943 * Simplify the given map entry by merging with either neighbor. This
944 * routine also has the ability to merge with both neighbors.
945 *
946 * The map must be locked.
947 *
948 * This routine guarantees that the passed entry remains valid (though
949 * possibly extended). When merging, this routine may delete one or
950 * both neighbors. No action is taken on entries which have their
951 * in-transition flag set.
952 */
953void
954vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
955{
956 vm_map_entry_t next, prev;
957 vm_size_t prevsize, esize;
958
959 if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) {
960 ++mycpu->gd_cnt.v_intrans_coll;
961 return;
962 }
963
964 prev = entry->prev;
965 if (prev != &map->header) {
966 prevsize = prev->end - prev->start;
967 if ( (prev->end == entry->start) &&
968 (prev->object.vm_object == entry->object.vm_object) &&
969 (!prev->object.vm_object ||
970 (prev->offset + prevsize == entry->offset)) &&
971 (prev->eflags == entry->eflags) &&
972 (prev->protection == entry->protection) &&
973 (prev->max_protection == entry->max_protection) &&
974 (prev->inheritance == entry->inheritance) &&
975 (prev->wired_count == entry->wired_count)) {
976 if (map->first_free == prev)
977 map->first_free = entry;
978 if (map->hint == prev)
979 map->hint = entry;
980 vm_map_entry_unlink(map, prev);
981 entry->start = prev->start;
982 entry->offset = prev->offset;
983 if (prev->object.vm_object)
984 vm_object_deallocate(prev->object.vm_object);
985 vm_map_entry_dispose(map, prev, countp);
986 }
987 }
988
989 next = entry->next;
990 if (next != &map->header) {
991 esize = entry->end - entry->start;
992 if ((entry->end == next->start) &&
993 (next->object.vm_object == entry->object.vm_object) &&
994 (!entry->object.vm_object ||
995 (entry->offset + esize == next->offset)) &&
996 (next->eflags == entry->eflags) &&
997 (next->protection == entry->protection) &&
998 (next->max_protection == entry->max_protection) &&
999 (next->inheritance == entry->inheritance) &&
1000 (next->wired_count == entry->wired_count)) {
1001 if (map->first_free == next)
1002 map->first_free = entry;
1003 if (map->hint == next)
1004 map->hint = entry;
1005 vm_map_entry_unlink(map, next);
1006 entry->end = next->end;
1007 if (next->object.vm_object)
1008 vm_object_deallocate(next->object.vm_object);
1009 vm_map_entry_dispose(map, next, countp);
1010 }
1011 }
1012}
1013/*
1014 * vm_map_clip_start: [ internal use only ]
1015 *
1016 * Asserts that the given entry begins at or after
1017 * the specified address; if necessary,
1018 * it splits the entry into two.
1019 */
1020#define vm_map_clip_start(map, entry, startaddr, countp) \
1021{ \
1022 if (startaddr > entry->start) \
1023 _vm_map_clip_start(map, entry, startaddr, countp); \
1024}
1025
1026/*
1027 * This routine is called only when it is known that
1028 * the entry must be split.
1029 */
1030static void
1031_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start, int *countp)
1032{
1033 vm_map_entry_t new_entry;
1034
1035 /*
1036 * Split off the front portion -- note that we must insert the new
1037 * entry BEFORE this one, so that this entry has the specified
1038 * starting address.
1039 */
1040
1041 vm_map_simplify_entry(map, entry, countp);
1042
1043 /*
1044 * If there is no object backing this entry, we might as well create
1045 * one now. If we defer it, an object can get created after the map
1046 * is clipped, and individual objects will be created for the split-up
1047 * map. This is a bit of a hack, but is also about the best place to
1048 * put this improvement.
1049 */
1050
1051 if (entry->object.vm_object == NULL && !map->system_map) {
1052 vm_object_t object;
1053 object = vm_object_allocate(OBJT_DEFAULT,
1054 atop(entry->end - entry->start));
1055 entry->object.vm_object = object;
1056 entry->offset = 0;
1057 }
1058
1059 new_entry = vm_map_entry_create(map, countp);
1060 *new_entry = *entry;
1061
1062 new_entry->end = start;
1063 entry->offset += (start - entry->start);
1064 entry->start = start;
1065
1066 vm_map_entry_link(map, entry->prev, new_entry);
1067
1068 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1069 vm_object_reference(new_entry->object.vm_object);
1070 }
1071}
1072
1073/*
1074 * vm_map_clip_end: [ internal use only ]
1075 *
1076 * Asserts that the given entry ends at or before
1077 * the specified address; if necessary,
1078 * it splits the entry into two.
1079 */
1080
1081#define vm_map_clip_end(map, entry, endaddr, countp) \
1082{ \
1083 if (endaddr < entry->end) \
1084 _vm_map_clip_end(map, entry, endaddr, countp); \
1085}
1086
1087/*
1088 * This routine is called only when it is known that
1089 * the entry must be split.
1090 */
1091static void
1092_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end, int *countp)
1093{
1094 vm_map_entry_t new_entry;
1095
1096 /*
1097 * If there is no object backing this entry, we might as well create
1098 * one now. If we defer it, an object can get created after the map
1099 * is clipped, and individual objects will be created for the split-up
1100 * map. This is a bit of a hack, but is also about the best place to
1101 * put this improvement.
1102 */
1103
1104 if (entry->object.vm_object == NULL && !map->system_map) {
1105 vm_object_t object;
1106 object = vm_object_allocate(OBJT_DEFAULT,
1107 atop(entry->end - entry->start));
1108 entry->object.vm_object = object;
1109 entry->offset = 0;
1110 }
1111
1112 /*
1113 * Create a new entry and insert it AFTER the specified entry
1114 */
1115
1116 new_entry = vm_map_entry_create(map, countp);
1117 *new_entry = *entry;
1118
1119 new_entry->start = entry->end = end;
1120 new_entry->offset += (end - entry->start);
1121
1122 vm_map_entry_link(map, entry, new_entry);
1123
1124 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1125 vm_object_reference(new_entry->object.vm_object);
1126 }
1127}
1128
1129/*
1130 * VM_MAP_RANGE_CHECK: [ internal use only ]
1131 *
1132 * Asserts that the starting and ending region
1133 * addresses fall within the valid range of the map.
1134 */
1135#define VM_MAP_RANGE_CHECK(map, start, end) \
1136 { \
1137 if (start < vm_map_min(map)) \
1138 start = vm_map_min(map); \
1139 if (end > vm_map_max(map)) \
1140 end = vm_map_max(map); \
1141 if (start > end) \
1142 start = end; \
1143 }
1144
1145/*
1146 * vm_map_transition_wait: [ kernel use only ]
1147 *
1148 * Used to block when an in-transition collision occurs. The map
1149 * is unlocked for the sleep and relocked before the return.
1150 */
1151static
1152void
1153vm_map_transition_wait(vm_map_t map)
1154{
1155 vm_map_unlock(map);
1156 tsleep(map, 0, "vment", 0);
1157 vm_map_lock(map);
1158}
1159
1160/*
1161 * CLIP_CHECK_BACK
1162 * CLIP_CHECK_FWD
1163 *
1164 * When we do blocking operations with the map lock held it is
1165 * possible that a clip might have occurred on our in-transit entry,
1166 * requiring an adjustment to the entry in our loop. These macros
1167 * help the pageable and clip_range code deal with the case. The
1168 * conditional costs virtually nothing if no clipping has occurred.
1169 */
1170
1171#define CLIP_CHECK_BACK(entry, save_start) \
1172 do { \
1173 while (entry->start != save_start) { \
1174 entry = entry->prev; \
1175 KASSERT(entry != &map->header, ("bad entry clip")); \
1176 } \
1177 } while(0)
1178
1179#define CLIP_CHECK_FWD(entry, save_end) \
1180 do { \
1181 while (entry->end != save_end) { \
1182 entry = entry->next; \
1183 KASSERT(entry != &map->header, ("bad entry clip")); \
1184 } \
1185 } while(0)
1186
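/*
 * Sketch of how these macros bracket a blocking wait; this mirrors the
 * real uses in vm_map_clip_range() and vm_map_unwire() below and is
 * guarded out, illustration only.
 */
#if 0
	save_end = entry->end;
	vm_map_transition_wait(map);		/* may block; 'entry' may be clipped */
	CLIP_CHECK_FWD(entry, save_end);	/* walk to the fragment ending at save_end */
	CLIP_CHECK_BACK(start_entry, start);	/* walk back to the fragment at 'start' */
#endif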
1187
1188/*
1189 * vm_map_clip_range: [ kernel use only ]
1190 *
1191 * Clip the specified range and return the base entry. The
1192 * range may cover several entries starting at the returned base
1193 * and the first and last entry in the covering sequence will be
1194 * properly clipped to the requested start and end address.
1195 *
1196 * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1197 * flag.
1198 *
1199 * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1200 * covered by the requested range.
1201 *
1202 * The map must be exclusively locked on entry and will remain locked
1203 * on return. If no range exists or the range contains holes and you
1204 * specified that no holes were allowed, NULL will be returned. This
1205 * routine may temporarily unlock the map in order to avoid a deadlock when
1206 * sleeping.
1207 */
1208static
1209vm_map_entry_t
1210vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1211 int *countp, int flags)
1212{
1213 vm_map_entry_t start_entry;
1214 vm_map_entry_t entry;
1215
1216 /*
1217 * Locate the entry and effect initial clipping. The in-transition
1218 * case does not occur very often so do not try to optimize it.
1219 */
1220again:
1221 if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1222 return (NULL);
1223 entry = start_entry;
1224 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1225 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1226 ++mycpu->gd_cnt.v_intrans_coll;
1227 ++mycpu->gd_cnt.v_intrans_wait;
1228 vm_map_transition_wait(map);
1229 /*
1230 * entry and/or start_entry may have been clipped while
1231 * we slept, or may have gone away entirely. We have
1232 * to restart from the lookup.
1233 */
1234 goto again;
1235 }
1236 /*
1237 * Since we hold an exclusive map lock we do not have to restart
1238 * after clipping, even though clipping may block in zalloc.
1239 */
1240 vm_map_clip_start(map, entry, start, countp);
1241 vm_map_clip_end(map, entry, end, countp);
1242 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1243
1244 /*
1245 * Scan entries covered by the range. When working on the next
1246 * entry a restart need only re-loop on the current entry which
1247 * we have already locked, since 'next' may have changed. Also,
1248 * even though entry is safe, it may have been clipped so we
1249 * have to iterate forwards through the clip after sleeping.
1250 */
1251 while (entry->next != &map->header && entry->next->start < end) {
1252 vm_map_entry_t next = entry->next;
1253
1254 if (flags & MAP_CLIP_NO_HOLES) {
1255 if (next->start > entry->end) {
1256 vm_map_unclip_range(map, start_entry,
1257 start, entry->end, countp, flags);
1258 return(NULL);
1259 }
1260 }
1261
1262 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1263 vm_offset_t save_end = entry->end;
1264 next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1265 ++mycpu->gd_cnt.v_intrans_coll;
1266 ++mycpu->gd_cnt.v_intrans_wait;
1267 vm_map_transition_wait(map);
1268
1269 /*
1270 * clips might have occurred while we blocked.
1271 */
1272 CLIP_CHECK_FWD(entry, save_end);
1273 CLIP_CHECK_BACK(start_entry, start);
1274 continue;
1275 }
1276 /*
1277 * No restart necessary even though clip_end may block, we
1278 * are holding the map lock.
1279 */
1280 vm_map_clip_end(map, next, end, countp);
1281 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1282 entry = next;
1283 }
1284 if (flags & MAP_CLIP_NO_HOLES) {
1285 if (entry->end != end) {
1286 vm_map_unclip_range(map, start_entry,
1287 start, entry->end, countp, flags);
1288 return(NULL);
1289 }
1290 }
1291 return(start_entry);
1292}
1293
1294/*
1295 * vm_map_unclip_range: [ kernel use only ]
1296 *
1297 * Undo the effect of vm_map_clip_range(). You should pass the same
1298 * flags and the same range that you passed to vm_map_clip_range().
1299 * This code will clear the in-transition flag on the entries and
1300 * wake up anyone waiting. This code will also simplify the sequence
1301 * and attempt to merge it with entries before and after the sequence.
1302 *
1303 * The map must be locked on entry and will remain locked on return.
1304 *
1305 * Note that you should also pass the start_entry returned by
1306 * vm_map_clip_range(). However, if you block between the two calls
1307 * with the map unlocked please be aware that the start_entry may
1308 * have been clipped and you may need to scan it backwards to find
1309 * the entry corresponding with the original start address. You are
1310 * responsible for this, vm_map_unclip_range() expects the correct
1311 * start_entry to be passed to it and will KASSERT otherwise.
1312 */
1313static
1314void
1315vm_map_unclip_range(
1316 vm_map_t map,
1317 vm_map_entry_t start_entry,
1318 vm_offset_t start,
1319 vm_offset_t end,
1320 int *countp,
1321 int flags)
1322{
1323 vm_map_entry_t entry;
1324
1325 entry = start_entry;
1326
1327 KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1328 while (entry != &map->header && entry->start < end) {
1329 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("in-transition flag not set during unclip on: %p", entry));
1330 KASSERT(entry->end <= end, ("unclip_range: tail wasn't clipped"));
1331 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1332 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1333 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1334 wakeup(map);
1335 }
1336 entry = entry->next;
1337 }
1338
1339 /*
1340 * Simplification does not block so there is no restart case.
1341 */
1342 entry = start_entry;
1343 while (entry != &map->header && entry->start < end) {
1344 vm_map_simplify_entry(map, entry, countp);
1345 entry = entry->next;
1346 }
1347}
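/*
 * The clip/unclip pair above is used as a bracket around operations that
 * may block and temporarily drop the map lock, exactly as vm_map_unwire()
 * and vm_map_wire() do below.  A condensed sketch of the bracket
 * (hypothetical body, guarded out, illustration only).
 */
#if 0
static int
example_bracketed_op(vm_map_t map, vm_offset_t start, vm_offset_t end,
		     int *countp)
{
	vm_map_entry_t start_entry;

	start_entry = vm_map_clip_range(map, start, end, countp,
					MAP_CLIP_NO_HOLES);
	if (start_entry == NULL)
		return (KERN_INVALID_ADDRESS);
	/* ... walk the in-transition entries, possibly blocking ... */
	vm_map_unclip_range(map, start_entry, start, end, countp,
			    MAP_CLIP_NO_HOLES);
	return (KERN_SUCCESS);
}
#endif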
1348
1349/*
1350 * vm_map_submap: [ kernel use only ]
1351 *
1352 * Mark the given range as handled by a subordinate map.
1353 *
1354 * This range must have been created with vm_map_find,
1355 * and no other operations may have been performed on this
1356 * range prior to calling vm_map_submap.
1357 *
1358 * Only a limited number of operations can be performed
1359 * within this range after calling vm_map_submap:
1360 * vm_fault
1361 * [Don't try vm_map_copy!]
1362 *
1363 * To remove a submapping, one must first remove the
1364 * range from the superior map, and then destroy the
1365 * submap (if desired). [Better yet, don't try it.]
1366 */
1367int
1368vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
1369{
1370 vm_map_entry_t entry;
1371 int result = KERN_INVALID_ARGUMENT;
1372 int count;
1373
1374 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1375 vm_map_lock(map);
1376
1377 VM_MAP_RANGE_CHECK(map, start, end);
1378
1379 if (vm_map_lookup_entry(map, start, &entry)) {
1380 vm_map_clip_start(map, entry, start, &count);
1381 } else {
1382 entry = entry->next;
1383 }
1384
1385 vm_map_clip_end(map, entry, end, &count);
1386
1387 if ((entry->start == start) && (entry->end == end) &&
1388 ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1389 (entry->object.vm_object == NULL)) {
1390 entry->object.sub_map = submap;
1391 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
1392 result = KERN_SUCCESS;
1393 }
1394 vm_map_unlock(map);
1395 vm_map_entry_release(count);
1396
1397 return (result);
1398}
1399
1400/*
1401 * vm_map_protect:
1402 *
1403 * Sets the protection of the specified address
1404 * region in the target map. If "set_max" is
1405 * specified, the maximum protection is to be set;
1406 * otherwise, only the current protection is affected.
1407 */
1408int
1409vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1410 vm_prot_t new_prot, boolean_t set_max)
1411{
1412 vm_map_entry_t current;
1413 vm_map_entry_t entry;
1414 int count;
1415
1416 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1417 vm_map_lock(map);
1418
1419 VM_MAP_RANGE_CHECK(map, start, end);
1420
1421 if (vm_map_lookup_entry(map, start, &entry)) {
1422 vm_map_clip_start(map, entry, start, &count);
1423 } else {
1424 entry = entry->next;
1425 }
1426
1427 /*
1428 * Make a first pass to check for protection violations.
1429 */
1430
1431 current = entry;
1432 while ((current != &map->header) && (current->start < end)) {
1433 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1434 vm_map_unlock(map);
1435 vm_map_entry_release(count);
1436 return (KERN_INVALID_ARGUMENT);
1437 }
1438 if ((new_prot & current->max_protection) != new_prot) {
1439 vm_map_unlock(map);
1440 vm_map_entry_release(count);
1441 return (KERN_PROTECTION_FAILURE);
1442 }
1443 current = current->next;
1444 }
1445
1446 /*
1447 * Go back and fix up protections. [Note that clipping is not
1448 * necessary the second time.]
1449 */
1450 current = entry;
1451
1452 while ((current != &map->header) && (current->start < end)) {
1453 vm_prot_t old_prot;
1454
1455 vm_map_clip_end(map, current, end, &count);
1456
1457 old_prot = current->protection;
1458 if (set_max)
1459 current->protection =
1460 (current->max_protection = new_prot) &
1461 old_prot;
1462 else
1463 current->protection = new_prot;
1464
1465 /*
1466 * Update physical map if necessary. Worry about copy-on-write
1467 * here -- CHECK THIS XXX
1468 */
1469
1470 if (current->protection != old_prot) {
1471#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1472 VM_PROT_ALL)
1473
1474 pmap_protect(map->pmap, current->start,
1475 current->end,
1476 current->protection & MASK(current));
1477#undef MASK
1478 }
1479
1480 vm_map_simplify_entry(map, current, &count);
1481
1482 current = current->next;
1483 }
1484
1485 vm_map_unlock(map);
1486 vm_map_entry_release(count);
1487 return (KERN_SUCCESS);
1488}
1489
1490/*
1491 * vm_map_madvise:
1492 *
1493 * This routine traverses a process's map handling the madvise
1494 * system call. Advisories are classified as either those affecting
1495 * the vm_map_entry structure, or those affecting the underlying
1496 * objects.
1497 */
1498
1499int
1500vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end, int behav)
1501{
1502 vm_map_entry_t current, entry;
1503 int modify_map = 0;
1504 int count;
1505
1506 /*
1507 * Some madvise calls directly modify the vm_map_entry, in which case
1508 * we need to use an exclusive lock on the map and we need to perform
1509 * various clipping operations. Otherwise we only need a read-lock
1510 * on the map.
1511 */
1512
1513 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1514
1515 switch(behav) {
1516 case MADV_NORMAL:
1517 case MADV_SEQUENTIAL:
1518 case MADV_RANDOM:
1519 case MADV_NOSYNC:
1520 case MADV_AUTOSYNC:
1521 case MADV_NOCORE:
1522 case MADV_CORE:
1523 modify_map = 1;
1524 vm_map_lock(map);
1525 break;
1526 case MADV_WILLNEED:
1527 case MADV_DONTNEED:
1528 case MADV_FREE:
1529 vm_map_lock_read(map);
1530 break;
1531 default:
1532 vm_map_entry_release(count);
1533 return (KERN_INVALID_ARGUMENT);
1534 }
1535
1536 /*
1537 * Locate starting entry and clip if necessary.
1538 */
1539
1540 VM_MAP_RANGE_CHECK(map, start, end);
1541
1542 if (vm_map_lookup_entry(map, start, &entry)) {
1543 if (modify_map)
1544 vm_map_clip_start(map, entry, start, &count);
1545 } else {
1546 entry = entry->next;
1547 }
1548
1549 if (modify_map) {
1550 /*
1551 * madvise behaviors that are implemented in the vm_map_entry.
1552 *
1553 * We clip the vm_map_entry so that behavioral changes are
1554 * limited to the specified address range.
1555 */
1556 for (current = entry;
1557 (current != &map->header) && (current->start < end);
1558 current = current->next
1559 ) {
1560 if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
1561 continue;
1562
1563 vm_map_clip_end(map, current, end, &count);
1564
1565 switch (behav) {
1566 case MADV_NORMAL:
1567 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
1568 break;
1569 case MADV_SEQUENTIAL:
1570 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
1571 break;
1572 case MADV_RANDOM:
1573 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
1574 break;
1575 case MADV_NOSYNC:
1576 current->eflags |= MAP_ENTRY_NOSYNC;
1577 break;
1578 case MADV_AUTOSYNC:
1579 current->eflags &= ~MAP_ENTRY_NOSYNC;
1580 break;
1581 case MADV_NOCORE:
1582 current->eflags |= MAP_ENTRY_NOCOREDUMP;
1583 break;
1584 case MADV_CORE:
1585 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
1586 break;
1587 default:
1588 break;
1589 }
1590 vm_map_simplify_entry(map, current, &count);
1591 }
1592 vm_map_unlock(map);
1593 } else {
1594 vm_pindex_t pindex;
1595 int count;
1596
1597 /*
1598 * madvise behaviors that are implemented in the underlying
1599 * vm_object.
1600 *
1601 * Since we don't clip the vm_map_entry, we have to clip
1602 * the vm_object pindex and count.
1603 */
1604 for (current = entry;
1605 (current != &map->header) && (current->start < end);
1606 current = current->next
1607 ) {
1608 vm_offset_t useStart;
1609
1610 if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
1611 continue;
1612
1613 pindex = OFF_TO_IDX(current->offset);
1614 count = atop(current->end - current->start);
1615 useStart = current->start;
1616
1617 if (current->start < start) {
1618 pindex += atop(start - current->start);
1619 count -= atop(start - current->start);
1620 useStart = start;
1621 }
1622 if (current->end > end)
1623 count -= atop(current->end - end);
1624
1625 if (count <= 0)
1626 continue;
1627
1628 vm_object_madvise(current->object.vm_object,
1629 pindex, count, behav);
1630 if (behav == MADV_WILLNEED) {
1631 pmap_object_init_pt(
1632 map->pmap,
1633 useStart,
1634 current->protection,
1635 current->object.vm_object,
1636 pindex,
1637 (count << PAGE_SHIFT),
1638 MAP_PREFAULT_MADVISE
1639 );
1640 }
1641 }
1642 vm_map_unlock_read(map);
1643 }
1644 vm_map_entry_release(count);
1645 return(0);
1646}
1647
1648
1649/*
1650 * vm_map_inherit:
1651 *
1652 * Sets the inheritance of the specified address
1653 * range in the target map. Inheritance
1654 * affects how the map will be shared with
1655 * child maps at the time of vm_map_fork.
1656 */
1657int
1658vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
1659 vm_inherit_t new_inheritance)
1660{
1661 vm_map_entry_t entry;
1662 vm_map_entry_t temp_entry;
1663 int count;
1664
1665 switch (new_inheritance) {
1666 case VM_INHERIT_NONE:
1667 case VM_INHERIT_COPY:
1668 case VM_INHERIT_SHARE:
1669 break;
1670 default:
1671 return (KERN_INVALID_ARGUMENT);
1672 }
1673
1674 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1675 vm_map_lock(map);
1676
1677 VM_MAP_RANGE_CHECK(map, start, end);
1678
1679 if (vm_map_lookup_entry(map, start, &temp_entry)) {
1680 entry = temp_entry;
1681 vm_map_clip_start(map, entry, start, &count);
1682 } else
1683 entry = temp_entry->next;
1684
1685 while ((entry != &map->header) && (entry->start < end)) {
1686 vm_map_clip_end(map, entry, end, &count);
1687
1688 entry->inheritance = new_inheritance;
1689
1690 vm_map_simplify_entry(map, entry, &count);
1691
1692 entry = entry->next;
1693 }
1694 vm_map_unlock(map);
1695 vm_map_entry_release(count);
1696 return (KERN_SUCCESS);
1697}
1698
1699/*
1700 * Implement the semantics of mlock
1701 */
1702int
1703vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
1704 boolean_t new_pageable)
1705{
1706 vm_map_entry_t entry;
1707 vm_map_entry_t start_entry;
1708 vm_offset_t end;
1709 int rv = KERN_SUCCESS;
1710 int count;
1711
1712 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1713 vm_map_lock(map);
1714 VM_MAP_RANGE_CHECK(map, start, real_end);
1715 end = real_end;
1716
1717 start_entry = vm_map_clip_range(map, start, end, &count, MAP_CLIP_NO_HOLES);
1718 if (start_entry == NULL) {
1719 vm_map_unlock(map);
1720 vm_map_entry_release(count);
1721 return (KERN_INVALID_ADDRESS);
1722 }
1723
1724 if (new_pageable == 0) {
1725 entry = start_entry;
1726 while ((entry != &map->header) && (entry->start < end)) {
1727 vm_offset_t save_start;
1728 vm_offset_t save_end;
1729
1730 /*
1731 * Already user wired or hard wired (trivial cases)
1732 */
1733 if (entry->eflags & MAP_ENTRY_USER_WIRED) {
1734 entry = entry->next;
1735 continue;
1736 }
1737 if (entry->wired_count != 0) {
1738 entry->wired_count++;
1739 entry->eflags |= MAP_ENTRY_USER_WIRED;
1740 entry = entry->next;
1741 continue;
1742 }
1743
1744 /*
1745 * A new wiring requires instantiation of appropriate
1746 * management structures and the faulting in of the
1747 * page.
1748 */
1749 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1750 int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
1751 if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) {
1752
1753 vm_object_shadow(&entry->object.vm_object,
1754 &entry->offset,
1755 atop(entry->end - entry->start));
1756 entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
1757
1758 } else if (entry->object.vm_object == NULL &&
1759 !map->system_map) {
1760
1761 entry->object.vm_object =
1762 vm_object_allocate(OBJT_DEFAULT,
1763 atop(entry->end - entry->start));
1764 entry->offset = (vm_offset_t) 0;
1765
1766 }
1767 }
1768 entry->wired_count++;
1769 entry->eflags |= MAP_ENTRY_USER_WIRED;
1770
1771 /*
1772 * Now fault in the area. Note that vm_fault_wire()
1773 * may release the map lock temporarily, it will be
1774 * relocked on return. The in-transition
1775 * flag protects the entries.
1776 */
1777 save_start = entry->start;
1778 save_end = entry->end;
1779 rv = vm_fault_wire(map, entry, TRUE);
1780 if (rv) {
1781 CLIP_CHECK_BACK(entry, save_start);
1782 for (;;) {
1783 KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
1784 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
1785 entry->wired_count = 0;
1786 if (entry->end == save_end)
1787 break;
1788 entry = entry->next;
1789 KASSERT(entry != &map->header, ("bad entry clip during backout"));
1790 }
1791 end = save_start; /* unwire the rest */
1792 break;
1793 }
1794 /*
1795 * note that even though the entry might have been
1796 * clipped, the USER_WIRED flag we set prevents
1797 * duplication so we do not have to do a
1798 * clip check.
1799 */
1800 entry = entry->next;
1801 }
1802
1803 /*
1804 * If we failed fall through to the unwiring section to
1805 * unwire what we had wired so far. 'end' has already
1806 * been adjusted.
1807 */
1808 if (rv)
1809 new_pageable = 1;
1810
1811 /*
1812 * start_entry might have been clipped if we unlocked the
1813 * map and blocked. No matter how clipped it has gotten
1814 * there should be a fragment that is on our start boundary.
1815 */
1816 CLIP_CHECK_BACK(start_entry, start);
1817 }
1818
1819 /*
1820 * Deal with the unwiring case.
1821 */
1822 if (new_pageable) {
1823 /*
1824 * This is the unwiring case. We must first ensure that the
1825 * range to be unwired is really wired down. We know there
1826 * are no holes.
1827 */
1828 entry = start_entry;
1829 while ((entry != &map->header) && (entry->start < end)) {
1830 if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
1831 rv = KERN_INVALID_ARGUMENT;
1832 goto done;
1833 }
1834 KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
1835 entry = entry->next;
1836 }
1837
1838 /*
1839 * Now decrement the wiring count for each region. If a region
1840 * becomes completely unwired, unwire its physical pages and
1841 * mappings.
1842 */
1843 /*
1844 * Note that 'entry' must be reset to start_entry before this
1845 * second loop. If the loop variable were simply carried over
1846 * from the validation loop above it would already point past
1847 * the range, this loop would never execute, and the pages
1848 * backing the entries would never be unwired, leaking wired
1849 * pages.
1850 */
1853 entry = start_entry;
1854 while ((entry != &map->header) && (entry->start < end)) {
1855 KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
1856 ("expected USER_WIRED on entry %p", entry));
1857 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
1858 entry->wired_count--;
1859 if (entry->wired_count == 0)
1860 vm_fault_unwire(map, entry);
1861 entry = entry->next;
1862 }
1863 }
1864done:
1865 vm_map_unclip_range(map, start_entry, start, real_end, &count,
1866 MAP_CLIP_NO_HOLES);
1867 map->timestamp++;
1868 vm_map_unlock(map);
1869 vm_map_entry_release(count);
1870 return (rv);
1871}
1872
1873/*
1874 * vm_map_wire:
1875 *
1876 * Sets the pageability of the specified address
1877 * range in the target map. Regions specified
1878 * as not pageable require locked-down physical
1879 * memory and physical page maps.
1880 *
1881 * The map must not be locked, but a reference
1882 * must remain to the map throughout the call.
1883 *
1884 * This function may be called via the zalloc path and must properly
1885 * reserve map entries for kernel_map.
1886 */
1887int
1888vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
1889{
1890 vm_map_entry_t entry;
1891 vm_map_entry_t start_entry;
1892 vm_offset_t end;
1893 int rv = KERN_SUCCESS;
1894 int count;
1895
1896 if (kmflags & KM_KRESERVE)
1897 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
1898 else
1899 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1900 vm_map_lock(map);
1901 VM_MAP_RANGE_CHECK(map, start, real_end);
1902 end = real_end;
1903
1904 start_entry = vm_map_clip_range(map, start, end, &count, MAP_CLIP_NO_HOLES);
1905 if (start_entry == NULL) {
1906 vm_map_unlock(map);
1907 rv = KERN_INVALID_ADDRESS;
1908 goto failure;
1909 }
1910 if ((kmflags & KM_PAGEABLE) == 0) {
1911 /*
1912 * Wiring.
1913 *
1914 * 1. Holding the write lock, we create any shadow or zero-fill
1915 * objects that need to be created. Then we clip each map
1916 * entry to the region to be wired and increment its wiring
1917 * count. We create objects before clipping the map entries
1918 * to avoid object proliferation.
1919 *
1920 * 2. We downgrade to a read lock, and call vm_fault_wire to
1921 * fault in the pages for any newly wired area (wired_count is
1922 * 1).
1923 *
1924 * Downgrading to a read lock for vm_fault_wire avoids a
1925 * possible deadlock with another process that may have faulted
1926 * on one of the pages to be wired (it would mark the page busy,
1927 * blocking us, then in turn block on the map lock that we
1928 * hold). Because of problems in the recursive lock package,
1929 * we cannot upgrade to a write lock in vm_map_lookup. Thus,
1930 * any actions that require the write lock must be done
1931 * beforehand. Because we keep the read lock on the map, the
1932 * copy-on-write status of the entries we modify here cannot
1933 * change.
1934 */
1935
1936 entry = start_entry;
1937 while ((entry != &map->header) && (entry->start < end)) {
1938 /*
1939 * Trivial case if the entry is already wired
1940 */
1941 if (entry->wired_count) {
1942 entry->wired_count++;
1943 entry = entry->next;
1944 continue;
1945 }
1946
1947 /*
1948 * The entry is being newly wired, we have to setup
1949 * appropriate management structures. A shadow
1950 * object is required for a copy-on-write region,
1951 * or a normal object for a zero-fill region. We
1952 * do not have to do this for entries that point to sub
1953 * maps because we won't hold the lock on the sub map.
1954 */
1955 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1956 int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
1957 if (copyflag &&
1958 ((entry->protection & VM_PROT_WRITE) != 0)) {
1959
1960 vm_object_shadow(&entry->object.vm_object,
1961 &entry->offset,
1962 atop(entry->end - entry->start));
1963 entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
1964 } else if (entry->object.vm_object == NULL &&
1965 !map->system_map) {
1966 entry->object.vm_object =
1967 vm_object_allocate(OBJT_DEFAULT,
1968 atop(entry->end - entry->start));
1969 entry->offset = (vm_offset_t) 0;
1970 }
1971 }
1972
1973 entry->wired_count++;
1974 entry = entry->next;
1975 }
1976
1977 /*
1978 * Pass 2.
1979 */
1980
1981 /*
1982 * HACK HACK HACK HACK
1983 *
1984 * Unlock the map to avoid deadlocks. The in-transit flag
1985 * protects us from most changes but note that
1986 * clipping may still occur. To prevent clipping from
1987	 * occurring after the unlock, except for when we are
1988 * blocking in vm_fault_wire, we must run in a critical
1989 * section, otherwise our accesses to entry->start and
1990 * entry->end could be corrupted. We have to enter the
1991 * critical section prior to unlocking so start_entry does
1992 * not change out from under us at the very beginning of the
1993 * loop.
1994 *
1995 * HACK HACK HACK HACK
1996 */
1997
1998 crit_enter();
1999
2000 entry = start_entry;
2001 while (entry != &map->header && entry->start < end) {
2002 /*
2003 * If vm_fault_wire fails for any page we need to undo
2004 * what has been done. We decrement the wiring count
2005 * for those pages which have not yet been wired (now)
2006 * and unwire those that have (later).
2007 */
2008 vm_offset_t save_start = entry->start;
2009 vm_offset_t save_end = entry->end;
2010
2011 if (entry->wired_count == 1)
2012 rv = vm_fault_wire(map, entry, FALSE);
2013 if (rv) {
2014 CLIP_CHECK_BACK(entry, save_start);
2015 for (;;) {
2016 KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
2017 entry->wired_count = 0;
2018 if (entry->end == save_end)
2019 break;
2020 entry = entry->next;
2021 KASSERT(entry != &map->header, ("bad entry clip during backout"));
2022 }
2023 end = save_start;
2024 break;
2025 }
2026 CLIP_CHECK_FWD(entry, save_end);
2027 entry = entry->next;
2028 }
2029 crit_exit();
2030
2031 /*
2032	 * If a failure occurred, undo everything by falling through
2033 * to the unwiring code. 'end' has already been adjusted
2034 * appropriately.
2035 */
2036 if (rv)
2037 kmflags |= KM_PAGEABLE;
2038
2039 /*
2040 * start_entry is still IN_TRANSITION but may have been
2041 * clipped since vm_fault_wire() unlocks and relocks the
2042 * map. No matter how clipped it has gotten there should
2043 * be a fragment that is on our start boundary.
2044 */
2045 CLIP_CHECK_BACK(start_entry, start);
2046 }
2047
2048 if (kmflags & KM_PAGEABLE) {
2049 /*
2050 * This is the unwiring case. We must first ensure that the
2051 * range to be unwired is really wired down. We know there
2052 * are no holes.
2053 */
2054 entry = start_entry;
2055 while ((entry != &map->header) && (entry->start < end)) {
2056 if (entry->wired_count == 0) {
2057 rv = KERN_INVALID_ARGUMENT;
2058 goto done;
2059 }
2060 entry = entry->next;
2061 }
2062
2063 /*
2064 * Now decrement the wiring count for each region. If a region
2065 * becomes completely unwired, unwire its physical pages and
2066 * mappings.
2067 */
2068 entry = start_entry;
2069 while ((entry != &map->header) && (entry->start < end)) {
2070 entry->wired_count--;
2071 if (entry->wired_count == 0)
2072 vm_fault_unwire(map, entry);
2073 entry = entry->next;
2074 }
2075 }
2076done:
2077 vm_map_unclip_range(map, start_entry, start, real_end, &count,
2078 MAP_CLIP_NO_HOLES);
2079 map->timestamp++;
2080 vm_map_unlock(map);
2081failure:
2082 if (kmflags & KM_KRESERVE)
2083 vm_map_entry_krelease(count);
2084 else
2085 vm_map_entry_release(count);
2086 return (rv);
2087}
2088
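/*
 * Example (sketch): wiring an address range for temporary use and then
 * unwiring it. The map and range are placeholders; a real caller must
 * keep a reference on the map for the duration of the call and pass a
 * fully mapped, page-aligned range. Passing 0 for kmflags selects the
 * wiring path; KM_PAGEABLE selects the unwiring path, and callers on
 * the zalloc path would add KM_KRESERVE.
 */
static __inline int
example_wire_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int rv;

	/* Fault in and wire the pages backing [start, end). */
	rv = vm_map_wire(map, start, end, 0);
	if (rv != KERN_SUCCESS)
		return (rv);

	/* ... operate on the wired memory ... */

	/* Drop the wiring again. */
	return (vm_map_wire(map, start, end, KM_PAGEABLE));
}
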
2089/*
2090 * vm_map_set_wired_quick()
2091 *
2092 * Mark a newly allocated address range as wired but do not fault in
2093 * the pages. The caller is expected to load the pages into the object.
2094 *
2095 * The map must be locked on entry and will remain locked on return.
2096 */
2097void
2098vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size, int *countp)
2099{
2100 vm_map_entry_t scan;
2101 vm_map_entry_t entry;
2102
2103 entry = vm_map_clip_range(map, addr, addr + size, countp, MAP_CLIP_NO_HOLES);
2104 for (scan = entry; scan != &map->header && scan->start < addr + size; scan = scan->next) {
2105	    KKASSERT(scan->wired_count == 0);
2106	    scan->wired_count = 1;
2107 }
2108 vm_map_unclip_range(map, entry, addr, addr + size, countp, MAP_CLIP_NO_HOLES);
2109}
2110
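/*
 * Example (sketch): marking a freshly inserted range as wired without
 * faulting, following the reserve/lock/release pattern used elsewhere
 * in this file. The caller is responsible for populating the backing
 * object afterwards; addr and size are placeholders.
 */
static __inline void
example_set_wired(vm_map_t map, vm_offset_t addr, vm_size_t size)
{
	int count;

	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	vm_map_set_wired_quick(map, addr, size, &count);
	vm_map_unlock(map);
	vm_map_entry_release(count);
}
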
2111/*
2112 * vm_map_clean
2113 *
2114 * Push any dirty cached pages in the address range to their pager.
2115 * If syncio is TRUE, dirty pages are written synchronously.
2116 * If invalidate is TRUE, any cached pages are freed as well.
2117 *
2118 * Returns an error if any part of the specified range is not mapped.
2119 */
2120int
2121vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio,
2122 boolean_t invalidate)
2123{
2124 vm_map_entry_t current;
2125 vm_map_entry_t entry;
2126 vm_size_t size;
2127 vm_object_t object;
2128 vm_ooffset_t offset;
2129
2130 vm_map_lock_read(map);
2131 VM_MAP_RANGE_CHECK(map, start, end);
2132 if (!vm_map_lookup_entry(map, start, &entry)) {
2133 vm_map_unlock_read(map);
2134 return (KERN_INVALID_ADDRESS);
2135 }
2136 /*
2137 * Make a first pass to check for holes.
2138 */
2139 for (current = entry; current->start < end; current = current->next) {
2140 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2141 vm_map_unlock_read(map);
2142 return (KERN_INVALID_ARGUMENT);
2143 }
2144 if (end > current->end &&
2145 (current->next == &map->header ||
2146 current->end != current->next->start)) {
2147 vm_map_unlock_read(map);
2148 return (KERN_INVALID_ADDRESS);
2149 }
2150 }
2151
2152 if (invalidate)
2153 pmap_remove(vm_map_pmap(map), start, end);
2154 /*
2155 * Make a second pass, cleaning/uncaching pages from the indicated
2156 * objects as we go.
2157 */
2158 for (current = entry; current->start < end; current = current->next) {
2159 offset = current->offset + (start - current->start);
2160 size = (end <= current->end ? end : current->end) - start;
2161 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2162 vm_map_t smap;
2163 vm_map_entry_t tentry;
2164 vm_size_t tsize;
2165
2166 smap = current->object.sub_map;
2167 vm_map_lock_read(smap);
2168 (void) vm_map_lookup_entry(smap, offset, &tentry);
2169 tsize = tentry->end - offset;
2170 if (tsize < size)
2171 size = tsize;
2172 object = tentry->object.vm_object;
2173 offset = tentry->offset + (offset - tentry->start);
2174 vm_map_unlock_read(smap);
2175 } else {
2176 object = current->object.vm_object;
2177 }
2178 /*
2179 * Note that there is absolutely no sense in writing out
2180 * anonymous objects, so we track down the vnode object
2181 * to write out.
2182 * We invalidate (remove) all pages from the address space
2183 * anyway, for semantic correctness.
2184 *
2185 * note: certain anonymous maps, such as MAP_NOSYNC maps,
2186 * may start out with a NULL object.
2187 */
2188 while (object && object->backing_object) {
2189 offset += object->backing_object_offset;
2190 object = object->backing_object;
2191 if (object->size < OFF_TO_IDX( offset + size))
2192 size = IDX_TO_OFF(object->size) - offset;
2193 }
2194 if (object && (object->type == OBJT_VNODE) &&
2195 (current->protection & VM_PROT_WRITE)) {
2196 /*
2197 * Flush pages if writing is allowed, invalidate them
2198 * if invalidation requested. Pages undergoing I/O
2199 * will be ignored by vm_object_page_remove().
2200 *
2201 * We cannot lock the vnode and then wait for paging
2202 * to complete without deadlocking against vm_fault.
2203 * Instead we simply call vm_object_page_remove() and
2204 * allow it to block internally on a page-by-page
2205 * basis when it encounters pages undergoing async
2206 * I/O.
2207 */
2208 int flags;
2209
2210 vm_object_reference(object);
2211 vn_lock(object->handle,
2212 LK_EXCLUSIVE | LK_RETRY, curthread);
2213 flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
2214 flags |= invalidate ? OBJPC_INVAL : 0;
2215 vm_object_page_clean(object,
2216 OFF_TO_IDX(offset),
2217 OFF_TO_IDX(offset + size + PAGE_MASK),
2218 flags);
2219 VOP_UNLOCK(((struct vnode *)object->handle),
2220 0, curthread);
2221 vm_object_deallocate(object);
2222 }
2223 if (object && invalidate &&
2224 ((object->type == OBJT_VNODE) ||
2225 (object->type == OBJT_DEVICE))) {
2226 int clean_only =
2227 (object->type == OBJT_DEVICE) ? FALSE : TRUE;
2228 vm_object_reference(object);
2229 vm_object_page_remove(object,
2230 OFF_TO_IDX(offset),
2231 OFF_TO_IDX(offset + size + PAGE_MASK),
2232 clean_only);
2233 vm_object_deallocate(object);
2234 }
2235 start += size;
2236 }
2237
2238 vm_map_unlock_read(map);
2239 return (KERN_SUCCESS);
2240}
2241
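/*
 * Example (sketch): roughly what a synchronous, invalidating msync()
 * style request reduces to. syncio == TRUE writes dirty pages
 * synchronously and invalidate == TRUE frees the cached pages as well.
 * start and end must be page aligned and the range fully mapped, or
 * an error is returned.
 */
static __inline int
example_sync_and_invalidate(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	return (vm_map_clean(map, start, end, TRUE, TRUE));
}
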
2242/*
2243 * vm_map_entry_unwire: [ internal use only ]
2244 *
2245 * Make the region specified by this entry pageable.
2246 *
2247 * The map in question should be locked.
2248 * [This is the reason for this routine's existence.]
2249 */
2250static void
2251vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2252{
2253 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2254 entry->wired_count = 0;
2255 vm_fault_unwire(map, entry);
2256}
2257
2258/*
2259 * vm_map_entry_delete: [ internal use only ]
2260 *
2261 * Deallocate the given entry from the target map.
2262 */
2263static void
2264vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
2265{
2266 vm_map_entry_unlink(map, entry);
2267 map->size -= entry->end - entry->start;
2268
2269 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2270 vm_object_deallocate(entry->object.vm_object);
2271 }
2272
2273 vm_map_entry_dispose(map, entry, countp);
2274}
2275
2276/*
2277 * vm_map_delete: [ internal use only ]
2278 *
2279 * Deallocates the given address range from the target
2280 * map.
2281 */
2282int
2283vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
2284{
2285 vm_object_t object;
2286 vm_map_entry_t entry;
2287 vm_map_entry_t first_entry;
2288
2289again:
2290 /*
2291 * Find the start of the region, and clip it. Set entry to point
2292 * at the first record containing the requested address or, if no
2293 * such record exists, the next record with a greater address. The
2294 * loop will run from this point until a record beyond the termination
2295 * address is encountered.
2296 *
2297 * map->hint must be adjusted to not point to anything we delete,
2298 * so set it to the entry prior to the one being deleted.
2299 *
2300 * GGG see other GGG comment.
2301 */
2302 if (vm_map_lookup_entry(map, start, &first_entry)) {
2303 entry = first_entry;
2304 vm_map_clip_start(map, entry, start, countp);
2305 map->hint = entry->prev; /* possible problem XXX */
2306 } else {
2307 map->hint = first_entry; /* possible problem XXX */
2308 entry = first_entry->next;
2309 }
2310
2311 /*
2312 * If a hole opens up prior to the current first_free then
2313 * adjust first_free. As with map->hint, map->first_free
2314 * cannot be left set to anything we might delete.
2315 */
2316 if (entry == &map->header) {
2317 map->first_free = &map->header;
2318 } else if (map->first_free->start >= start) {
2319 map->first_free = entry->prev;
2320 }
2321
2322 /*
2323 * Step through all entries in this region
2324 */
2325
2326 while ((entry != &map->header) && (entry->start < end)) {
2327 vm_map_entry_t next;
2328 vm_offset_t s, e;
2329 vm_pindex_t offidxstart, offidxend, count;
2330
2331 /*
2332 * If we hit an in-transition entry we have to sleep and
2333 * retry. It's easier (and not really slower) to just retry
2334 * since this case occurs so rarely and the hint is already
2335 * pointing at the right place. We have to reset the
2336	 * start offset so as not to accidentally delete an entry
2337 * another process just created in vacated space.
2338 */
2339 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2340 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2341 start = entry->start;
2342 ++mycpu->gd_cnt.v_intrans_coll;
2343 ++mycpu->gd_cnt.v_intrans_wait;
2344 vm_map_transition_wait(map);
2345 goto again;
2346 }
2347 vm_map_clip_end(map, entry, end, countp);
2348
2349 s = entry->start;
2350 e = entry->end;
2351 next = entry->next;
2352
2353 offidxstart = OFF_TO_IDX(entry->offset);
2354 count = OFF_TO_IDX(e - s);
2355 object = entry->object.vm_object;
2356
2357 /*
2358 * Unwire before removing addresses from the pmap; otherwise,
2359 * unwiring will put the entries back in the pmap.
2360 */
2361 if (entry->wired_count != 0)
2362 vm_map_entry_unwire(map, entry);
2363
2364 offidxend = offidxstart + count;
2365
2366 if ((object == kernel_object) || (object == kmem_object)) {
2367 vm_object_page_remove(object, offidxstart, offidxend, FALSE);
2368 } else {
2369 pmap_remove(map->pmap, s, e);
2370 if (object != NULL &&
2371 object->ref_count != 1 &&
2372 (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING &&
2373 (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
2374 vm_object_collapse(object);
2375 vm_object_page_remove(object, offidxstart, offidxend, FALSE);
2376 if (object->type == OBJT_SWAP) {
2377 swap_pager_freespace(object, offidxstart, count);
2378 }
2379 if (offidxend >= object->size &&
2380 offidxstart < object->size) {
2381 object->size = offidxstart;
2382 }
2383 }
2384 }
2385
2386 /*
2387 * Delete the entry (which may delete the object) only after
2388 * removing all pmap entries pointing to its pages.
2389 * (Otherwise, its page frames may be reallocated, and any
2390 * modify bits will be set in the wrong object!)
2391 */
2392 vm_map_entry_delete(map, entry, countp);
2393 entry = next;
2394 }
2395 return (KERN_SUCCESS);
2396}
2397
2398/*
2399 * vm_map_remove:
2400 *
2401 * Remove the given address range from the target map.
2402 * This is the exported form of vm_map_delete.
2403 */
2404int
2405vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
2406{
2407 int result;
2408 int count;
2409
2410 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2411 vm_map_lock(map);
2412 VM_MAP_RANGE_CHECK(map, start, end);
2413 result = vm_map_delete(map, start, end, &count);
2414 vm_map_unlock(map);
2415 vm_map_entry_release(count);
2416
2417 return (result);
2418}
2419
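/*
 * Example (sketch): tearing down a mapping. vm_map_remove() performs
 * its own locking and map-entry reservation, so a caller only has to
 * supply page-aligned bounds inside the map's range; addr and size
 * are placeholders.
 */
static __inline int
example_unmap_range(vm_map_t map, vm_offset_t addr, vm_size_t size)
{
	vm_offset_t start = trunc_page(addr);
	vm_offset_t end = round_page(addr + size);

	return (vm_map_remove(map, start, end));
}
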
2420/*
2421 * vm_map_check_protection:
2422 *
2423 * Assert that the target map allows the specified
2424 * privilege on the entire address region given.
2425 * The entire region must be allocated.
2426 */
2427boolean_t
2428vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
2429 vm_prot_t protection)
2430{
2431 vm_map_entry_t entry;
2432 vm_map_entry_t tmp_entry;
2433
2434 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
2435 return (FALSE);
2436 }
2437 entry = tmp_entry;
2438
2439 while (start < end) {
2440 if (entry == &map->header) {
2441 return (FALSE);
2442 }
2443 /*
2444 * No holes allowed!
2445 */
2446
2447 if (start < entry->start) {
2448 return (FALSE);
2449 }
2450 /*
2451 * Check protection associated with entry.
2452 */
2453
2454 if ((entry->protection & protection) != protection) {
2455 return (FALSE);
2456 }
2457 /* go to next entry */
2458
2459 start = entry->end;
2460 entry = entry->next;
2461 }
2462 return (TRUE);
2463}
2464
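/*
 * Example (sketch): verifying that an entire range is mapped with read
 * permission before operating on it. This is a point-in-time check
 * only; the caller must ensure the map cannot change underneath it for
 * the result to remain meaningful.
 */
static __inline boolean_t
example_range_is_readable(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	return (vm_map_check_protection(map, start, end, VM_PROT_READ));
}
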
2465/*
2466 * Split the pages in a map entry into a new object. This affords
2467 * easier removal of unused pages, and keeps object inheritance from
2468 * being a negative impact on memory usage.
2469 */
2470static void
2471vm_map_split(vm_map_entry_t entry)
2472{
2473 vm_page_t m;
2474 vm_object_t orig_object, new_object, source;
2475 vm_offset_t s, e;
2476 vm_pindex_t offidxstart, offidxend, idx;
2477 vm_size_t size;
2478 vm_ooffset_t offset;
2479
2480 orig_object = entry->object.vm_object;
2481 if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
2482 return;
2483 if (orig_object->ref_count <= 1)
2484 return;
2485
2486 offset = entry->offset;
2487 s = entry->start;
2488 e = entry->end;
2489
2490 offidxstart = OFF_TO_IDX(offset);
2491 offidxend = offidxstart + OFF_TO_IDX(e - s);
2492 size = offidxend - offidxstart;
2493
2494 new_object = vm_pager_allocate(orig_object->type,
2495 NULL, IDX_TO_OFF(size), VM_PROT_ALL, 0LL);
2496 if (new_object == NULL)
2497 return;
2498
2499 source = orig_object->backing_object;
2500 if (source != NULL) {
2501 vm_object_reference(source); /* Referenced by new_object */
2502 LIST_INSERT_HEAD(&source->shadow_head,
2503 new_object, shadow_list);
2504 vm_object_clear_flag(source, OBJ_ONEMAPPING);
2505 new_object->backing_object_offset =
2506 orig_object->backing_object_offset + IDX_TO_OFF(offidxstart);
2507 new_object->backing_object = source;
2508 source->shadow_count++;
2509 source->generation++;
2510 }
2511
2512 for (idx = 0; idx < size; idx++) {
2513 vm_page_t m;
2514
2515 /*
2516 * A critical section is required to avoid a race between
2517 * the lookup and an interrupt/unbusy/free and our busy
2518 * check.
2519 */
2520 crit_enter();
2521 retry:
2522 m = vm_page_lookup(orig_object, offidxstart + idx);
2523 if (m == NULL) {
2524 crit_exit();
2525 continue;
2526 }
2527
2528 /*
2529 * We must wait for pending I/O to complete before we can
2530 * rename the page.
2531 *
2532 * We do not have to VM_PROT_NONE the page as mappings should
2533 * not be changed by this operation.
2534 */
2535 if (vm_page_sleep_busy(m, TRUE, "spltwt"))
2536 goto retry;
2537 vm_page_busy(m);
2538 vm_page_rename(m, new_object, idx);
2539 /* page automatically made dirty by rename and cache handled */
2540 vm_page_busy(m);
2541 crit_exit();
2542 }
2543
2544 if (orig_object->type == OBJT_SWAP) {
2545 vm_object_pip_add(orig_object, 1);
2546 /*
2547 * copy orig_object pages into new_object
2548 * and destroy unneeded pages in
2549 * shadow object.
2550 */
2551 swap_pager_copy(orig_object, new_object, offidxstart, 0);
2552 vm_object_pip_wakeup(orig_object);
2553 }
2554
2555 /*
2556 * Wakeup the pages we played with. No spl protection is needed
2557 * for a simple wakeup.
2558 */
2559 for (idx = 0; idx < size; idx++) {
2560 m = vm_page_lookup(new_object, idx);
2561 if (m)
2562 vm_page_wakeup(m);
2563 }
2564
2565 entry->object.vm_object = new_object;
2566 entry->offset = 0LL;
2567 vm_object_deallocate(orig_object);
2568}
2569
2570/*
2571 * vm_map_copy_entry:
2572 *
2573 * Copies the contents of the source entry to the destination
2574 * entry. The entries *must* be aligned properly.
2575 */
2576static void
2577vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
2578 vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
2579{
2580 vm_object_t src_object;
2581
2582 if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
2583 return;
2584
2585 if (src_entry->wired_count == 0) {
2586
2587 /*
2588 * If the source entry is marked needs_copy, it is already
2589 * write-protected.
2590 */
2591 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
2592 pmap_protect(src_map->pmap,
2593 src_entry->start,
2594 src_entry->end,
2595 src_entry->protection & ~VM_PROT_WRITE);
2596 }
2597
2598 /*
2599 * Make a copy of the object.
2600 */
2601 if ((src_object = src_entry->object.vm_object) != NULL) {
2602
2603 if ((src_object->handle == NULL) &&
2604 (src_object->type == OBJT_DEFAULT ||
2605 src_object->type == OBJT_SWAP)) {
2606 vm_object_collapse(src_object);
2607 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
2608 vm_map_split(src_entry);
2609 src_object = src_entry->object.vm_object;
2610 }
2611 }
2612
2613 vm_object_reference(src_object);
2614 vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
2615 dst_entry->object.vm_object = src_object;
2616 src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2617 dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2618 dst_entry->offset = src_entry->offset;
2619 } else {
2620 dst_entry->object.vm_object = NULL;
2621 dst_entry->offset = 0;
2622 }
2623
2624 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
2625 dst_entry->end - dst_entry->start, src_entry->start);
2626 } else {
2627 /*
2628 * Of course, wired down pages can't be set copy-on-write.
2629 * Cause wired pages to be copied into the new map by
2630 * simulating faults (the new pages are pageable)
2631 */
2632 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
2633 }
2634}
2635
2636/*
2637 * vmspace_fork:
2638 * Create a new process vmspace structure and vm_map
2639 * based on those of an existing process. The new map
2640 * is based on the old map, according to the inheritance
2641 * values on the regions in that map.
2642 *
2643 * The source map must not be locked.
2644 */
2645struct vmspace *
2646vmspace_fork(struct vmspace *vm1)
2647{
2648 struct vmspace *vm2;
2649 vm_map_t old_map = &vm1->vm_map;
2650 vm_map_t new_map;
2651 vm_map_entry_t old_entry;
2652 vm_map_entry_t new_entry;
2653 vm_object_t object;
2654 int count;
2655
2656 vm_map_lock(old_map);
2657 old_map->infork = 1;
2658
2659 /*
2660 * XXX Note: upcalls are not copied.
2661 */
2662 vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
2663 bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
2664 (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
2665 new_map = &vm2->vm_map; /* XXX */
2666 new_map->timestamp = 1;
2667
2668 count = 0;
2669 old_entry = old_map->header.next;
2670 while (old_entry != &old_map->header) {
2671 ++count;
2672 old_entry = old_entry->next;
2673 }
2674
2675 count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
2676
2677 old_entry = old_map->header.next;
2678 while (old_entry != &old_map->header) {
2679 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
2680 panic("vm_map_fork: encountered a submap");
2681
2682 switch (old_entry->inheritance) {
2683 case VM_INHERIT_NONE:
2684 break;
2685
2686 case VM_INHERIT_SHARE:
2687 /*
2688 * Clone the entry, creating the shared object if necessary.
2689 */
2690 object = old_entry->object.vm_object;
2691 if (object == NULL) {
2692 object = vm_object_allocate(OBJT_DEFAULT,
2693 atop(old_entry->end - old_entry->start));
2694 old_entry->object.vm_object = object;
2695 old_entry->offset = (vm_offset_t) 0;
2696 }
2697
2698 /*
2699 * Add the reference before calling vm_object_shadow
2700	 * to ensure that a shadow object is created.
2701 */
2702 vm_object_reference(object);
2703 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
2704 vm_object_shadow(&old_entry->object.vm_object,
2705 &old_entry->offset,
2706 atop(old_entry->end - old_entry->start));
2707 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
2708 /* Transfer the second reference too. */
2709 vm_object_reference(
2710 old_entry->object.vm_object);
2711 vm_object_deallocate(object);
2712 object = old_entry->object.vm_object;
2713 }
2714 vm_object_clear_flag(object, OBJ_ONEMAPPING);
2715
2716 /*
2717 * Clone the entry, referencing the shared object.
2718 */
2719 new_entry = vm_map_entry_create(new_map, &count);
2720 *new_entry = *old_entry;
2721 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2722 new_entry->wired_count = 0;
2723
2724 /*
2725 * Insert the entry into the new map -- we know we're
2726 * inserting at the end of the new map.
2727 */
2728
2729 vm_map_entry_link(new_map, new_map->header.prev,
2730 new_entry);
2731
2732 /*
2733 * Update the physical map
2734 */
2735
2736 pmap_copy(new_map->pmap, old_map->pmap,
2737 new_entry->start,
2738 (old_entry->end - old_entry->start),
2739 old_entry->start);
2740 break;
2741
2742 case VM_INHERIT_COPY:
2743 /*
2744 * Clone the entry and link into the map.
2745 */
2746 new_entry = vm_map_entry_create(new_map, &count);
2747 *new_entry = *old_entry;
2748 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2749 new_entry->wired_count = 0;
2750 new_entry->object.vm_object = NULL;
2751 vm_map_entry_link(new_map, new_map->header.prev,
2752 new_entry);
2753 vm_map_copy_entry(old_map, new_map, old_entry,
2754 new_entry);
2755 break;
2756 }
2757 old_entry = old_entry->next;
2758 }
2759
2760 new_map->size = old_map->size;
2761 old_map->infork = 0;
2762 vm_map_unlock(old_map);
2763 vm_map_entry_release(count);
2764
2765 return (vm2);
2766}
2767
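/*
 * Example (sketch): duplicating an address space for a child process.
 * This is a simplified placeholder for what the fork path does, not
 * the actual vm_fork() code; COPY regions in the child are marked
 * copy-on-write by vm_map_copy_entry() above.
 */
static __inline void
example_fork_vmspace(struct proc *p1, struct proc *p2)
{
	p2->p_vmspace = vmspace_fork(p1->p_vmspace);
	pmap_pinit2(vmspace_pmap(p2->p_vmspace));
}
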
2768int
2769vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
2770 vm_prot_t prot, vm_prot_t max, int cow)
2771{
2772 vm_map_entry_t prev_entry;
2773 vm_map_entry_t new_stack_entry;
2774 vm_size_t init_ssize;
2775 int rv;
2776 int count;
2777
2778 if (VM_MIN_ADDRESS > 0 && addrbos < VM_MIN_ADDRESS)
2779 return (KERN_NO_SPACE);
2780
2781 if (max_ssize < sgrowsiz)
2782 init_ssize = max_ssize;
2783 else
2784 init_ssize = sgrowsiz;
2785
2786 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2787 vm_map_lock(map);
2788
2789 /* If addr is already mapped, no go */
2790 if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
2791 vm_map_unlock(map);
2792 vm_map_entry_release(count);
2793 return (KERN_NO_SPACE);
2794 }
2795
2796 /* If we would blow our VMEM resource limit, no go */
2797 if (map->size + init_ssize >
2798 curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
2799 vm_map_unlock(map);
2800 vm_map_entry_release(count);
2801 return (KERN_NO_SPACE);
2802 }
2803
2804	/* If we can't accommodate max_ssize in the current mapping,
2805 * no go. However, we need to be aware that subsequent user
2806 * mappings might map into the space we have reserved for
2807 * stack, and currently this space is not protected.
2808 *
2809 * Hopefully we will at least detect this condition
2810 * when we try to grow the stack.
2811 */
2812 if ((prev_entry->next != &map->header) &&
2813 (prev_entry->next->start < addrbos + max_ssize)) {
2814 vm_map_unlock(map);
2815 vm_map_entry_release(count);
2816 return (KERN_NO_SPACE);
2817 }
2818
2819 /* We initially map a stack of only init_ssize. We will
2820 * grow as needed later. Since this is to be a grow
2821 * down stack, we map at the top of the range.
2822 *
2823 * Note: we would normally expect prot and max to be
2824 * VM_PROT_ALL, and cow to be 0. Possibly we should
2825 * eliminate these as input parameters, and just
2826 * pass these values here in the insert call.
2827 */
2828 rv = vm_map_insert(map, &count,
2829 NULL, 0, addrbos + max_ssize - init_ssize,
2830 addrbos + max_ssize, prot, max, cow);
2831
2832 /* Now set the avail_ssize amount */
2833 if (rv == KERN_SUCCESS) {
2834 if (prev_entry != &map->header)
2835 vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
2836 new_stack_entry = prev_entry->next;
2837 if (new_stack_entry->end != addrbos + max_ssize ||
2838 new_stack_entry->start != addrbos + max_ssize - init_ssize)
2839 panic ("Bad entry start/end for new stack entry");
2840 else
2841 new_stack_entry->avail_ssize = max_ssize - init_ssize;
2842 }
2843
2844 vm_map_unlock(map);
2845 vm_map_entry_release(count);
2846 return (rv);
2847}
2848
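/*
 * Example (sketch): reserving a grow-down stack region. As noted in
 * vm_map_stack() above, prot and max are normally VM_PROT_ALL and cow
 * is normally 0; stack_bottom and stack_size are placeholders supplied
 * by the exec or thread creation code.
 */
static __inline int
example_create_stack(vm_map_t map, vm_offset_t stack_bottom, vm_size_t stack_size)
{
	return (vm_map_stack(map, stack_bottom, stack_size,
	    VM_PROT_ALL, VM_PROT_ALL, 0));
}
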
2849/* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the
2850 * desired address is already mapped, or if we successfully grow
2851 * the stack. Also returns KERN_SUCCESS if addr is outside the
2852 * stack range (this is strange, but preserves compatibility with
2853 * the grow function in vm_machdep.c).
2854 */
2855int
2856vm_map_growstack (struct proc *p, vm_offset_t addr)
2857{
2858 vm_map_entry_t prev_entry;
2859 vm_map_entry_t stack_entry;
2860 vm_map_entry_t new_stack_entry;
2861 struct vmspace *vm = p->p_vmspace;
2862 vm_map_t map = &vm->vm_map;
2863 vm_offset_t end;
2864 int grow_amount;
2865 int rv = KERN_SUCCESS;
2866 int is_procstack;
2867 int use_read_lock = 1;
2868 int count;
2869
2870 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2871Retry:
2872 if (use_read_lock)
2873 vm_map_lock_read(map);
2874 else
2875 vm_map_lock(map);
2876
2877 /* If addr is already in the entry range, no need to grow.*/
2878 if (vm_map_lookup_entry(map, addr, &prev_entry))
2879 goto done;
2880
2881 if ((stack_entry = prev_entry->next) == &map->header)
2882 goto done;
2883 if (prev_entry == &map->header)
2884 end = stack_entry->start - stack_entry->avail_ssize;
2885 else
2886 end = prev_entry->end;
2887
2888 /* This next test mimics the old grow function in vm_machdep.c.
2889 * It really doesn't quite make sense, but we do it anyway
2890 * for compatibility.
2891 *
2892	 * If the stack is not growable, return success. This signals the
2893	 * caller to proceed as it normally would with normal vm.
2894 */
2895 if (stack_entry->avail_ssize < 1 ||
2896 addr >= stack_entry->start ||
2897 addr < stack_entry->start - stack_entry->avail_ssize) {
2898 goto done;
2899 }
2900
2901 /* Find the minimum grow amount */
2902 grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
2903 if (grow_amount > stack_entry->avail_ssize) {
2904 rv = KERN_NO_SPACE;
2905 goto done;
2906 }
2907
2908	/* If there is no longer enough space between the entries,
2909	 * fail and adjust the available space. Note: this
2910 * should only happen if the user has mapped into the
2911 * stack area after the stack was created, and is
2912 * probably an error.
2913 *
2914 * This also effectively destroys any guard page the user
2915 * might have intended by limiting the stack size.
2916 */
2917 if (grow_amount > stack_entry->start - end) {
2918 if (use_read_lock && vm_map_lock_upgrade(map)) {
2919 use_read_lock = 0;
2920 goto Retry;
2921 }
2922 use_read_lock = 0;
2923 stack_entry->avail_ssize = stack_entry->start - end;
2924 rv = KERN_NO_SPACE;
2925 goto done;
2926 }
2927
2928 is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
2929
2930 /* If this is the main process stack, see if we're over the
2931 * stack limit.
2932 */
2933 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
2934 p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
2935 rv = KERN_NO_SPACE;
2936 goto done;
2937 }
2938
2939 /* Round up the grow amount modulo SGROWSIZ */
2940 grow_amount = roundup (grow_amount, sgrowsiz);
2941 if (grow_amount > stack_entry->avail_ssize) {
2942 grow_amount = stack_entry->avail_ssize;
2943 }
2944 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
2945 p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
2946 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
2947 ctob(vm->vm_ssize);
2948 }
2949
2950 /* If we would blow our VMEM resource limit, no go */
2951 if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
2952 rv = KERN_NO_SPACE;
2953 goto done;
2954 }
2955
2956 if (use_read_lock && vm_map_lock_upgrade(map)) {
2957 use_read_lock = 0;
2958 goto Retry;
2959 }
2960 use_read_lock = 0;
2961
2962 /* Get the preliminary new entry start value */
2963 addr = stack_entry->start - grow_amount;
2964
2965 /* If this puts us into the previous entry, cut back our growth
2966 * to the available space. Also, see the note above.
2967 */
2968 if (addr < end) {
2969 stack_entry->avail_ssize = stack_entry->start - end;
2970 addr = end;
2971 }
2972
2973 rv = vm_map_insert(map, &count,
2974 NULL, 0, addr, stack_entry->start,
2975 VM_PROT_ALL,
2976 VM_PROT_ALL,
2977 0);
2978
2979 /* Adjust the available stack space by the amount we grew. */
2980 if (rv == KERN_SUCCESS) {
2981 if (prev_entry != &map->header)
2982 vm_map_clip_end(map, prev_entry, addr, &count);
2983 new_stack_entry = prev_entry->next;
2984 if (new_stack_entry->end != stack_entry->start ||
2985 new_stack_entry->start != addr)
2986 panic ("Bad stack grow start/end in new stack entry");
2987 else {
2988 new_stack_entry->avail_ssize = stack_entry->avail_ssize -
2989 (new_stack_entry->end -
2990 new_stack_entry->start);
2991 if (is_procstack)
2992 vm->vm_ssize += btoc(new_stack_entry->end -
2993 new_stack_entry->start);
2994 }
2995 }
2996
2997done:
2998 if (use_read_lock)
2999 vm_map_unlock_read(map);
3000 else
3001 vm_map_unlock(map);
3002 vm_map_entry_release(count);
3003 return (rv);
3004}
3005
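/*
 * Example (sketch): a page-fault path would typically attempt to grow
 * the stack before falling through to normal fault handling. This is a
 * simplified placeholder for the machine-dependent grow_stack() logic,
 * not the actual trap code. Note that KERN_SUCCESS is also returned
 * when the faulting address lies outside the stack range entirely.
 */
static __inline int
example_try_grow_stack(struct proc *p, vm_offset_t fault_addr)
{
	return (vm_map_growstack(p, fault_addr));
}
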
3006/*
3007 * Unshare the specified VM space for exec. If other processes are
3008 * mapped to it, then create a new one. The new vmspace is null.
3009 */
3010
3011void
3012vmspace_exec(struct proc *p, struct vmspace *vmcopy)
3013{
3014 struct vmspace *oldvmspace = p->p_vmspace;
3015 struct vmspace *newvmspace;
3016 vm_map_t map = &p->p_vmspace->vm_map;
3017
3018 /*
3019 * If we are execing a resident vmspace we fork it, otherwise
3020 * we create a new vmspace. Note that exitingcnt and upcalls
3021 * are not copied to the new vmspace.
3022 */
3023 if (vmcopy) {
3024 newvmspace = vmspace_fork(vmcopy);
3025 } else {
3026 newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
3027 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
3028 (caddr_t)&oldvmspace->vm_endcopy -
3029 (caddr_t)&oldvmspace->vm_startcopy);
3030 }
3031
3032 /*
3033 * This code is written like this for prototype purposes. The
3034 * goal is to avoid running down the vmspace here, but let the
3035	 * other processes that are still using the vmspace finally
3036 * run it down. Even though there is little or no chance of blocking
3037 * here, it is a good idea to keep this form for future mods.
3038 */
3039 p->p_vmspace = newvmspace;
3040 pmap_pinit2(vmspace_pmap(newvmspace));
3041 if (p == curproc)
3042 pmap_activate(p);
3043 vmspace_free(oldvmspace);
3044}
3045
3046/*
3047 * Unshare the specified VM space for forcing COW. This
3048 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3049 *
3050 * The exitingcnt test is not strictly necessary but has been
3051 * included for code sanity (to make the code a bit more deterministic).
3052 */
3053
3054void
3055vmspace_unshare(struct proc *p)
3056{
3057 struct vmspace *oldvmspace = p->p_vmspace;
3058 struct vmspace *newvmspace;
3059
3060 if (oldvmspace->vm_refcnt == 1 && oldvmspace->vm_exitingcnt == 0)
3061 return;
3062 newvmspace = vmspace_fork(oldvmspace);
3063 p->p_vmspace = newvmspace;
3064 pmap_pinit2(vmspace_pmap(newvmspace));
3065 if (p == curproc)
3066 pmap_activate(p);
3067 vmspace_free(oldvmspace);
3068}
3069
3070/*
3071 * vm_map_lookup:
3072 *
3073 * Finds the VM object, offset, and
3074 * protection for a given virtual address in the
3075 * specified map, assuming a page fault of the
3076 * type specified.
3077 *
3078 * Leaves the map in question locked for read; return
3079 * values are guaranteed until a vm_map_lookup_done
3080 * call is performed. Note that the map argument
3081 * is in/out; the returned map must be used in
3082 * the call to vm_map_lookup_done.
3083 *
3084 * A handle (out_entry) is returned for use in
3085 * vm_map_lookup_done, to make that fast.
3086 *
3087 * If a lookup is requested with "write protection"
3088 * specified, the map may be changed to perform virtual
3089 * copying operations, although the data referenced will
3090 * remain the same.
3091 */
3092int
3093vm_map_lookup(vm_map_t *var_map, /* IN/OUT */
3094 vm_offset_t vaddr,
3095 vm_prot_t fault_typea,
3096 vm_map_entry_t *out_entry, /* OUT */
3097 vm_object_t *object, /* OUT */
3098 vm_pindex_t *pindex, /* OUT */
3099 vm_prot_t *out_prot, /* OUT */
3100 boolean_t *wired) /* OUT */
3101{
3102 vm_map_entry_t entry;
3103 vm_map_t map = *var_map;
3104 vm_prot_t prot;
3105 vm_prot_t fault_type = fault_typea;
3106 int use_read_lock = 1;
3107 int rv = KERN_SUCCESS;
3108
3109RetryLookup:
3110 if (use_read_lock)
3111 vm_map_lock_read(map);
3112 else
3113 vm_map_lock(map);
3114
3115 /*
3116 * If the map has an interesting hint, try it before calling full
3117 * blown lookup routine.
3118 */
3119 entry = map->hint;
3120 *out_entry = entry;
3121
3122 if ((entry == &map->header) ||
3123 (vaddr < entry->start) || (vaddr >= entry->end)) {
3124 vm_map_entry_t tmp_entry;
3125
3126 /*
3127 * Entry was either not a valid hint, or the vaddr was not
3128 * contained in the entry, so do a full lookup.
3129 */
3130 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
3131 rv = KERN_INVALID_ADDRESS;
3132 goto done;
3133 }
3134
3135 entry = tmp_entry;
3136 *out_entry = entry;
3137 }
3138
3139 /*
3140 * Handle submaps.
3141 */
3142
3143 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3144 vm_map_t old_map = map;
3145
3146 *var_map = map = entry->object.sub_map;
3147 if (use_read_lock)
3148 vm_map_unlock_read(old_map);
3149 else
3150 vm_map_unlock(old_map);
3151 use_read_lock = 1;
3152 goto RetryLookup;
3153 }
3154
3155 /*
3156 * Check whether this task is allowed to have this page.
3157 * Note the special case for MAP_ENTRY_COW
3158 * pages with an override. This is to implement a forced
3159 * COW for debuggers.
3160 */
3161
3162 if (fault_type & VM_PROT_OVERRIDE_WRITE)
3163 prot = entry->max_protection;
3164 else
3165 prot = entry->protection;
3166
3167 fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
3168 if ((fault_type & prot) != fault_type) {
3169 rv = KERN_PROTECTION_FAILURE;
3170 goto done;
3171 }
3172
3173 if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3174 (entry->eflags & MAP_ENTRY_COW) &&
3175 (fault_type & VM_PROT_WRITE) &&
3176 (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
3177 rv = KERN_PROTECTION_FAILURE;
3178 goto done;
3179 }
3180
3181 /*
3182 * If this page is not pageable, we have to get it for all possible
3183 * accesses.
3184 */
3185
3186 *wired = (entry->wired_count != 0);
3187 if (*wired)
3188 prot = fault_type = entry->protection;
3189
3190 /*
3191 * If the entry was copy-on-write, we either ...
3192 */
3193
3194 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3195 /*
3196 * If we want to write the page, we may as well handle that
3197 * now since we've got the map locked.
3198 *
3199 * If we don't need to write the page, we just demote the
3200 * permissions allowed.
3201 */
3202
3203 if (fault_type & VM_PROT_WRITE) {
3204 /*
3205 * Make a new object, and place it in the object
3206 * chain. Note that no new references have appeared
3207 * -- one just moved from the map to the new
3208 * object.
3209 */
3210
3211 if (use_read_lock && vm_map_lock_upgrade(map)) {
3212 use_read_lock = 0;
3213 goto RetryLookup;
3214 }
3215 use_read_lock = 0;
3216
3217 vm_object_shadow(
3218 &entry->object.vm_object,
3219 &entry->offset,
3220 atop(entry->end - entry->start));
3221
3222 entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3223 } else {
3224 /*
3225 * We're attempting to read a copy-on-write page --
3226 * don't allow writes.
3227 */
3228
3229 prot &= ~VM_PROT_WRITE;
3230 }
3231 }
3232
3233 /*
3234 * Create an object if necessary.
3235 */
3236 if (entry->object.vm_object == NULL &&
3237 !map->system_map) {
3238 if (use_read_lock && vm_map_lock_upgrade(map)) {
3239 use_read_lock = 0;
3240 goto RetryLookup;
3241 }
3242 use_read_lock = 0;
3243 entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
3244 atop(entry->end - entry->start));
3245 entry->offset = 0;
3246 }
3247
3248 /*
3249 * Return the object/offset from this entry. If the entry was
3250 * copy-on-write or empty, it has been fixed up.
3251 */
3252
3253 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
3254 *object = entry->object.vm_object;
3255
3256 /*
3257 * Return whether this is the only map sharing this data. On
3258 * success we return with a read lock held on the map. On failure
3259 * we return with the map unlocked.
3260 */
3261 *out_prot = prot;
3262done:
3263 if (rv == KERN_SUCCESS) {
3264 if (use_read_lock == 0)
3265 vm_map_lock_downgrade(map);
3266 } else if (use_read_lock) {
3267 vm_map_unlock_read(map);
3268 } else {
3269 vm_map_unlock(map);
3270 }
3271 return (rv);
3272}
3273
3274/*
3275 * vm_map_lookup_done:
3276 *
3277 * Releases locks acquired by a vm_map_lookup
3278 * (according to the handle returned by that lookup).
3279 */
3280
3281void
3282vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
3283{
3284 /*
3285 * Unlock the main-level map
3286 */
3287 vm_map_unlock_read(map);
3288 if (count)
3289 vm_map_entry_release(count);
3290}
3291
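/*
 * Example (sketch): the lookup/use/done pattern. On success the map is
 * returned read-locked and must be released with vm_map_lookup_done()
 * using the entry handle from the lookup. The count argument is 0 here
 * because this caller reserved no map entries; objp and pindexp are
 * placeholders for wherever the caller wants the results stored.
 */
static __inline int
example_lookup_page(vm_map_t map, vm_offset_t va, vm_object_t *objp,
    vm_pindex_t *pindexp)
{
	vm_map_entry_t entry;
	vm_prot_t prot;
	boolean_t wired;
	int rv;

	rv = vm_map_lookup(&map, va, VM_PROT_READ, &entry,
	    objp, pindexp, &prot, &wired);
	if (rv != KERN_SUCCESS)
		return (rv);

	/* ... look up or fault in pages of *objp at *pindexp ... */

	vm_map_lookup_done(map, entry, 0);
	return (KERN_SUCCESS);
}
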
3292/*
3293 * Performs the copy_on_write operations necessary to allow the virtual copies
3294 * into user space to work. This has to be called for write(2) system calls
3295 * from other processes, file unlinking, and file size shrinkage.
3296 */
3297void
3298vm_freeze_copyopts(vm_object_t object, vm_pindex_t froma, vm_pindex_t toa)
3299{
3300 int rv;
3301 vm_object_t robject;
3302 vm_pindex_t idx;
3303
3304 if ((object == NULL) ||
3305 ((object->flags & OBJ_OPT) == 0))
3306 return;
3307
3308 if (object->shadow_count > object->ref_count)
3309 panic("vm_freeze_copyopts: sc > rc");
3310
3311 while ((robject = LIST_FIRST(&object->shadow_head)) != NULL) {
3312 vm_pindex_t bo_pindex;
3313 vm_page_t m_in, m_out;
3314
3315 bo_pindex = OFF_TO_IDX(robject->backing_object_offset);
3316
3317 vm_object_reference(robject);
3318
3319 vm_object_pip_wait(robject, "objfrz");
3320
3321 if (robject->ref_count == 1) {
3322 vm_object_deallocate(robject);
3323 continue;
3324 }
3325
3326 vm_object_pip_add(robject, 1);
3327
3328 for (idx = 0; idx < robject->size; idx++) {
3329
3330 m_out = vm_page_grab(robject, idx,
3331 VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
3332
3333 if (m_out->valid == 0) {
3334 m_in = vm_page_grab(object, bo_pindex + idx,
3335 VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
3336 if (m_in->valid == 0) {
3337 rv = vm_pager_get_pages(object, &m_in, 1, 0);
3338 if (rv != VM_PAGER_OK) {
3339 printf("vm_freeze_copyopts: cannot read page from file: %lx\n", (long)m_in->pindex);
3340 continue;
3341 }
3342 vm_page_deactivate(m_in);
3343 }
3344
3345 vm_page_protect(m_in, VM_PROT_NONE);
3346 pmap_copy_page(VM_PAGE_TO_PHYS(m_in), VM_PAGE_TO_PHYS(m_out));
3347 m_out->valid = m_in->valid;
3348 vm_page_dirty(m_out);
3349 vm_page_activate(m_out);
3350 vm_page_wakeup(m_in);
3351 }
3352 vm_page_wakeup(m_out);
3353 }
3354
3355 object->shadow_count--;
3356 object->ref_count--;
3357 LIST_REMOVE(robject, shadow_list);
3358 robject->backing_object = NULL;
3359 robject->backing_object_offset = 0;
3360
3361 vm_object_pip_wakeup(robject);
3362 vm_object_deallocate(robject);
3363 }
3364
3365 vm_object_clear_flag(object, OBJ_OPT);
3366}
3367
3368#include "opt_ddb.h"
3369#ifdef DDB
3370#include <sys/kernel.h>
3371
3372#include <ddb/ddb.h>
3373
3374/*
3375 * vm_map_print: [ debug ]
3376 */
3377DB_SHOW_COMMAND(map, vm_map_print)
3378{
3379 static int nlines;
3380 /* XXX convert args. */
3381 vm_map_t map = (vm_map_t)addr;
3382 boolean_t full = have_addr;
3383
3384 vm_map_entry_t entry;
3385
3386 db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
3387 (void *)map,
3388 (void *)map->pmap, map->nentries, map->timestamp);
3389 nlines++;
3390
3391 if (!full && db_indent)
3392 return;
3393
3394 db_indent += 2;
3395 for (entry = map->header.next; entry != &map->header;
3396 entry = entry->next) {
3397 db_iprintf("map entry %p: start=%p, end=%p\n",
3398 (void *)entry, (void *)entry->start, (void *)entry->end);
3399 nlines++;
3400 {
3401 static char *inheritance_name[4] =
3402 {"share", "copy", "none", "donate_copy"};
3403
3404 db_iprintf(" prot=%x/%x/%s",
3405 entry->protection,
3406 entry->max_protection,
3407 inheritance_name[(int)(unsigned char)entry->inheritance]);
3408 if (entry->wired_count != 0)
3409 db_printf(", wired");
3410 }
3411 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3412 /* XXX no %qd in kernel. Truncate entry->offset. */
3413 db_printf(", share=%p, offset=0x%lx\n",
3414 (void *)entry->object.sub_map,
3415 (long)entry->offset);
3416 nlines++;
3417 if ((entry->prev == &map->header) ||
3418 (entry->prev->object.sub_map !=
3419 entry->object.sub_map)) {
3420 db_indent += 2;
3421 vm_map_print((db_expr_t)(intptr_t)
3422 entry->object.sub_map,
3423 full, 0, (char *)0);
3424 db_indent -= 2;
3425 }
3426 } else {
3427 /* XXX no %qd in kernel. Truncate entry->offset. */
3428 db_printf(", object=%p, offset=0x%lx",
3429 (void *)entry->object.vm_object,
3430 (long)entry->offset);
3431 if (entry->eflags & MAP_ENTRY_COW)
3432 db_printf(", copy (%s)",
3433 (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
3434 db_printf("\n");
3435 nlines++;
3436
3437 if ((entry->prev == &map->header) ||
3438 (entry->prev->object.vm_object !=
3439 entry->object.vm_object)) {
3440 db_indent += 2;
3441 vm_object_print((db_expr_t)(intptr_t)
3442 entry->object.vm_object,
3443 full, 0, (char *)0);
3444 nlines += 4;
3445 db_indent -= 2;
3446 }
3447 }
3448 }
3449 db_indent -= 2;
3450 if (db_indent == 0)
3451 nlines = 0;
3452}
3453
3454
3455DB_SHOW_COMMAND(procvm, procvm)
3456{
3457 struct proc *p;
3458
3459 if (have_addr) {
3460 p = (struct proc *) addr;
3461 } else {
3462 p = curproc;
3463 }
3464
3465 db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
3466 (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
3467 (void *)vmspace_pmap(p->p_vmspace));
3468
3469 vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
3470}
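
/*
 * Example: from the in-kernel debugger the commands defined above can
 * be used to dump the VM map of the current process, or of an
 * arbitrary map or process address:
 *
 *	db> show procvm
 *	db> show map <map-address>
 */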
3471
3472#endif /* DDB */