1 /*
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by the University of
19  *      California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
65  * $DragonFly: src/sys/vm/vm_map.c,v 1.55 2007/01/07 08:37:37 dillon Exp $
66  */
67
68 /*
69  *      Virtual memory mapping module.
70  */
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/proc.h>
75 #include <sys/lock.h>
76 #include <sys/vmmeter.h>
77 #include <sys/mman.h>
78 #include <sys/vnode.h>
79 #include <sys/resourcevar.h>
80 #include <sys/shm.h>
81 #include <sys/tree.h>
82
83 #include <vm/vm.h>
84 #include <vm/vm_param.h>
85 #include <vm/pmap.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_pager.h>
90 #include <vm/vm_kern.h>
91 #include <vm/vm_extern.h>
92 #include <vm/swap_pager.h>
93 #include <vm/vm_zone.h>
94
95 #include <sys/thread2.h>
96
97 /*
98  *      Virtual memory maps provide for the mapping, protection,
99  *      and sharing of virtual memory objects.  In addition,
100  *      this module provides for an efficient virtual copy of
101  *      memory from one map to another.
102  *
103  *      Synchronization is required prior to most operations.
104  *
105  *      Maps consist of an ordered doubly-linked list of simple
106  *      entries; a single hint is used to speed up lookups.
107  *
108  *      Since portions of maps are specified by start/end addresses,
109  *      which may not align with existing map entries, all
110  *      routines merely "clip" entries to these start/end values.
111  *      [That is, an entry is split into two, bordering at a
112  *      start or end value.]  Note that these clippings may not
113  *      always be necessary (as the two resulting entries are then
114  *      not changed); however, the clipping is done for convenience.
115  *
116  *      As mentioned above, virtual copy operations are performed
117  *      by copying VM object references from one map to
118  *      another, and then marking both regions as copy-on-write.
119  */
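/*
 * For illustration: if an entry covers the (hypothetical) range
 * [0x1000, 0x5000) and an operation targets [0x2000, 0x5000), the
 * clipping described above splits the entry so that one entry covers
 * [0x1000, 0x2000) and another covers [0x2000, 0x5000); the operation
 * then affects only the second entry.
 */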
120
121 /*
122  *      vm_map_startup:
123  *
124  *      Initialize the vm_map module.  Must be called before
125  *      any other vm_map routines.
126  *
127  *      Map and entry structures are allocated from the general
128  *      purpose memory pool with some exceptions:
129  *
130  *      - The kernel map and kmem submap are allocated statically.
131  *      - Kernel map entries are allocated out of a static pool.
132  *
133  *      These restrictions are necessary since malloc() uses the
134  *      maps and requires map entries.
135  */
136
137 #define VMEPERCPU       2
138
139 static struct vm_zone mapentzone_store, mapzone_store;
140 static vm_zone_t mapentzone, mapzone, vmspace_zone;
141 static struct vm_object mapentobj, mapobj;
142
143 static struct vm_map_entry map_entry_init[MAX_MAPENT];
144 static struct vm_map_entry cpu_map_entry_init[MAXCPU][VMEPERCPU];
145 static struct vm_map map_init[MAX_KMAP];
146
147 static void vm_map_entry_shadow(vm_map_entry_t entry);
148 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
149 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
150 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
151 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
152 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
153 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
154 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
155                 vm_map_entry_t);
156 static void vm_map_split (vm_map_entry_t);
157 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags);
158
159 void
160 vm_map_startup(void)
161 {
162         mapzone = &mapzone_store;
163         zbootinit(mapzone, "MAP", sizeof (struct vm_map),
164                 map_init, MAX_KMAP);
165         mapentzone = &mapentzone_store;
166         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
167                 map_entry_init, MAX_MAPENT);
168 }
169
170 /*
171  * Red black tree functions
172  */
173 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
174 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
175
176 /* a->start is the address; it is the only field that must be initialized */
177 static int
178 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
179 {
180         if (a->start < b->start)
181                 return(-1);
182         else if (a->start > b->start)
183                 return(1);
184         return(0);
185 }
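/*
 * For illustration: because the comparison only examines ->start, a
 * lookup key may be a local vm_map_entry with just that field set.
 * The variable names below are hypothetical; RB_FIND() is the lookup
 * macro generated via <sys/tree.h> by RB_GENERATE() above.
 *
 *      struct vm_map_entry key;
 *      vm_map_entry_t res;
 *
 *      key.start = address;
 *      res = RB_FIND(vm_map_rb_tree, &map->rb_root, &key);
 */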
186
187 /*
188  * Allocate a vmspace structure, including a vm_map and pmap,
189  * and initialize those structures.  The refcnt is set to 1.
190  * The remaining fields must be initialized by the caller.
191  */
192 struct vmspace *
193 vmspace_alloc(vm_offset_t min, vm_offset_t max)
194 {
195         struct vmspace *vm;
196
197         vm = zalloc(vmspace_zone);
198         bzero(&vm->vm_startcopy,
199                 (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
200         vm_map_init(&vm->vm_map, min, max, NULL);
201         pmap_pinit(vmspace_pmap(vm));
202         vm->vm_map.pmap = vmspace_pmap(vm);             /* XXX */
203         vm->vm_refcnt = 1;
204         vm->vm_shm = NULL;
205         vm->vm_exitingcnt = 0;
206         cpu_vmspace_alloc(vm);
207         return (vm);
208 }
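/*
 * For illustration: a caller might allocate a vmspace with the
 * platform's user address bounds and then initialize the fields that
 * vmspace_alloc() leaves alone.  'user_min_addr', 'user_max_addr' and
 * 'text_start' are hypothetical placeholders.
 *
 *      struct vmspace *vm;
 *
 *      vm = vmspace_alloc(user_min_addr, user_max_addr);
 *      vm->vm_taddr = text_start;
 *
 * Only vm_taddr is shown; the remaining copy-range fields are filled
 * in the same way by the caller.
 */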
209
210 void
211 vm_init2(void) 
212 {
213         zinitna(mapentzone, &mapentobj, NULL, 0, 0, 
214                 ZONE_USE_RESERVE | ZONE_SPECIAL, 1);
215         zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1);
216         vmspace_zone = zinit("VMSPACE", sizeof (struct vmspace), 0, 0, 3);
217         pmap_init2();
218         vm_object_init2();
219 }
220
221 static __inline void
222 vmspace_dofree(struct vmspace *vm)
223 {
224         int count;
225
226         cpu_vmspace_free(vm);
227
228         /*
229          * Make sure any SysV shm is freed; it might not have been
230          * freed in exit1().
231          */
232         shmexit(vm);
233
234         KKASSERT(vm->vm_upcalls == NULL);
235
236         /*
237          * Lock the map, to wait out all other references to it.
238          * Delete all of the mappings and pages they hold, then call
239          * the pmap module to reclaim anything left.
240          */
241         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
242         vm_map_lock(&vm->vm_map);
243         vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
244                 vm->vm_map.max_offset, &count);
245         vm_map_unlock(&vm->vm_map);
246         vm_map_entry_release(count);
247
248         pmap_release(vmspace_pmap(vm));
249         zfree(vmspace_zone, vm);
250 }
251
252 void
253 vmspace_free(struct vmspace *vm)
254 {
255         if (vm->vm_refcnt == 0)
256                 panic("vmspace_free: attempt to free already freed vmspace");
257
258         if (--vm->vm_refcnt == 0 && vm->vm_exitingcnt == 0)
259                 vmspace_dofree(vm);
260 }
261
262 void
263 vmspace_exitfree(struct proc *p)
264 {
265         struct vmspace *vm;
266
267         vm = p->p_vmspace;
268         p->p_vmspace = NULL;
269
270         /*
271          * cleanup by parent process wait()ing on exiting child.  vm_refcnt
272          * may not be 0 (e.g. fork() and child exits without exec()ing).
273          * exitingcnt may increment above 0 and drop back down to zero
274          * several times while vm_refcnt is held non-zero.  vm_refcnt
275          * may also increment above 0 and drop back down to zero several
276          * times while vm_exitingcnt is held non-zero.
277          *
278          * The last wait on the exiting child's vmspace will clean up
279          * the remainder of the vmspace.
280          */
281         if (--vm->vm_exitingcnt == 0 && vm->vm_refcnt == 0)
282                 vmspace_dofree(vm);
283 }
284
285 /*
286  * vmspace_swap_count() - count the approximate swap usage in pages for a
287  *                        vmspace.
288  *
289  *      Swap usage is determined by taking the proportional swap used by
290  *      VM objects backing the VM map.  To make up for fractional losses,
291  *      if the VM object has any swap use at all the associated map entries
292  *      count for at least 1 swap page.
293  */
294 int
295 vmspace_swap_count(struct vmspace *vmspace)
296 {
297         vm_map_t map = &vmspace->vm_map;
298         vm_map_entry_t cur;
299         vm_object_t object;
300         int count = 0;
301         int n;
302
303         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
304                 switch(cur->maptype) {
305                 case VM_MAPTYPE_NORMAL:
306                 case VM_MAPTYPE_VPAGETABLE:
307                         if ((object = cur->object.vm_object) == NULL)
308                                 break;
309                         if (object->type != OBJT_SWAP)
310                                 break;
311                         n = (cur->end - cur->start) / PAGE_SIZE;
312                         if (object->un_pager.swp.swp_bcount) {
313                                 count += object->un_pager.swp.swp_bcount *
314                                     SWAP_META_PAGES * n / object->size + 1;
315                         }
316                         break;
317                 default:
318                         break;
319                 }
320         }
321         return(count);
322 }
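/*
 * Worked example of the proportional calculation above (the numbers
 * are made up, and SWAP_META_PAGES is assumed to be 16): a map entry
 * spanning 8 pages (n = 8) backed by a 64-page swap object with
 * swp_bcount == 2 contributes 2 * 16 * 8 / 64 + 1 = 5 pages to the
 * count; the "+ 1" is the fractional-loss floor mentioned in the
 * header comment.
 */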
323
324
325 /*
326  *      vm_map_create:
327  *
328  *      Creates and returns a new empty VM map with
329  *      the given physical map structure, and having
330  *      the given lower and upper address bounds.
331  */
332 vm_map_t
333 vm_map_create(vm_map_t result, pmap_t pmap, vm_offset_t min, vm_offset_t max)
334 {
335         if (result == NULL)
336                 result = zalloc(mapzone);
337         vm_map_init(result, min, max, pmap);
338         return (result);
339 }
340
341 /*
342  * Initialize an existing vm_map structure
343  * such as that in the vmspace structure.
344  * The pmap is set elsewhere.
345  */
346 void
347 vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
348 {
349         map->header.next = map->header.prev = &map->header;
350         RB_INIT(&map->rb_root);
351         map->nentries = 0;
352         map->size = 0;
353         map->system_map = 0;
354         map->infork = 0;
355         map->min_offset = min;
356         map->max_offset = max;
357         map->pmap = pmap;
358         map->first_free = &map->header;
359         map->hint = &map->header;
360         map->timestamp = 0;
361         lockinit(&map->lock, "thrd_sleep", 0, 0);
362 }
363
364 /*
365  * Shadow the vm_map_entry's object.  This typically needs to be done when
366  * a write fault is taken on an entry which had previously been cloned by
367  * fork().  The shared object (which might be NULL) must become private so
368  * we add a shadow layer above it.
369  *
370  * Object allocation for anonymous mappings is deferred as long as possible.
371  * When creating a shadow, however, the underlying object must be instantiated
372  * so it can be shared.
373  *
374  * If the map segment is governed by a virtual page table then it is
375  * possible to address offsets beyond the mapped area.  Just allocate
376  * a maximally sized object for this case.
377  */
378 static
379 void
380 vm_map_entry_shadow(vm_map_entry_t entry)
381 {
382         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
383                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
384                                  0x7FFFFFFF);   /* XXX */
385         } else {
386                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
387                                  atop(entry->end - entry->start));
388         }
389         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
390 }
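/*
 * For illustration: a fault handler of the kind described above might
 * trigger the shadow like this on a write to a copy-on-write entry.
 * The surrounding control flow is hypothetical; only the flag test and
 * the call correspond to this file.
 *
 *      if (fault_type & VM_PROT_WRITE) {
 *              if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
 *                      vm_map_entry_shadow(entry);
 *      }
 */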
391
392 /*
393  * Allocate an object for a vm_map_entry.
394  *
395  * Object allocation for anonymous mappings is deferred as long as possible.
396  * This function is called when we can defer no longer, generally when a map
397  * entry might be split or forked or takes a page fault.
398  *
399  * If the map segment is governed by a virtual page table then it is
400  * possible to address offsets beyond the mapped area.  Just allocate
401  * a maximally sized object for this case.
402  */
403 void 
404 vm_map_entry_allocate_object(vm_map_entry_t entry)
405 {
406         vm_object_t obj;
407
408         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
409                 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
410         } else {
411                 obj = vm_object_allocate(OBJT_DEFAULT,
412                                          atop(entry->end - entry->start));
413         }
414         entry->object.vm_object = obj;
415         entry->offset = 0;
416 }
417
418 /*
419  *      vm_map_entry_reserve_cpu_init:
420  *
421  *      Set an initial negative count so the first attempt to reserve
422  *      space preloads a bunch of vm_map_entry's for this cpu.  Also
423  *      pre-allocate 2 vm_map_entries which will be needed by zalloc() to
424  *      map a new page for vm_map_entry structures.  SMP systems are
425  *      particularly sensitive.
426  *
427  *      This routine is called in early boot so we cannot just call
428  *      vm_map_entry_reserve().
429  *
430  *      May be called for a gd other than mycpu, but may only be called
431  *      during early boot.
432  */
433 void
434 vm_map_entry_reserve_cpu_init(globaldata_t gd)
435 {
436         vm_map_entry_t entry;
437         int i;
438
439         gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2;
440         entry = &cpu_map_entry_init[gd->gd_cpuid][0];
441         for (i = 0; i < VMEPERCPU; ++i, ++entry) {
442                 entry->next = gd->gd_vme_base;
443                 gd->gd_vme_base = entry;
444         }
445 }
446
447 /*
448  *      vm_map_entry_reserve:
449  *
450  *      Reserves vm_map_entry structures so code later on can manipulate
451  *      map_entry structures within a locked map without blocking trying
452  *      to allocate a new vm_map_entry.
453  */
454 int
455 vm_map_entry_reserve(int count)
456 {
457         struct globaldata *gd = mycpu;
458         vm_map_entry_t entry;
459
460         crit_enter();
461
462         /*
463          * Make sure we have enough structures in gd_vme_base to handle
464          * the reservation request.
465          */
466         while (gd->gd_vme_avail < count) {
467                 entry = zalloc(mapentzone);
468                 entry->next = gd->gd_vme_base;
469                 gd->gd_vme_base = entry;
470                 ++gd->gd_vme_avail;
471         }
472         gd->gd_vme_avail -= count;
473         crit_exit();
474         return(count);
475 }
476
477 /*
478  *      vm_map_entry_release:
479  *
480  *      Releases previously reserved vm_map_entry structures that were not
481  *      used.  If we have too much junk in our per-cpu cache clean some of
482  *      used.  If we have too much junk in our per-cpu cache, clean some of
483  */
484 void
485 vm_map_entry_release(int count)
486 {
487         struct globaldata *gd = mycpu;
488         vm_map_entry_t entry;
489
490         crit_enter();
491         gd->gd_vme_avail += count;
492         while (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
493                 entry = gd->gd_vme_base;
494                 KKASSERT(entry != NULL);
495                 gd->gd_vme_base = entry->next;
496                 --gd->gd_vme_avail;
497                 crit_exit();
498                 zfree(mapentzone, entry);
499                 crit_enter();
500         }
501         crit_exit();
502 }
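/*
 * For illustration: the reserve/release pair brackets work on a locked
 * map, following the pattern used elsewhere in this file (for example
 * in vmspace_dofree() above):
 *
 *      count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *      vm_map_lock(map);
 *      ...                     (clip, insert, or delete entries)
 *      vm_map_unlock(map);
 *      vm_map_entry_release(count);
 */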
503
504 /*
505  *      vm_map_entry_kreserve:
506  *
507  *      Reserve map entry structures for use in kernel_map itself.  These
508  *      entries have *ALREADY* been reserved on a per-cpu basis when the map
509  *      was inited.  This function is used by zalloc() to avoid a recursion
510  *      when zalloc() itself needs to allocate additional kernel memory.
511  *
512  *      This function works like the normal reserve but does not load the
513  *      vm_map_entry cache (because that would result in an infinite
514  *      recursion).  Note that gd_vme_avail may go negative.  This is expected.
515  *
516  *      Any caller of this function must be sure to renormalize after 
517  *      potentially eating entries to ensure that the reserve supply
518  *      remains intact.
519  */
520 int
521 vm_map_entry_kreserve(int count)
522 {
523         struct globaldata *gd = mycpu;
524
525         crit_enter();
526         gd->gd_vme_avail -= count;
527         crit_exit();
528         KASSERT(gd->gd_vme_base != NULL, ("no reserved entries left, gd_vme_avail = %d\n", gd->gd_vme_avail));
529         return(count);
530 }
531
532 /*
533  *      vm_map_entry_krelease:
534  *
535  *      Release previously reserved map entries for kernel_map.  We do not
536  *      attempt to clean up like the normal release function as this would
537  *      cause an unnecessary (but probably not fatal) deep procedure call.
538  */
539 void
540 vm_map_entry_krelease(int count)
541 {
542         struct globaldata *gd = mycpu;
543
544         crit_enter();
545         gd->gd_vme_avail += count;
546         crit_exit();
547 }
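/*
 * For illustration: code that must manipulate kernel_map from inside
 * the allocator path uses the kreserve/krelease variants so it never
 * recurses into zalloc().  The sketch below assumes the usual
 * reserve-lock-unlock-release shape; as noted above, such a caller
 * must eventually renormalize the per-cpu reserve because
 * gd_vme_avail may go negative.
 *
 *      count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
 *      vm_map_lock(&kernel_map);
 *      ...                     (kernel_map manipulation)
 *      vm_map_unlock(&kernel_map);
 *      vm_map_entry_krelease(count);
 */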
548
549 /*
550  *      vm_map_entry_create:    [ internal use only ]
551  *
552  *      Allocates a VM map entry for insertion.  No entry fields are filled 
553  *      in.
554  *
555  *      This routine may be called from an interrupt thread but not a FAST
556  *      interrupt.  This routine may recurse the map lock.
557  */
558 static vm_map_entry_t
559 vm_map_entry_create(vm_map_t map, int *countp)
560 {
561         struct globaldata *gd = mycpu;
562         vm_map_entry_t entry;
563
564         KKASSERT(*countp > 0);
565         --*countp;
566         crit_enter();
567         entry = gd->gd_vme_base;
568         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
569         gd->gd_vme_base = entry->next;
570         crit_exit();
571         return(entry);
572 }
573
574 /*
575  *      vm_map_entry_dispose:   [ internal use only ]
576  *
577  *      Dispose of a vm_map_entry that is no longer being referenced.  This
578  *      function may be called from an interrupt.
579  */
580 static void
581 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
582 {
583         struct globaldata *gd = mycpu;
584
585         KKASSERT(map->hint != entry);
586         KKASSERT(map->first_free != entry);
587
588         ++*countp;
589         crit_enter();
590         entry->next = gd->gd_vme_base;
591         gd->gd_vme_base = entry;
592         crit_exit();
593 }
594
595
596 /*
597  *      vm_map_entry_{un,}link:
598  *
599  *      Insert/remove entries from maps.
600  */
601 static __inline void
602 vm_map_entry_link(vm_map_t map,
603                   vm_map_entry_t after_where,
604                   vm_map_entry_t entry)
605 {
606         map->nentries++;
607         entry->prev = after_where;
608         entry->next = after_where->next;
609         entry->next->prev = entry;
610         after_where->next = entry;
611         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
612                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
613 }
614
615 static __inline void
616 vm_map_entry_unlink(vm_map_t map,
617                     vm_map_entry_t entry)
618 {
619         vm_map_entry_t prev;
620         vm_map_entry_t next;
621
622         if (entry->eflags & MAP_ENTRY_IN_TRANSITION)
623                 panic("vm_map_entry_unlink: attempt to mess with locked entry! %p", entry);
624         prev = entry->prev;
625         next = entry->next;
626         next->prev = prev;
627         prev->next = next;
628         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
629         map->nentries--;
630 }
631
632 /*
633  *      vm_map_lookup_entry:    [ internal use only ]
634  *
635  *      Finds the map entry containing (or
636  *      immediately preceding) the specified address
637  *      in the given map; the entry is returned
638  *      in the "entry" parameter.  The boolean
639  *      result indicates whether the address is
640  *      actually contained in the map.
641  */
642 boolean_t
643 vm_map_lookup_entry(vm_map_t map, vm_offset_t address,
644     vm_map_entry_t *entry /* OUT */)
645 {
646         vm_map_entry_t tmp;
647         vm_map_entry_t last;
648
649 #if 0
650         /*
651          * XXX TEMPORARILY DISABLED.  For some reason our attempt to revive
652          * the hint code with the red-black lookup meets with system crashes
653          * and lockups.  We do not yet know why.
654          *
655          * It is possible that the problem is related to the setting
656          * of the hint during map_entry deletion, in the code specified
657          * at the GGG comment later on in this file.
658          */
659         /*
660          * Quickly check the cached hint, there's a good chance of a match.
661          */
662         if (map->hint != &map->header) {
663                 tmp = map->hint;
664                 if (address >= tmp->start && address < tmp->end) {
665                         *entry = tmp;
666                         return(TRUE);
667                 }
668         }
669 #endif
670
671         /*
672          * Locate the record from the top of the tree.  'last' tracks the
673          * closest prior record and is returned if no match is found, which
674          * in binary tree terms means tracking the most recent right-branch
675          * taken.  If there is no prior record, &map->header is returned.
676          */
677         last = &map->header;
678         tmp = RB_ROOT(&map->rb_root);
679
680         while (tmp) {
681                 if (address >= tmp->start) {
682                         if (address < tmp->end) {
683                                 *entry = tmp;
684                                 map->hint = tmp;
685                                 return(TRUE);
686                         }
687                         last = tmp;
688                         tmp = RB_RIGHT(tmp, rb_entry);
689                 } else {
690                         tmp = RB_LEFT(tmp, rb_entry);
691                 }
692         }
693         *entry = last;
694         return (FALSE);
695 }
696
697 /*
698  *      vm_map_insert:
699  *
700  *      Inserts the given whole VM object into the target
701  *      map at the specified address range.  The object's
702  *      size should match that of the address range.
703  *
704  *      Requires that the map be locked, and leaves it so.  Requires that
705  *      sufficient vm_map_entry structures have been reserved and tracks
706  *      the use via countp.
707  *
708  *      If object is non-NULL, ref count must be bumped by caller
709  *      prior to making call to account for the new entry.
710  */
711 int
712 vm_map_insert(vm_map_t map, int *countp,
713               vm_object_t object, vm_ooffset_t offset,
714               vm_offset_t start, vm_offset_t end,
715               vm_maptype_t maptype,
716               vm_prot_t prot, vm_prot_t max,
717               int cow)
718 {
719         vm_map_entry_t new_entry;
720         vm_map_entry_t prev_entry;
721         vm_map_entry_t temp_entry;
722         vm_eflags_t protoeflags;
723
724         /*
725          * Check that the start and end points are not bogus.
726          */
727
728         if ((start < map->min_offset) || (end > map->max_offset) ||
729             (start >= end))
730                 return (KERN_INVALID_ADDRESS);
731
732         /*
733          * Find the entry prior to the proposed starting address; if it's part
734          * of an existing entry, this range is bogus.
735          */
736
737         if (vm_map_lookup_entry(map, start, &temp_entry))
738                 return (KERN_NO_SPACE);
739
740         prev_entry = temp_entry;
741
742         /*
743          * Assert that the next entry doesn't overlap the end point.
744          */
745
746         if ((prev_entry->next != &map->header) &&
747             (prev_entry->next->start < end))
748                 return (KERN_NO_SPACE);
749
750         protoeflags = 0;
751
752         if (cow & MAP_COPY_ON_WRITE)
753                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
754
755         if (cow & MAP_NOFAULT) {
756                 protoeflags |= MAP_ENTRY_NOFAULT;
757
758                 KASSERT(object == NULL,
759                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
760         }
761         if (cow & MAP_DISABLE_SYNCER)
762                 protoeflags |= MAP_ENTRY_NOSYNC;
763         if (cow & MAP_DISABLE_COREDUMP)
764                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
765
766         if (object) {
767                 /*
768                  * When object is non-NULL, it could be shared with another
769                  * process.  We have to set or clear OBJ_ONEMAPPING 
770                  * appropriately.
771                  */
772                 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
773                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
774                 }
775         }
776         else if ((prev_entry != &map->header) &&
777                  (prev_entry->eflags == protoeflags) &&
778                  (prev_entry->end == start) &&
779                  (prev_entry->wired_count == 0) &&
780                  prev_entry->maptype == maptype &&
781                  ((prev_entry->object.vm_object == NULL) ||
782                   vm_object_coalesce(prev_entry->object.vm_object,
783                                      OFF_TO_IDX(prev_entry->offset),
784                                      (vm_size_t)(prev_entry->end - prev_entry->start),
785                                      (vm_size_t)(end - prev_entry->end)))) {
786                 /*
787                  * We were able to extend the object.  Determine if we
788                  * can extend the previous map entry to include the 
789                  * new range as well.
790                  */
791                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
792                     (prev_entry->protection == prot) &&
793                     (prev_entry->max_protection == max)) {
794                         map->size += (end - prev_entry->end);
795                         prev_entry->end = end;
796                         vm_map_simplify_entry(map, prev_entry, countp);
797                         return (KERN_SUCCESS);
798                 }
799
800                 /*
801                  * If we can extend the object but cannot extend the
802                  * map entry, we have to create a new map entry.  We
803                  * must bump the ref count on the extended object to
804                  * account for it.  object may be NULL.
805                  */
806                 object = prev_entry->object.vm_object;
807                 offset = prev_entry->offset +
808                         (prev_entry->end - prev_entry->start);
809                 vm_object_reference(object);
810         }
811
812         /*
813          * NOTE: if conditionals fail, object can be NULL here.  This occurs
814          * in things like the buffer map where we manage kva but do not manage
815          * backing objects.
816          */
817
818         /*
819          * Create a new entry
820          */
821
822         new_entry = vm_map_entry_create(map, countp);
823         new_entry->start = start;
824         new_entry->end = end;
825
826         new_entry->maptype = maptype;
827         new_entry->eflags = protoeflags;
828         new_entry->object.vm_object = object;
829         new_entry->offset = offset;
830         new_entry->aux.master_pde = 0;
831
832         new_entry->inheritance = VM_INHERIT_DEFAULT;
833         new_entry->protection = prot;
834         new_entry->max_protection = max;
835         new_entry->wired_count = 0;
836
837         /*
838          * Insert the new entry into the list
839          */
840
841         vm_map_entry_link(map, prev_entry, new_entry);
842         map->size += new_entry->end - new_entry->start;
843
844         /*
845          * Update the free space hint
846          */
847         if ((map->first_free == prev_entry) &&
848             (prev_entry->end >= new_entry->start)) {
849                 map->first_free = new_entry;
850         }
851
852 #if 0
853         /*
854          * Temporarily removed to avoid MAP_STACK panic, due to
855          * MAP_STACK being a huge hack.  Will be added back in
856          * when MAP_STACK (and the user stack mapping) is fixed.
857          */
858         /*
859          * It may be possible to simplify the entry
860          */
861         vm_map_simplify_entry(map, new_entry, countp);
862 #endif
863
864         /*
865          * Try to pre-populate the page table.  Mappings governed by virtual
866          * page tables cannot be prepopulated without a lot of work, so
867          * don't try.
868          */
869         if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
870             maptype != VM_MAPTYPE_VPAGETABLE) {
871                 pmap_object_init_pt(map->pmap, start, prot,
872                                     object, OFF_TO_IDX(offset), end - start,
873                                     cow & MAP_PREFAULT_PARTIAL);
874         }
875
876         return (KERN_SUCCESS);
877 }
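/*
 * For illustration: a caller of vm_map_insert() supplies the
 * reservation, the map lock, and the extra object reference that the
 * header comment requires.  'addr', 'size' and 'rv' are hypothetical
 * locals; vm_map_find() below wraps almost exactly this sequence.
 *
 *      count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *      vm_map_lock(map);
 *      if (object)
 *              vm_object_reference(object);
 *      rv = vm_map_insert(map, &count, object, offset,
 *                         addr, addr + size, VM_MAPTYPE_NORMAL,
 *                         VM_PROT_ALL, VM_PROT_ALL, 0);
 *      vm_map_unlock(map);
 *      vm_map_entry_release(count);
 */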
878
879 /*
880  * Find sufficient space for `length' bytes in the given map, starting at
881  * `start'.  The map must be locked.  Returns 0 on success, 1 on no space.
882  *
883  * This function will return an arbitrarily aligned pointer.  If no
884  * particular alignment is required you should pass align as 1.  Note that
885  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
886  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
887  * argument.
888  *
889  * 'align' should be a power of 2 but is not required to be.
890  */
891 int
892 vm_map_findspace(
893         vm_map_t map,
894         vm_offset_t start,
895         vm_size_t length,
896         vm_offset_t align,
897         vm_offset_t *addr)
898 {
899         vm_map_entry_t entry, next;
900         vm_offset_t end;
901         vm_offset_t align_mask;
902
903         if (start < map->min_offset)
904                 start = map->min_offset;
905         if (start > map->max_offset)
906                 return (1);
907
908         /*
909          * If the alignment is not a power of 2 we will have to use
910          * a mod/division, set align_mask to a special value.
911          */
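        /*
         * Worked example of the power-of-2 test below: for align == 8,
         * (8 | 7) + 1 == 16 == 8 << 1, so the cheap mask path is used;
         * for align == 6, (6 | 5) + 1 == 8 != 12, so align_mask is set
         * to the special value and the mod/division path is taken.
         */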
912         if ((align | (align - 1)) + 1 != (align << 1))
913                 align_mask = (vm_offset_t)-1;
914         else
915                 align_mask = align - 1;
916
917 retry:
918         /*
919          * Look for the first possible address; if there's already something
920          * at this address, we have to start after it.
921          */
922         if (start == map->min_offset) {
923                 if ((entry = map->first_free) != &map->header)
924                         start = entry->end;
925         } else {
926                 vm_map_entry_t tmp;
927
928                 if (vm_map_lookup_entry(map, start, &tmp))
929                         start = tmp->end;
930                 entry = tmp;
931         }
932
933         /*
934          * Look through the rest of the map, trying to fit a new region in the
935          * gap between existing regions, or after the very last region.
936          */
937         for (;; start = (entry = next)->end) {
938                 /*
939                  * Adjust the proposed start by the requested alignment,
940                  * be sure that we didn't wrap the address.
941                  */
942                 if (align_mask == (vm_offset_t)-1)
943                         end = ((start + align - 1) / align) * align;
944                 else
945                         end = (start + align_mask) & ~align_mask;
946                 if (end < start)
947                         return (1);
948                 start = end;
949                 /*
950                  * Find the end of the proposed new region.  Be sure we didn't
951                  * go beyond the end of the map, or wrap around the address.
952                  * Then check to see if this is the last entry or if the 
953                  * proposed end fits in the gap between this and the next
954                  * entry.
955                  */
956                 end = start + length;
957                 if (end > map->max_offset || end < start)
958                         return (1);
959                 next = entry->next;
960                 if (next == &map->header || next->start >= end)
961                         break;
962         }
963         map->hint = entry;
964         if (map == &kernel_map) {
965                 vm_offset_t ksize;
966                 if ((ksize = round_page(start + length)) > kernel_vm_end) {
967                         pmap_growkernel(ksize);
968                         goto retry;
969                 }
970         }
971         *addr = start;
972         return (0);
973 }
974
975 /*
976  *      vm_map_find finds an unallocated region in the target address
977  *      map with the given length.  The search is defined to be
978  *      first-fit from the specified address; the region found is
979  *      returned in the same parameter.
980  *
981  *      If object is non-NULL, ref count must be bumped by caller
982  *      prior to making call to account for the new entry.
983  */
984 int
985 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
986             vm_offset_t *addr,  vm_size_t length,
987             boolean_t find_space,
988             vm_maptype_t maptype,
989             vm_prot_t prot, vm_prot_t max,
990             int cow)
991 {
992         vm_offset_t start;
993         int result;
994         int count;
995
996         start = *addr;
997
998         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
999         vm_map_lock(map);
1000         if (find_space) {
1001                 if (vm_map_findspace(map, start, length, 1, addr)) {
1002                         vm_map_unlock(map);
1003                         vm_map_entry_release(count);
1004                         return (KERN_NO_SPACE);
1005                 }
1006                 start = *addr;
1007         }
1008         result = vm_map_insert(map, &count, object, offset,
1009                                start, start + length,
1010                                maptype,
1011                                prot, max,
1012                                cow);
1013         vm_map_unlock(map);
1014         vm_map_entry_release(count);
1015
1016         return (result);
1017 }
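/*
 * For illustration: a first-fit anonymous mapping request might look
 * like the following, with 'hint_addr' and 'size' as hypothetical
 * inputs; 'addr' is both the search hint and the returned address.
 *
 *      vm_offset_t addr = hint_addr;
 *      int rv;
 *
 *      rv = vm_map_find(map, NULL, 0, &addr, size, TRUE,
 *                       VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, 0);
 *      if (rv != KERN_SUCCESS)
 *              return (rv);
 */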
1018
1019 /*
1020  *      vm_map_simplify_entry:
1021  *
1022  *      Simplify the given map entry by merging with either neighbor.  This
1023  *      routine also has the ability to merge with both neighbors.
1024  *
1025  *      The map must be locked.
1026  *
1027  *      This routine guarantees that the passed entry remains valid (though
1028  *      possibly extended).  When merging, this routine may delete one or
1029  *      both neighbors.  No action is taken on entries which have their
1030  *      in-transition flag set.
1031  */
1032 void
1033 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1034 {
1035         vm_map_entry_t next, prev;
1036         vm_size_t prevsize, esize;
1037
1038         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1039                 ++mycpu->gd_cnt.v_intrans_coll;
1040                 return;
1041         }
1042
1043         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1044                 return;
1045
1046         prev = entry->prev;
1047         if (prev != &map->header) {
1048                 prevsize = prev->end - prev->start;
1049                 if ( (prev->end == entry->start) &&
1050                      (prev->maptype == entry->maptype) &&
1051                      (prev->object.vm_object == entry->object.vm_object) &&
1052                      (!prev->object.vm_object ||
1053                         (prev->offset + prevsize == entry->offset)) &&
1054                      (prev->eflags == entry->eflags) &&
1055                      (prev->protection == entry->protection) &&
1056                      (prev->max_protection == entry->max_protection) &&
1057                      (prev->inheritance == entry->inheritance) &&
1058                      (prev->wired_count == entry->wired_count)) {
1059                         if (map->first_free == prev)
1060                                 map->first_free = entry;
1061                         if (map->hint == prev)
1062                                 map->hint = entry;
1063                         vm_map_entry_unlink(map, prev);
1064                         entry->start = prev->start;
1065                         entry->offset = prev->offset;
1066                         if (prev->object.vm_object)
1067                                 vm_object_deallocate(prev->object.vm_object);
1068                         vm_map_entry_dispose(map, prev, countp);
1069                 }
1070         }
1071
1072         next = entry->next;
1073         if (next != &map->header) {
1074                 esize = entry->end - entry->start;
1075                 if ((entry->end == next->start) &&
1076                     (next->maptype == entry->maptype) &&
1077                     (next->object.vm_object == entry->object.vm_object) &&
1078                      (!entry->object.vm_object ||
1079                         (entry->offset + esize == next->offset)) &&
1080                     (next->eflags == entry->eflags) &&
1081                     (next->protection == entry->protection) &&
1082                     (next->max_protection == entry->max_protection) &&
1083                     (next->inheritance == entry->inheritance) &&
1084                     (next->wired_count == entry->wired_count)) {
1085                         if (map->first_free == next)
1086                                 map->first_free = entry;
1087                         if (map->hint == next)
1088                                 map->hint = entry;
1089                         vm_map_entry_unlink(map, next);
1090                         entry->end = next->end;
1091                         if (next->object.vm_object)
1092                                 vm_object_deallocate(next->object.vm_object);
1093                         vm_map_entry_dispose(map, next, countp);
1094                 }
1095         }
1096 }
1097 /*
1098  *      vm_map_clip_start:      [ internal use only ]
1099  *
1100  *      Asserts that the given entry begins at or after
1101  *      the specified address; if necessary,
1102  *      it splits the entry into two.
1103  */
1104 #define vm_map_clip_start(map, entry, startaddr, countp) \
1105 { \
1106         if (startaddr > entry->start) \
1107                 _vm_map_clip_start(map, entry, startaddr, countp); \
1108 }
1109
1110 /*
1111  *      This routine is called only when it is known that
1112  *      the entry must be split.
1113  */
1114 static void
1115 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start, int *countp)
1116 {
1117         vm_map_entry_t new_entry;
1118
1119         /*
1120          * Split off the front portion -- note that we must insert the new
1121          * entry BEFORE this one, so that this entry has the specified
1122          * starting address.
1123          */
1124
1125         vm_map_simplify_entry(map, entry, countp);
1126
1127         /*
1128          * If there is no object backing this entry, we might as well create
1129          * one now.  If we defer it, an object can get created after the map
1130          * is clipped, and individual objects will be created for the split-up
1131          * map.  This is a bit of a hack, but is also about the best place to
1132          * put this improvement.
1133          */
1134         if (entry->object.vm_object == NULL && !map->system_map) {
1135                 vm_map_entry_allocate_object(entry);
1136         }
1137
1138         new_entry = vm_map_entry_create(map, countp);
1139         *new_entry = *entry;
1140
1141         new_entry->end = start;
1142         entry->offset += (start - entry->start);
1143         entry->start = start;
1144
1145         vm_map_entry_link(map, entry->prev, new_entry);
1146
1147         switch(entry->maptype) {
1148         case VM_MAPTYPE_NORMAL:
1149         case VM_MAPTYPE_VPAGETABLE:
1150                 vm_object_reference(new_entry->object.vm_object);
1151                 break;
1152         default:
1153                 break;
1154         }
1155 }
1156
1157 /*
1158  *      vm_map_clip_end:        [ internal use only ]
1159  *
1160  *      Asserts that the given entry ends at or before
1161  *      the specified address; if necessary,
1162  *      it splits the entry into two.
1163  */
1164
1165 #define vm_map_clip_end(map, entry, endaddr, countp) \
1166 { \
1167         if (endaddr < entry->end) \
1168                 _vm_map_clip_end(map, entry, endaddr, countp); \
1169 }
1170
1171 /*
1172  *      This routine is called only when it is known that
1173  *      the entry must be split.
1174  */
1175 static void
1176 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end, int *countp)
1177 {
1178         vm_map_entry_t new_entry;
1179
1180         /*
1181          * If there is no object backing this entry, we might as well create
1182          * one now.  If we defer it, an object can get created after the map
1183          * is clipped, and individual objects will be created for the split-up
1184          * map.  This is a bit of a hack, but is also about the best place to
1185          * put this improvement.
1186          */
1187
1188         if (entry->object.vm_object == NULL && !map->system_map) {
1189                 vm_map_entry_allocate_object(entry);
1190         }
1191
1192         /*
1193          * Create a new entry and insert it AFTER the specified entry
1194          */
1195
1196         new_entry = vm_map_entry_create(map, countp);
1197         *new_entry = *entry;
1198
1199         new_entry->start = entry->end = end;
1200         new_entry->offset += (end - entry->start);
1201
1202         vm_map_entry_link(map, entry, new_entry);
1203
1204         switch(entry->maptype) {
1205         case VM_MAPTYPE_NORMAL:
1206         case VM_MAPTYPE_VPAGETABLE:
1207                 vm_object_reference(new_entry->object.vm_object);
1208                 break;
1209         default:
1210                 break;
1211         }
1212 }
1213
1214 /*
1215  *      VM_MAP_RANGE_CHECK:     [ internal use only ]
1216  *
1217  *      Asserts that the starting and ending region
1218  *      addresses fall within the valid range of the map.
1219  */
1220 #define VM_MAP_RANGE_CHECK(map, start, end)             \
1221                 {                                       \
1222                 if (start < vm_map_min(map))            \
1223                         start = vm_map_min(map);        \
1224                 if (end > vm_map_max(map))              \
1225                         end = vm_map_max(map);          \
1226                 if (start > end)                        \
1227                         start = end;                    \
1228                 }
1229
1230 /*
1231  *      vm_map_transition_wait: [ kernel use only ]
1232  *
1233  *      Used to block when an in-transition collision occurs.  The map
1234  *      is unlocked for the sleep and relocked before the return.
1235  */
1236 static
1237 void
1238 vm_map_transition_wait(vm_map_t map)
1239 {
1240         vm_map_unlock(map);
1241         tsleep(map, 0, "vment", 0);
1242         vm_map_lock(map);
1243 }
1244
1245 /*
1246  * CLIP_CHECK_BACK
1247  * CLIP_CHECK_FWD
1248  *
1249  *      When we do blocking operations with the map lock held it is
1250  *      possible that a clip might have occurred on our in-transit entry,
1251  *      requiring an adjustment to the entry in our loop.  These macros
1252  *      help the pageable and clip_range code deal with the case.  The
1253  *      conditional costs virtually nothing if no clipping has occurred.
1254  */
1255
1256 #define CLIP_CHECK_BACK(entry, save_start)              \
1257     do {                                                \
1258             while (entry->start != save_start) {        \
1259                     entry = entry->prev;                \
1260                     KASSERT(entry != &map->header, ("bad entry clip")); \
1261             }                                           \
1262     } while(0)
1263
1264 #define CLIP_CHECK_FWD(entry, save_end)                 \
1265     do {                                                \
1266             while (entry->end != save_end) {            \
1267                     entry = entry->next;                \
1268                     KASSERT(entry != &map->header, ("bad entry clip")); \
1269             }                                           \
1270     } while(0)
1271
1272
1273 /*
1274  *      vm_map_clip_range:      [ kernel use only ]
1275  *
1276  *      Clip the specified range and return the base entry.  The
1277  *      range may cover several entries starting at the returned base
1278  *      and the first and last entry in the covering sequence will be
1279  *      properly clipped to the requested start and end address.
1280  *
1281  *      If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1282  *      flag.  
1283  *
1284  *      The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1285  *      covered by the requested range.
1286  *
1287  *      The map must be exclusively locked on entry and will remain locked
1288  *      on return. If no range exists or the range contains holes and you
1289  *      specified that no holes were allowed, NULL will be returned.  This
1290  *      routine may temporarily unlock the map in order to avoid a deadlock when
1291  *      sleeping.
1292  */
1293 static
1294 vm_map_entry_t
1295 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end, 
1296         int *countp, int flags)
1297 {
1298         vm_map_entry_t start_entry;
1299         vm_map_entry_t entry;
1300
1301         /*
1302          * Locate the entry and effect initial clipping.  The in-transition
1303          * case does not occur very often so do not try to optimize it.
1304          */
1305 again:
1306         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1307                 return (NULL);
1308         entry = start_entry;
1309         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1310                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1311                 ++mycpu->gd_cnt.v_intrans_coll;
1312                 ++mycpu->gd_cnt.v_intrans_wait;
1313                 vm_map_transition_wait(map);
1314                 /*
1315                  * entry and/or start_entry may have been clipped while
1316                  * we slept, or may have gone away entirely.  We have
1317                  * to restart from the lookup.
1318                  */
1319                 goto again;
1320         }
1321         /*
1322          * Since we hold an exclusive map lock we do not have to restart
1323          * after clipping, even though clipping may block in zalloc.
1324          */
1325         vm_map_clip_start(map, entry, start, countp);
1326         vm_map_clip_end(map, entry, end, countp);
1327         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1328
1329         /*
1330          * Scan entries covered by the range.  When working on the next
1331          * entry a restart need only re-loop on the current entry which
1332          * we have already locked, since 'next' may have changed.  Also,
1333          * even though entry is safe, it may have been clipped so we
1334          * have to iterate forwards through the clip after sleeping.
1335          */
1336         while (entry->next != &map->header && entry->next->start < end) {
1337                 vm_map_entry_t next = entry->next;
1338
1339                 if (flags & MAP_CLIP_NO_HOLES) {
1340                         if (next->start > entry->end) {
1341                                 vm_map_unclip_range(map, start_entry,
1342                                         start, entry->end, countp, flags);
1343                                 return(NULL);
1344                         }
1345                 }
1346
1347                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1348                         vm_offset_t save_end = entry->end;
1349                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1350                         ++mycpu->gd_cnt.v_intrans_coll;
1351                         ++mycpu->gd_cnt.v_intrans_wait;
1352                         vm_map_transition_wait(map);
1353
1354                         /*
1355                          * clips might have occurred while we blocked.
1356                          */
1357                         CLIP_CHECK_FWD(entry, save_end);
1358                         CLIP_CHECK_BACK(start_entry, start);
1359                         continue;
1360                 }
1361                 /*
1362                  * No restart necessary even though clip_end may block, we
1363                  * are holding the map lock.
1364                  */
1365                 vm_map_clip_end(map, next, end, countp);
1366                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1367                 entry = next;
1368         }
1369         if (flags & MAP_CLIP_NO_HOLES) {
1370                 if (entry->end != end) {
1371                         vm_map_unclip_range(map, start_entry,
1372                                 start, entry->end, countp, flags);
1373                         return(NULL);
1374                 }
1375         }
1376         return(start_entry);
1377 }
1378
1379 /*
1380  *      vm_map_unclip_range:    [ kernel use only ]
1381  *
1382  *      Undo the effect of vm_map_clip_range().  You should pass the same
1383  *      flags and the same range that you passed to vm_map_clip_range().
1384  *      This code will clear the in-transition flag on the entries and
1385  *      wake up anyone waiting.  This code will also simplify the sequence 
1386  *      and attempt to merge it with entries before and after the sequence.
1387  *
1388  *      The map must be locked on entry and will remain locked on return.
1389  *
1390  *      Note that you should also pass the start_entry returned by 
1391  *      vm_map_clip_range().  However, if you block between the two calls
1392  *      with the map unlocked please be aware that the start_entry may
1393  *      have been clipped and you may need to scan it backwards to find
1394  *      the entry corresponding with the original start address.  You are
1395  *      responsible for this; vm_map_unclip_range() expects the correct
1396  *      start_entry to be passed to it and will KASSERT otherwise.
1397  */
1398 static
1399 void
1400 vm_map_unclip_range(
1401         vm_map_t map,
1402         vm_map_entry_t start_entry,
1403         vm_offset_t start,
1404         vm_offset_t end,
1405         int *countp,
1406         int flags)
1407 {
1408         vm_map_entry_t entry;
1409
1410         entry = start_entry;
1411
1412         KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1413         while (entry != &map->header && entry->start < end) {
1414                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("in-transition flag not set during unclip on: %p", entry));
1415                 KASSERT(entry->end <= end, ("unclip_range: tail wasn't clipped"));
1416                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1417                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1418                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1419                         wakeup(map);
1420                 }
1421                 entry = entry->next;
1422         }
1423
1424         /*
1425          * Simplification does not block so there is no restart case.
1426          */
1427         entry = start_entry;
1428         while (entry != &map->header && entry->start < end) {
1429                 vm_map_simplify_entry(map, entry, countp);
1430                 entry = entry->next;
1431         }
1432 }
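/*
 * For illustration: vm_map_clip_range() and vm_map_unclip_range() are
 * used as a bracketing pair over the same range and flags, with the
 * map exclusively locked across both calls:
 *
 *      base = vm_map_clip_range(map, start, end, &count,
 *                               MAP_CLIP_NO_HOLES);
 *      if (base != NULL) {
 *              ...operate on the in-transition entries...
 *              vm_map_unclip_range(map, base, start, end, &count,
 *                                  MAP_CLIP_NO_HOLES);
 *      }
 *
 * As the comment above warns, a caller that blocks with the map
 * unlocked in between must first re-locate the correct base entry.
 */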
1433
1434 /*
1435  *      vm_map_submap:          [ kernel use only ]
1436  *
1437  *      Mark the given range as handled by a subordinate map.
1438  *
1439  *      This range must have been created with vm_map_find,
1440  *      and no other operations may have been performed on this
1441  *      range prior to calling vm_map_submap.
1442  *
1443  *      Only a limited number of operations can be performed
1444  *      within this range after calling vm_map_submap:
1445  *              vm_fault
1446  *      [Don't try vm_map_copy!]
1447  *
1448  *      To remove a submapping, one must first remove the
1449  *      range from the superior map, and then destroy the
1450  *      submap (if desired).  [Better yet, don't try it.]
1451  */
1452 int
1453 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
1454 {
1455         vm_map_entry_t entry;
1456         int result = KERN_INVALID_ARGUMENT;
1457         int count;
1458
1459         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1460         vm_map_lock(map);
1461
1462         VM_MAP_RANGE_CHECK(map, start, end);
1463
1464         if (vm_map_lookup_entry(map, start, &entry)) {
1465                 vm_map_clip_start(map, entry, start, &count);
1466         } else {
1467                 entry = entry->next;
1468         }
1469
1470         vm_map_clip_end(map, entry, end, &count);
1471
1472         if ((entry->start == start) && (entry->end == end) &&
1473             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1474             (entry->object.vm_object == NULL)) {
1475                 entry->object.sub_map = submap;
1476                 entry->maptype = VM_MAPTYPE_SUBMAP;
1477                 result = KERN_SUCCESS;
1478         }
1479         vm_map_unlock(map);
1480         vm_map_entry_release(count);
1481
1482         return (result);
1483 }
1484
1485 /*
1486  * vm_map_protect:
1487  *
1488  * Sets the protection of the specified address region in the target map. 
1489  * If "set_max" is specified, the maximum protection is to be set;
1490  * otherwise, only the current protection is affected.
1491  *
1492  * The protection is not applicable to submaps, but is applicable to normal
1493  * maps and maps governed by virtual page tables.  For example, when operating
1494  * on a virtual page table the protection controls how COW occurs on the
1495  * backing object, whereas the virtual page table itself is an abstraction
1496  * maintained on behalf of userland.
1497  */
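/*
 * Editor's illustrative sketch: revoking write access on a page-sized
 * range ("map" and "va" are hypothetical).  Passing FALSE changes only
 * the current protection; TRUE would lower max_protection as well.
 *
 *	rv = vm_map_protect(map, va, va + PAGE_SIZE, VM_PROT_READ, FALSE);
 *	if (rv == KERN_PROTECTION_FAILURE)
 *		... the request exceeded the entry's max_protection ...
 */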
1498 int
1499 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1500                vm_prot_t new_prot, boolean_t set_max)
1501 {
1502         vm_map_entry_t current;
1503         vm_map_entry_t entry;
1504         int count;
1505
1506         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1507         vm_map_lock(map);
1508
1509         VM_MAP_RANGE_CHECK(map, start, end);
1510
1511         if (vm_map_lookup_entry(map, start, &entry)) {
1512                 vm_map_clip_start(map, entry, start, &count);
1513         } else {
1514                 entry = entry->next;
1515         }
1516
1517         /*
1518          * Make a first pass to check for protection violations.
1519          */
1520         current = entry;
1521         while ((current != &map->header) && (current->start < end)) {
1522                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
1523                         vm_map_unlock(map);
1524                         vm_map_entry_release(count);
1525                         return (KERN_INVALID_ARGUMENT);
1526                 }
1527                 if ((new_prot & current->max_protection) != new_prot) {
1528                         vm_map_unlock(map);
1529                         vm_map_entry_release(count);
1530                         return (KERN_PROTECTION_FAILURE);
1531                 }
1532                 current = current->next;
1533         }
1534
1535         /*
1536          * Go back and fix up protections. [Note that clipping is not
1537          * necessary the second time.]
1538          */
1539         current = entry;
1540
1541         while ((current != &map->header) && (current->start < end)) {
1542                 vm_prot_t old_prot;
1543
1544                 vm_map_clip_end(map, current, end, &count);
1545
1546                 old_prot = current->protection;
1547                 if (set_max) {
1548                         current->protection =
1549                             (current->max_protection = new_prot) &
1550                             old_prot;
1551                 } else {
1552                         current->protection = new_prot;
1553                 }
1554
1555                 /*
1556                  * Update physical map if necessary. Worry about copy-on-write
1557                  * here -- CHECK THIS XXX
1558                  */
1559
1560                 if (current->protection != old_prot) {
1561 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1562                                                         VM_PROT_ALL)
1563
1564                         pmap_protect(map->pmap, current->start,
1565                             current->end,
1566                             current->protection & MASK(current));
1567 #undef  MASK
1568                 }
1569
1570                 vm_map_simplify_entry(map, current, &count);
1571
1572                 current = current->next;
1573         }
1574
1575         vm_map_unlock(map);
1576         vm_map_entry_release(count);
1577         return (KERN_SUCCESS);
1578 }
1579
1580 /*
1581  *      vm_map_madvise:
1582  *
1583  *      This routine traverses a process's map handling the madvise
1584  *      system call.  Advisories are classified as either those affecting
1585  *      the vm_map_entry structure, or those affecting the underlying
1586  *      objects.
1587  *
1588  *      The <value> argument is used for extended madvise calls.
1589  */
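/*
 * Editor's illustrative sketch: two hypothetical calls.  The first asks
 * for read-ahead on a range; the second, used by virtual kernel
 * support, installs the page directory page for a virtual page table
 * mapping via the extended <value> argument ("pdir_off" is a
 * placeholder).
 *
 *	vm_map_madvise(map, start, end, MADV_WILLNEED, 0);
 *	vm_map_madvise(map, start, end, MADV_SETMAP, pdir_off);
 */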
1590 int
1591 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
1592                int behav, off_t value)
1593 {
1594         vm_map_entry_t current, entry;
1595         int modify_map = 0;
1596         int error = 0;
1597         int count;
1598
1599         /*
1600          * Some madvise calls directly modify the vm_map_entry, in which case
1601          * we need to use an exclusive lock on the map and we need to perform 
1602          * various clipping operations.  Otherwise we only need a read-lock
1603          * on the map.
1604          */
1605
1606         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1607
1608         switch(behav) {
1609         case MADV_NORMAL:
1610         case MADV_SEQUENTIAL:
1611         case MADV_RANDOM:
1612         case MADV_NOSYNC:
1613         case MADV_AUTOSYNC:
1614         case MADV_NOCORE:
1615         case MADV_CORE:
1616         case MADV_SETMAP:
1617         case MADV_INVAL:
1618                 modify_map = 1;
1619                 vm_map_lock(map);
1620                 break;
1621         case MADV_WILLNEED:
1622         case MADV_DONTNEED:
1623         case MADV_FREE:
1624                 vm_map_lock_read(map);
1625                 break;
1626         default:
1627                 vm_map_entry_release(count);
1628                 return (EINVAL);
1629         }
1630
1631         /*
1632          * Locate starting entry and clip if necessary.
1633          */
1634
1635         VM_MAP_RANGE_CHECK(map, start, end);
1636
1637         if (vm_map_lookup_entry(map, start, &entry)) {
1638                 if (modify_map)
1639                         vm_map_clip_start(map, entry, start, &count);
1640         } else {
1641                 entry = entry->next;
1642         }
1643
1644         if (modify_map) {
1645                 /*
1646                  * madvise behaviors that are implemented in the vm_map_entry.
1647                  *
1648                  * We clip the vm_map_entry so that behavioral changes are
1649                  * limited to the specified address range.
1650                  */
1651                 for (current = entry;
1652                      (current != &map->header) && (current->start < end);
1653                      current = current->next
1654                 ) {
1655                         if (current->maptype == VM_MAPTYPE_SUBMAP)
1656                                 continue;
1657
1658                         vm_map_clip_end(map, current, end, &count);
1659
1660                         switch (behav) {
1661                         case MADV_NORMAL:
1662                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
1663                                 break;
1664                         case MADV_SEQUENTIAL:
1665                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
1666                                 break;
1667                         case MADV_RANDOM:
1668                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
1669                                 break;
1670                         case MADV_NOSYNC:
1671                                 current->eflags |= MAP_ENTRY_NOSYNC;
1672                                 break;
1673                         case MADV_AUTOSYNC:
1674                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
1675                                 break;
1676                         case MADV_NOCORE:
1677                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
1678                                 break;
1679                         case MADV_CORE:
1680                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
1681                                 break;
1682                         case MADV_INVAL:
1683                                 /*
1684                                  * Invalidate the related pmap entries, used
1685                                  * to flush portions of the real kernel's
1686                                  * pmap when the caller has removed or
1687                                  * modified existing mappings in a virtual
1688                                  * page table.
1689                                  */
1690                                 pmap_remove(map->pmap,
1691                                             current->start, current->end);
1692                                 break;
1693                         case MADV_SETMAP:
1694                                 /*
1695                                  * Set the page directory page for a map
1696                                  * governed by a virtual page table.  Mark
1697                                  * the entry as being governed by a virtual
1698                                  * page table if it is not.
1699                                  *
1700                                  * XXX the page directory page is stored
1701                                  * in the avail_ssize field of the map_entry.
1702                                  *
1703                                  * XXX the map simplification code does not
1704                                  * compare this field so weird things may
1705                                  * happen if you do not apply this function
1706                                  * to the entire mapping governed by the
1707                                  * virtual page table.
1708                                  */
1709                                 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
1710                                         error = EINVAL;
1711                                         break;
1712                                 }
1713                                 current->aux.master_pde = value;
1714                                 pmap_remove(map->pmap,
1715                                             current->start, current->end);
1716                                 break;
1717                         default:
1718                                 error = EINVAL;
1719                                 break;
1720                         }
1721                         vm_map_simplify_entry(map, current, &count);
1722                 }
1723                 vm_map_unlock(map);
1724         } else {
1725                 vm_pindex_t pindex;
1726                 int count;
1727
1728                 /*
1729                  * madvise behaviors that are implemented in the underlying
1730                  * vm_object.
1731                  *
1732                  * Since we don't clip the vm_map_entry, we have to clip
1733                  * the vm_object pindex and count.
1734                  *
1735                  * NOTE!  We currently do not support these functions on
1736                  * virtual page tables.
1737                  */
1738                 for (current = entry;
1739                      (current != &map->header) && (current->start < end);
1740                      current = current->next
1741                 ) {
1742                         vm_offset_t useStart;
1743
1744                         if (current->maptype != VM_MAPTYPE_NORMAL)
1745                                 continue;
1746
1747                         pindex = OFF_TO_IDX(current->offset);
1748                         count = atop(current->end - current->start);
1749                         useStart = current->start;
1750
1751                         if (current->start < start) {
1752                                 pindex += atop(start - current->start);
1753                                 count -= atop(start - current->start);
1754                                 useStart = start;
1755                         }
1756                         if (current->end > end)
1757                                 count -= atop(current->end - end);
1758
1759                         if (count <= 0)
1760                                 continue;
1761
1762                         vm_object_madvise(current->object.vm_object,
1763                                           pindex, count, behav);
1764
1765                         /*
1766                          * Try to populate the page table.  Mappings governed
1767                          * by virtual page tables cannot be pre-populated
1768                          * without a lot of work so don't try.
1769                          */
1770                         if (behav == MADV_WILLNEED &&
1771                             current->maptype != VM_MAPTYPE_VPAGETABLE) {
1772                                 pmap_object_init_pt(
1773                                     map->pmap, 
1774                                     useStart,
1775                                     current->protection,
1776                                     current->object.vm_object,
1777                                     pindex, 
1778                                     (count << PAGE_SHIFT),
1779                                     MAP_PREFAULT_MADVISE
1780                                 );
1781                         }
1782                 }
1783                 vm_map_unlock_read(map);
1784         }
1785         vm_map_entry_release(count);
1786         return(error);
1787 }       
1788
1789
1790 /*
1791  *      vm_map_inherit:
1792  *
1793  *      Sets the inheritance of the specified address
1794  *      range in the target map.  Inheritance
1795  *      affects how the map will be shared with
1796  *      child maps at the time of vm_map_fork.
1797  */
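/*
 * Editor's illustrative sketch: asking that a (hypothetical) shared
 * window survive fork() by being shared with the child rather than
 * copied.
 *
 *	vm_map_inherit(map, start, end, VM_INHERIT_SHARE);
 */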
1798 int
1799 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
1800                vm_inherit_t new_inheritance)
1801 {
1802         vm_map_entry_t entry;
1803         vm_map_entry_t temp_entry;
1804         int count;
1805
1806         switch (new_inheritance) {
1807         case VM_INHERIT_NONE:
1808         case VM_INHERIT_COPY:
1809         case VM_INHERIT_SHARE:
1810                 break;
1811         default:
1812                 return (KERN_INVALID_ARGUMENT);
1813         }
1814
1815         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1816         vm_map_lock(map);
1817
1818         VM_MAP_RANGE_CHECK(map, start, end);
1819
1820         if (vm_map_lookup_entry(map, start, &temp_entry)) {
1821                 entry = temp_entry;
1822                 vm_map_clip_start(map, entry, start, &count);
1823         } else
1824                 entry = temp_entry->next;
1825
1826         while ((entry != &map->header) && (entry->start < end)) {
1827                 vm_map_clip_end(map, entry, end, &count);
1828
1829                 entry->inheritance = new_inheritance;
1830
1831                 vm_map_simplify_entry(map, entry, &count);
1832
1833                 entry = entry->next;
1834         }
1835         vm_map_unlock(map);
1836         vm_map_entry_release(count);
1837         return (KERN_SUCCESS);
1838 }
1839
1840 /*
1841  * Implement the semantics of mlock / munlock (wire or unwire user pages)
1842  */
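/*
 * Editor's illustrative sketch: how mlock()/munlock() style callers
 * would presumably drive this routine.  Passing FALSE wires the user
 * pages in the range, TRUE unwires them.
 *
 *	wire (mlock-style):
 *		rv = vm_map_unwire(&p->p_vmspace->vm_map, addr,
 *				   addr + size, FALSE);
 *	unwire (munlock-style):
 *		rv = vm_map_unwire(&p->p_vmspace->vm_map, addr,
 *				   addr + size, TRUE);
 */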
1843 int
1844 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
1845     boolean_t new_pageable)
1846 {
1847         vm_map_entry_t entry;
1848         vm_map_entry_t start_entry;
1849         vm_offset_t end;
1850         int rv = KERN_SUCCESS;
1851         int count;
1852
1853         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1854         vm_map_lock(map);
1855         VM_MAP_RANGE_CHECK(map, start, real_end);
1856         end = real_end;
1857
1858         start_entry = vm_map_clip_range(map, start, end, &count, MAP_CLIP_NO_HOLES);
1859         if (start_entry == NULL) {
1860                 vm_map_unlock(map);
1861                 vm_map_entry_release(count);
1862                 return (KERN_INVALID_ADDRESS);
1863         }
1864
1865         if (new_pageable == 0) {
1866                 entry = start_entry;
1867                 while ((entry != &map->header) && (entry->start < end)) {
1868                         vm_offset_t save_start;
1869                         vm_offset_t save_end;
1870
1871                         /*
1872                          * Already user wired or hard wired (trivial cases)
1873                          */
1874                         if (entry->eflags & MAP_ENTRY_USER_WIRED) {
1875                                 entry = entry->next;
1876                                 continue;
1877                         }
1878                         if (entry->wired_count != 0) {
1879                                 entry->wired_count++;
1880                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
1881                                 entry = entry->next;
1882                                 continue;
1883                         }
1884
1885                         /*
1886                          * A new wiring requires instantiation of appropriate
1887                          * management structures and the faulting in of the
1888                          * page.
1889                          */
1890                         if (entry->maptype != VM_MAPTYPE_SUBMAP) {
1891                                 int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
1892                                 if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) {
1893                                         vm_map_entry_shadow(entry);
1894                                 } else if (entry->object.vm_object == NULL &&
1895                                            !map->system_map) {
1896                                         vm_map_entry_allocate_object(entry);
1897                                 }
1898                         }
1899                         entry->wired_count++;
1900                         entry->eflags |= MAP_ENTRY_USER_WIRED;
1901
1902                         /*
1903                          * Now fault in the area.  Note that vm_fault_wire()
1904                          * may release the map lock temporarily, it will be
1905                          * relocked on return.  The in-transition
1906                          * flag protects the entries. 
1907                          */
1908                         save_start = entry->start;
1909                         save_end = entry->end;
1910                         rv = vm_fault_wire(map, entry, TRUE);
1911                         if (rv) {
1912                                 CLIP_CHECK_BACK(entry, save_start);
1913                                 for (;;) {
1914                                         KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
1915                                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
1916                                         entry->wired_count = 0;
1917                                         if (entry->end == save_end)
1918                                                 break;
1919                                         entry = entry->next;
1920                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
1921                                 }
1922                                 end = save_start;       /* unwire the rest */
1923                                 break;
1924                         }
1925                         /*
1926                          * note that even though the entry might have been
1927                          * clipped, the USER_WIRED flag we set prevents
1928                          * duplication so we do not have to do a 
1929                          * clip check.
1930                          */
1931                         entry = entry->next;
1932                 }
1933
1934                 /*
1935                  * If we failed fall through to the unwiring section to
1936                  * unwire what we had wired so far.  'end' has already
1937                  * been adjusted.
1938                  */
1939                 if (rv)
1940                         new_pageable = 1;
1941
1942                 /*
1943                  * start_entry might have been clipped if we unlocked the
1944                  * map and blocked.  No matter how clipped it has gotten
1945                  * there should be a fragment that is on our start boundary.
1946                  */
1947                 CLIP_CHECK_BACK(start_entry, start);
1948         }
1949
1950         /*
1951          * Deal with the unwiring case.
1952          */
1953         if (new_pageable) {
1954                 /*
1955                  * This is the unwiring case.  We must first ensure that the
1956                  * range to be unwired is really wired down.  We know there
1957                  * are no holes.
1958                  */
1959                 entry = start_entry;
1960                 while ((entry != &map->header) && (entry->start < end)) {
1961                         if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
1962                                 rv = KERN_INVALID_ARGUMENT;
1963                                 goto done;
1964                         }
1965                         KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
1966                         entry = entry->next;
1967                 }
1968
1969                 /*
1970                  * Now decrement the wiring count for each region. If a region
1971                  * becomes completely unwired, unwire its physical pages and
1972                  * mappings.
1973                  */
1974                 /*
1975                  * Note: an earlier version of this code reused the "entry"
1976                  * loop variable left over from the wired-check loop above
1977                  * without first resetting it to start_entry.  As a result
1978                  * the second loop was never entered and the pages backing
1979                  * the entries were never unwired, leaking wired pages.
1980                  * The explicit reset of "entry" to start_entry below
1981                  * closes that hole; keep it if this code is ever
1982                  * restructured.
1983                  */
1984                 entry = start_entry;
1985                 while ((entry != &map->header) && (entry->start < end)) {
1986                         KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
1987                                 ("expected USER_WIRED on entry %p", entry));
1988                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
1989                         entry->wired_count--;
1990                         if (entry->wired_count == 0)
1991                                 vm_fault_unwire(map, entry);
1992                         entry = entry->next;
1993                 }
1994         }
1995 done:
1996         vm_map_unclip_range(map, start_entry, start, real_end, &count,
1997                 MAP_CLIP_NO_HOLES);
1998         map->timestamp++;
1999         vm_map_unlock(map);
2000         vm_map_entry_release(count);
2001         return (rv);
2002 }
2003
2004 /*
2005  *      vm_map_wire:
2006  *
2007  *      Sets the pageability of the specified address
2008  *      range in the target map.  Regions specified
2009  *      as not pageable require locked-down physical
2010  *      memory and physical page maps.
2011  *
2012  *      The map must not be locked, but a reference
2013  *      must remain to the map throughout the call.
2014  *
2015  *      This function may be called via the zalloc path and must properly
2016  *      reserve map entries for kernel_map.
2017  */
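/*
 * Editor's illustrative sketch: wiring and later unwiring a kernel
 * range ("map", "addr" and "size" are hypothetical; "map" would
 * typically be the kernel map).  KM_KRESERVE selects the kernel
 * reserve pool when called from the zalloc path; KM_PAGEABLE requests
 * unwiring instead of wiring.
 *
 *	rv = vm_map_wire(map, addr, addr + size, KM_KRESERVE);
 *	...
 *	rv = vm_map_wire(map, addr, addr + size,
 *			 KM_KRESERVE | KM_PAGEABLE);
 */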
2018 int
2019 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2020 {
2021         vm_map_entry_t entry;
2022         vm_map_entry_t start_entry;
2023         vm_offset_t end;
2024         int rv = KERN_SUCCESS;
2025         int count;
2026
2027         if (kmflags & KM_KRESERVE)
2028                 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2029         else
2030                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2031         vm_map_lock(map);
2032         VM_MAP_RANGE_CHECK(map, start, real_end);
2033         end = real_end;
2034
2035         start_entry = vm_map_clip_range(map, start, end, &count, MAP_CLIP_NO_HOLES);
2036         if (start_entry == NULL) {
2037                 vm_map_unlock(map);
2038                 rv = KERN_INVALID_ADDRESS;
2039                 goto failure;
2040         }
2041         if ((kmflags & KM_PAGEABLE) == 0) {
2042                 /*
2043                  * Wiring.  
2044                  *
2045                  * 1.  Holding the write lock, we create any shadow or zero-fill
2046                  * objects that need to be created. Then we clip each map
2047                  * entry to the region to be wired and increment its wiring
2048                  * count.  We create objects before clipping the map entries
2049                  * to avoid object proliferation.
2050                  *
2051                  * 2.  We downgrade to a read lock, and call vm_fault_wire to
2052                  * fault in the pages for any newly wired area (wired_count is
2053                  * 1).
2054                  *
2055                  * Downgrading to a read lock for vm_fault_wire avoids a 
2056                  * possible deadlock with another process that may have faulted
2057                  * on one of the pages to be wired (it would mark the page busy,
2058                  * blocking us, then in turn block on the map lock that we
2059                  * hold).  Because of problems in the recursive lock package,
2060                  * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2061                  * any actions that require the write lock must be done
2062                  * beforehand.  Because we keep the read lock on the map, the
2063                  * copy-on-write status of the entries we modify here cannot
2064                  * change.
2065                  */
2066
2067                 entry = start_entry;
2068                 while ((entry != &map->header) && (entry->start < end)) {
2069                         /*
2070                          * Trivial case if the entry is already wired
2071                          */
2072                         if (entry->wired_count) {
2073                                 entry->wired_count++;
2074                                 entry = entry->next;
2075                                 continue;
2076                         }
2077
2078                         /*
2079                          * The entry is being newly wired, we have to setup
2080                          * appropriate management structures.  A shadow 
2081                          * object is required for a copy-on-write region,
2082                          * or a normal object for a zero-fill region.  We
2083                          * do not have to do this for entries that point to sub
2084                          * maps because we won't hold the lock on the sub map.
2085                          */
2086                         if (entry->maptype != VM_MAPTYPE_SUBMAP) {
2087                                 int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
2088                                 if (copyflag &&
2089                                     ((entry->protection & VM_PROT_WRITE) != 0)) {
2090                                         vm_map_entry_shadow(entry);
2091                                 } else if (entry->object.vm_object == NULL &&
2092                                            !map->system_map) {
2093                                         vm_map_entry_allocate_object(entry);
2094                                 }
2095                         }
2096
2097                         entry->wired_count++;
2098                         entry = entry->next;
2099                 }
2100
2101                 /*
2102                  * Pass 2.
2103                  */
2104
2105                 /*
2106                  * HACK HACK HACK HACK
2107                  *
2108                  * Unlock the map to avoid deadlocks.  The in-transit flag
2109                  * protects us from most changes but note that
2110                  * clipping may still occur.  To prevent clipping from
2111                  * occurring after the unlock, except for when we are
2112                  * blocking in vm_fault_wire, we must run in a critical
2113                  * section, otherwise our accesses to entry->start and 
2114                  * entry->end could be corrupted.  We have to enter the
2115                  * critical section prior to unlocking so start_entry does
2116                  * not change out from under us at the very beginning of the
2117                  * loop.
2118                  *
2119                  * HACK HACK HACK HACK
2120                  */
2121
2122                 crit_enter();
2123
2124                 entry = start_entry;
2125                 while (entry != &map->header && entry->start < end) {
2126                         /*
2127                          * If vm_fault_wire fails for any page we need to undo
2128                          * what has been done.  We decrement the wiring count
2129                          * for those pages which have not yet been wired (now)
2130                          * and unwire those that have (later).
2131                          */
2132                         vm_offset_t save_start = entry->start;
2133                         vm_offset_t save_end = entry->end;
2134
2135                         if (entry->wired_count == 1)
2136                                 rv = vm_fault_wire(map, entry, FALSE);
2137                         if (rv) {
2138                                 CLIP_CHECK_BACK(entry, save_start);
2139                                 for (;;) {
2140                                         KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
2141                                         entry->wired_count = 0;
2142                                         if (entry->end == save_end)
2143                                                 break;
2144                                         entry = entry->next;
2145                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2146                                 }
2147                                 end = save_start;
2148                                 break;
2149                         }
2150                         CLIP_CHECK_FWD(entry, save_end);
2151                         entry = entry->next;
2152                 }
2153                 crit_exit();
2154
2155                 /*
2156                  * If a failure occurred undo everything by falling through
2157                  * to the unwiring code.  'end' has already been adjusted
2158                  * appropriately.
2159                  */
2160                 if (rv)
2161                         kmflags |= KM_PAGEABLE;
2162
2163                 /*
2164                  * start_entry is still IN_TRANSITION but may have been 
2165                  * clipped since vm_fault_wire() unlocks and relocks the
2166                  * map.  No matter how clipped it has gotten there should
2167                  * be a fragment that is on our start boundary.
2168                  */
2169                 CLIP_CHECK_BACK(start_entry, start);
2170         }
2171
2172         if (kmflags & KM_PAGEABLE) {
2173                 /*
2174                  * This is the unwiring case.  We must first ensure that the
2175                  * range to be unwired is really wired down.  We know there
2176                  * are no holes.
2177                  */
2178                 entry = start_entry;
2179                 while ((entry != &map->header) && (entry->start < end)) {
2180                         if (entry->wired_count == 0) {
2181                                 rv = KERN_INVALID_ARGUMENT;
2182                                 goto done;
2183                         }
2184                         entry = entry->next;
2185                 }
2186
2187                 /*
2188                  * Now decrement the wiring count for each region. If a region
2189                  * becomes completely unwired, unwire its physical pages and
2190                  * mappings.
2191                  */
2192                 entry = start_entry;
2193                 while ((entry != &map->header) && (entry->start < end)) {
2194                         entry->wired_count--;
2195                         if (entry->wired_count == 0)
2196                                 vm_fault_unwire(map, entry);
2197                         entry = entry->next;
2198                 }
2199         }
2200 done:
2201         vm_map_unclip_range(map, start_entry, start, real_end, &count,
2202                 MAP_CLIP_NO_HOLES);
2203         map->timestamp++;
2204         vm_map_unlock(map);
2205 failure:
2206         if (kmflags & KM_KRESERVE)
2207                 vm_map_entry_krelease(count);
2208         else
2209                 vm_map_entry_release(count);
2210         return (rv);
2211 }
2212
2213 /*
2214  * vm_map_set_wired_quick()
2215  *
2216  *      Mark a newly allocated address range as wired but do not fault in
2217  *      the pages.  The caller is expected to load the pages into the object.
2218  *
2219  *      The map must be locked on entry and will remain locked on return.
2220  */
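/*
 * Editor's illustrative sketch: a caller that already holds the map
 * lock and an entry reservation marks a fresh range wired before
 * loading pages into the backing object ("addr" and "size" are
 * hypothetical).
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	... allocate the range ...
 *	vm_map_set_wired_quick(map, addr, size, &count);
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */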
2221 void
2222 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size, int *countp)
2223 {
2224         vm_map_entry_t scan;
2225         vm_map_entry_t entry;
2226
2227         entry = vm_map_clip_range(map, addr, addr + size, countp, MAP_CLIP_NO_HOLES);
2228         for (scan = entry; scan != &map->header && scan->start < addr + size; scan = scan->next) {
2229                 KKASSERT(scan->wired_count == 0);
2230                 scan->wired_count = 1;
2231         }
2232         vm_map_unclip_range(map, entry, addr, addr + size, countp, MAP_CLIP_NO_HOLES);
2233 }
2234
2235 /*
2236  * vm_map_clean
2237  *
2238  * Push any dirty cached pages in the address range to their pager.
2239  * If syncio is TRUE, dirty pages are written synchronously.
2240  * If invalidate is TRUE, any cached pages are freed as well.
2241  *
2242  * Returns an error if any part of the specified range is not mapped.
2243  */
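/*
 * Editor's illustrative sketch: msync-style usage.  The first call
 * writes dirty pages synchronously without discarding them; the second
 * also invalidates the cached pages.
 *
 *	vm_map_clean(map, addr, addr + size, TRUE, FALSE);
 *	vm_map_clean(map, addr, addr + size, TRUE, TRUE);
 */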
2244 int
2245 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio,
2246     boolean_t invalidate)
2247 {
2248         vm_map_entry_t current;
2249         vm_map_entry_t entry;
2250         vm_size_t size;
2251         vm_object_t object;
2252         vm_ooffset_t offset;
2253
2254         vm_map_lock_read(map);
2255         VM_MAP_RANGE_CHECK(map, start, end);
2256         if (!vm_map_lookup_entry(map, start, &entry)) {
2257                 vm_map_unlock_read(map);
2258                 return (KERN_INVALID_ADDRESS);
2259         }
2260         /*
2261          * Make a first pass to check for holes.
2262          */
2263         for (current = entry; current->start < end; current = current->next) {
2264                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2265                         vm_map_unlock_read(map);
2266                         return (KERN_INVALID_ARGUMENT);
2267                 }
2268                 if (end > current->end &&
2269                     (current->next == &map->header ||
2270                         current->end != current->next->start)) {
2271                         vm_map_unlock_read(map);
2272                         return (KERN_INVALID_ADDRESS);
2273                 }
2274         }
2275
2276         if (invalidate)
2277                 pmap_remove(vm_map_pmap(map), start, end);
2278         /*
2279          * Make a second pass, cleaning/uncaching pages from the indicated
2280          * objects as we go.
2281          */
2282         for (current = entry; current->start < end; current = current->next) {
2283                 offset = current->offset + (start - current->start);
2284                 size = (end <= current->end ? end : current->end) - start;
2285                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2286                         vm_map_t smap;
2287                         vm_map_entry_t tentry;
2288                         vm_size_t tsize;
2289
2290                         smap = current->object.sub_map;
2291                         vm_map_lock_read(smap);
2292                         vm_map_lookup_entry(smap, offset, &tentry);
2293                         tsize = tentry->end - offset;
2294                         if (tsize < size)
2295                                 size = tsize;
2296                         object = tentry->object.vm_object;
2297                         offset = tentry->offset + (offset - tentry->start);
2298                         vm_map_unlock_read(smap);
2299                 } else {
2300                         object = current->object.vm_object;
2301                 }
2302                 /*
2303                  * Note that there is absolutely no sense in writing out
2304                  * anonymous objects, so we track down the vnode object
2305                  * to write out.
2306                  * We invalidate (remove) all pages from the address space
2307                  * anyway, for semantic correctness.
2308                  *
2309                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
2310                  * may start out with a NULL object.
2311                  */
2312                 while (object && object->backing_object) {
2313                         offset += object->backing_object_offset;
2314                         object = object->backing_object;
2315                         if (object->size < OFF_TO_IDX( offset + size))
2316                                 size = IDX_TO_OFF(object->size) - offset;
2317                 }
2318                 if (object && (object->type == OBJT_VNODE) && 
2319                     (current->protection & VM_PROT_WRITE)) {
2320                         /*
2321                          * Flush pages if writing is allowed, invalidate them
2322                          * if invalidation requested.  Pages undergoing I/O
2323                          * will be ignored by vm_object_page_remove().
2324                          *
2325                          * We cannot lock the vnode and then wait for paging
2326                          * to complete without deadlocking against vm_fault.
2327                          * Instead we simply call vm_object_page_remove() and
2328                          * allow it to block internally on a page-by-page 
2329                          * basis when it encounters pages undergoing async 
2330                          * I/O.
2331                          */
2332                         int flags;
2333
2334                         vm_object_reference(object);
2335                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
2336                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
2337                         flags |= invalidate ? OBJPC_INVAL : 0;
2338
2339                         /*
2340                          * When operating on a virtual page table just
2341                          * flush the whole object.  XXX we probably ought
2342                          * to 
2343                          */
2344                         switch(current->maptype) {
2345                         case VM_MAPTYPE_NORMAL:
2346                                 vm_object_page_clean(object,
2347                                     OFF_TO_IDX(offset),
2348                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2349                                     flags);
2350                                 break;
2351                         case VM_MAPTYPE_VPAGETABLE:
2352                                 vm_object_page_clean(object, 0, 0, flags);
2353                                 break;
2354                         }
2355                         vn_unlock(((struct vnode *)object->handle));
2356                         vm_object_deallocate(object);
2357                 }
2358                 if (object && invalidate &&
2359                    ((object->type == OBJT_VNODE) ||
2360                     (object->type == OBJT_DEVICE))) {
2361                         int clean_only = 
2362                                 (object->type == OBJT_DEVICE) ? FALSE : TRUE;
2363                         vm_object_reference(object);
2364                         switch(current->maptype) {
2365                         case VM_MAPTYPE_NORMAL:
2366                                 vm_object_page_remove(object,
2367                                     OFF_TO_IDX(offset),
2368                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2369                                     clean_only);
2370                                 break;
2371                         case VM_MAPTYPE_VPAGETABLE:
2372                                 vm_object_page_remove(object, 0, 0, clean_only);
2373                                 break;
2374                         }
2375                         vm_object_deallocate(object);
2376                 }
2377                 start += size;
2378         }
2379
2380         vm_map_unlock_read(map);
2381         return (KERN_SUCCESS);
2382 }
2383
2384 /*
2385  *      vm_map_entry_unwire:    [ internal use only ]
2386  *
2387  *      Make the region specified by this entry pageable.
2388  *
2389  *      The map in question should be locked.
2390  *      [This is the reason for this routine's existence.]
2391  */
2392 static void 
2393 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2394 {
2395         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2396         entry->wired_count = 0;
2397         vm_fault_unwire(map, entry);
2398 }
2399
2400 /*
2401  *      vm_map_entry_delete:    [ internal use only ]
2402  *
2403  *      Deallocate the given entry from the target map.
2404  */
2405 static void
2406 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
2407 {
2408         vm_map_entry_unlink(map, entry);
2409         map->size -= entry->end - entry->start;
2410
2411         switch(entry->maptype) {
2412         case VM_MAPTYPE_NORMAL:
2413         case VM_MAPTYPE_VPAGETABLE:
2414                 vm_object_deallocate(entry->object.vm_object);
2415                 break;
2416         default:
2417                 break;
2418         }
2419
2420         vm_map_entry_dispose(map, entry, countp);
2421 }
2422
2423 /*
2424  *      vm_map_delete:  [ internal use only ]
2425  *
2426  *      Deallocates the given address range from the target
2427  *      map.
2428  */
2429 int
2430 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
2431 {
2432         vm_object_t object;
2433         vm_map_entry_t entry;
2434         vm_map_entry_t first_entry;
2435
2436 again:
2437         /*
2438          * Find the start of the region, and clip it.  Set entry to point
2439          * at the first record containing the requested address or, if no
2440          * such record exists, the next record with a greater address.  The
2441          * loop will run from this point until a record beyond the termination
2442          * address is encountered.
2443          *
2444          * map->hint must be adjusted to not point to anything we delete,
2445          * so set it to the entry prior to the one being deleted.
2446          *
2447          * GGG see other GGG comment.
2448          */
2449         if (vm_map_lookup_entry(map, start, &first_entry)) {
2450                 entry = first_entry;
2451                 vm_map_clip_start(map, entry, start, countp);
2452                 map->hint = entry->prev;        /* possible problem XXX */
2453         } else {
2454                 map->hint = first_entry;        /* possible problem XXX */
2455                 entry = first_entry->next;
2456         }
2457
2458         /*
2459          * If a hole opens up prior to the current first_free then
2460          * adjust first_free.  As with map->hint, map->first_free
2461          * cannot be left set to anything we might delete.
2462          */
2463         if (entry == &map->header) {
2464                 map->first_free = &map->header;
2465         } else if (map->first_free->start >= start) {
2466                 map->first_free = entry->prev;
2467         }
2468
2469         /*
2470          * Step through all entries in this region
2471          */
2472
2473         while ((entry != &map->header) && (entry->start < end)) {
2474                 vm_map_entry_t next;
2475                 vm_offset_t s, e;
2476                 vm_pindex_t offidxstart, offidxend, count;
2477
2478                 /*
2479                  * If we hit an in-transition entry we have to sleep and
2480                  * retry.  It's easier (and not really slower) to just retry
2481                  * since this case occurs so rarely and the hint is already
2482                  * pointing at the right place.  We have to reset the
2483                  * start offset so as not to accidently delete an entry
2484                  * start offset so as not to accidentally delete an entry
2485                  */
2486                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2487                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2488                         start = entry->start;
2489                         ++mycpu->gd_cnt.v_intrans_coll;
2490                         ++mycpu->gd_cnt.v_intrans_wait;
2491                         vm_map_transition_wait(map);
2492                         goto again;
2493                 }
2494                 vm_map_clip_end(map, entry, end, countp);
2495
2496                 s = entry->start;
2497                 e = entry->end;
2498                 next = entry->next;
2499
2500                 offidxstart = OFF_TO_IDX(entry->offset);
2501                 count = OFF_TO_IDX(e - s);
2502                 object = entry->object.vm_object;
2503
2504                 /*
2505                  * Unwire before removing addresses from the pmap; otherwise,
2506                  * unwiring will put the entries back in the pmap.
2507                  */
2508                 if (entry->wired_count != 0)
2509                         vm_map_entry_unwire(map, entry);
2510
2511                 offidxend = offidxstart + count;
2512
2513                 if (object == &kernel_object) {
2514                         vm_object_page_remove(object, offidxstart, offidxend, FALSE);
2515                 } else {
2516                         pmap_remove(map->pmap, s, e);
2517                         if (object != NULL &&
2518                             object->ref_count != 1 &&
2519                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING &&
2520                             (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
2521                                 vm_object_collapse(object);
2522                                 vm_object_page_remove(object, offidxstart, offidxend, FALSE);
2523                                 if (object->type == OBJT_SWAP) {
2524                                         swap_pager_freespace(object, offidxstart, count);
2525                                 }
2526                                 if (offidxend >= object->size &&
2527                                     offidxstart < object->size) {
2528                                         object->size = offidxstart;
2529                                 }
2530                         }
2531                 }
2532
2533                 /*
2534                  * Delete the entry (which may delete the object) only after
2535                  * removing all pmap entries pointing to its pages.
2536                  * (Otherwise, its page frames may be reallocated, and any
2537                  * modify bits will be set in the wrong object!)
2538                  */
2539                 vm_map_entry_delete(map, entry, countp);
2540                 entry = next;
2541         }
2542         return (KERN_SUCCESS);
2543 }
2544
2545 /*
2546  *      vm_map_remove:
2547  *
2548  *      Remove the given address range from the target map.
2549  *      This is the exported form of vm_map_delete.
2550  */
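/*
 * Editor's illustrative sketch: tearing down a mapping ("addr" and
 * "size" are hypothetical).  Unlike vm_map_delete() this form handles
 * its own locking and entry reservation, as the body below shows.
 *
 *	vm_map_remove(map, addr, addr + size);
 */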
2551 int
2552 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
2553 {
2554         int result;
2555         int count;
2556
2557         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2558         vm_map_lock(map);
2559         VM_MAP_RANGE_CHECK(map, start, end);
2560         result = vm_map_delete(map, start, end, &count);
2561         vm_map_unlock(map);
2562         vm_map_entry_release(count);
2563
2564         return (result);
2565 }
2566
2567 /*
2568  *      vm_map_check_protection:
2569  *
2570  *      Assert that the target map allows the specified
2571  *      privilege on the entire address region given.
2572  *      The entire region must be allocated.
2573  */
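/*
 * Editor's illustrative sketch: verifying that a buffer is readable
 * before operating on it ("addr" and "len" are hypothetical).
 *
 *	if (!vm_map_check_protection(map, addr, addr + len, VM_PROT_READ))
 *		return (EFAULT);
 */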
2574 boolean_t
2575 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
2576                         vm_prot_t protection)
2577 {
2578         vm_map_entry_t entry;
2579         vm_map_entry_t tmp_entry;
2580
2581         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
2582                 return (FALSE);
2583         }
2584         entry = tmp_entry;
2585
2586         while (start < end) {
2587                 if (entry == &map->header) {
2588                         return (FALSE);
2589                 }
2590                 /*
2591                  * No holes allowed!
2592                  */
2593
2594                 if (start < entry->start) {
2595                         return (FALSE);
2596                 }
2597                 /*
2598                  * Check protection associated with entry.
2599                  */
2600
2601                 if ((entry->protection & protection) != protection) {
2602                         return (FALSE);
2603                 }
2604                 /* go to next entry */
2605
2606                 start = entry->end;
2607                 entry = entry->next;
2608         }
2609         return (TRUE);
2610 }
2611
2612 /*
2613  * Split the pages in a map entry into a new object.  This affords
2614  * easier removal of unused pages, and keeps object inheritance from
2615  * having a negative impact on memory usage.
2616  */
2617 static void
2618 vm_map_split(vm_map_entry_t entry)
2619 {
2620         vm_page_t m;
2621         vm_object_t orig_object, new_object, source;
2622         vm_offset_t s, e;
2623         vm_pindex_t offidxstart, offidxend, idx;
2624         vm_size_t size;
2625         vm_ooffset_t offset;
2626
2627         orig_object = entry->object.vm_object;
2628         if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
2629                 return;
2630         if (orig_object->ref_count <= 1)
2631                 return;
2632
2633         offset = entry->offset;
2634         s = entry->start;
2635         e = entry->end;
2636
2637         offidxstart = OFF_TO_IDX(offset);
2638         offidxend = offidxstart + OFF_TO_IDX(e - s);
2639         size = offidxend - offidxstart;
2640
2641         new_object = vm_pager_allocate(orig_object->type, NULL,
2642                                        IDX_TO_OFF(size), VM_PROT_ALL, 0);
2643         if (new_object == NULL)
2644                 return;
2645
2646         source = orig_object->backing_object;
2647         if (source != NULL) {
2648                 vm_object_reference(source);    /* Referenced by new_object */
2649                 LIST_INSERT_HEAD(&source->shadow_head,
2650                                   new_object, shadow_list);
2651                 vm_object_clear_flag(source, OBJ_ONEMAPPING);
2652                 new_object->backing_object_offset = 
2653                         orig_object->backing_object_offset + IDX_TO_OFF(offidxstart);
2654                 new_object->backing_object = source;
2655                 source->shadow_count++;
2656                 source->generation++;
2657         }
2658
2659         for (idx = 0; idx < size; idx++) {
2660                 vm_page_t m;
2661
2662                 /*
2663                  * A critical section is required to avoid a race between
2664                  * the lookup and an interrupt/unbusy/free and our busy
2665                  * check.
2666                  */
2667                 crit_enter();
2668         retry:
2669                 m = vm_page_lookup(orig_object, offidxstart + idx);
2670                 if (m == NULL) {
2671                         crit_exit();
2672                         continue;
2673                 }
2674
2675                 /*
2676                  * We must wait for pending I/O to complete before we can
2677                  * rename the page.
2678                  *
2679                  * We do not have to VM_PROT_NONE the page as mappings should
2680                  * not be changed by this operation.
2681                  */
2682                 if (vm_page_sleep_busy(m, TRUE, "spltwt"))
2683                         goto retry;
2684                 vm_page_busy(m);
2685                 vm_page_rename(m, new_object, idx);
2686                 /* page automatically made dirty by rename and cache handled */
2687                 vm_page_busy(m);
2688                 crit_exit();
2689         }
2690
2691         if (orig_object->type == OBJT_SWAP) {
2692                 vm_object_pip_add(orig_object, 1);
2693                 /*
2694                  * copy orig_object pages into new_object
2695                  * and destroy unneeded pages in
2696                  * shadow object.
2697                  */
2698                 swap_pager_copy(orig_object, new_object, offidxstart, 0);
2699                 vm_object_pip_wakeup(orig_object);
2700         }
2701
2702         /*
2703          * Wakeup the pages we played with.  No spl protection is needed
2704          * for a simple wakeup.
2705          */
2706         for (idx = 0; idx < size; idx++) {
2707                 m = vm_page_lookup(new_object, idx);
2708                 if (m)
2709                         vm_page_wakeup(m);
2710         }
2711
2712         entry->object.vm_object = new_object;
2713         entry->offset = 0LL;
2714         vm_object_deallocate(orig_object);
2715 }
2716
2717 /*
2718  *      vm_map_copy_entry:
2719  *
2720  *      Copies the contents of the source entry to the destination
2721  *      entry.  The entries *must* be aligned properly.
2722  */
2723 static void
2724 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
2725         vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
2726 {
2727         vm_object_t src_object;
2728
2729         if (dst_entry->maptype == VM_MAPTYPE_SUBMAP)
2730                 return;
2731         if (src_entry->maptype == VM_MAPTYPE_SUBMAP)
2732                 return;
2733
2734         if (src_entry->wired_count == 0) {
2735                 /*
2736                  * If the source entry is marked needs_copy, it is already
2737                  * write-protected.
2738                  */
2739                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
2740                         pmap_protect(src_map->pmap,
2741                             src_entry->start,
2742                             src_entry->end,
2743                             src_entry->protection & ~VM_PROT_WRITE);
2744                 }
2745
2746                 /*
2747                  * Make a copy of the object.
2748                  */
2749                 if ((src_object = src_entry->object.vm_object) != NULL) {
2750                         if ((src_object->handle == NULL) &&
2751                                 (src_object->type == OBJT_DEFAULT ||
2752                                  src_object->type == OBJT_SWAP)) {
2753                                 vm_object_collapse(src_object);
2754                                 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
2755                                         vm_map_split(src_entry);
2756                                         src_object = src_entry->object.vm_object;
2757                                 }
2758                         }
2759
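                             /*
                              * Descriptive note: both the source and the
                              * destination entries will now reference
                              * src_object copy-on-write; the first write
                              * fault through either map creates a shadow
                              * object (see the MAP_ENTRY_NEEDS_COPY handling
                              * in vm_map_lookup()).
                              */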
2760                         vm_object_reference(src_object);
2761                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
2762                         dst_entry->object.vm_object = src_object;
2763                         src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2764                         dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2765                         dst_entry->offset = src_entry->offset;
2766                 } else {
2767                         dst_entry->object.vm_object = NULL;
2768                         dst_entry->offset = 0;
2769                 }
2770
2771                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
2772                     dst_entry->end - dst_entry->start, src_entry->start);
2773         } else {
2774                 /*
2775                  * Of course, wired-down pages can't be set copy-on-write.
2776                  * Cause wired pages to be copied into the new map by
2777                  * simulating faults (the new pages are pageable).
2778                  */
2779                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
2780         }
2781 }
2782
2783 /*
2784  * vmspace_fork:
2785  * Create a new process vmspace structure and vm_map
2786  * based on those of an existing process.  The new map
2787  * is based on the old map, according to the inheritance
2788  * values on the regions in that map.
2789  *
2790  * The source map must not be locked.
2791  */
2792 struct vmspace *
2793 vmspace_fork(struct vmspace *vm1)
2794 {
2795         struct vmspace *vm2;
2796         vm_map_t old_map = &vm1->vm_map;
2797         vm_map_t new_map;
2798         vm_map_entry_t old_entry;
2799         vm_map_entry_t new_entry;
2800         vm_object_t object;
2801         int count;
2802
2803         vm_map_lock(old_map);
2804         old_map->infork = 1;
2805
2806         /*
2807          * XXX Note: upcalls are not copied.
2808          */
2809         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
2810         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
2811             (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
2812         new_map = &vm2->vm_map; /* XXX */
2813         new_map->timestamp = 1;
2814
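             /*
              * Pre-count the old map's entries so vm_map_entry_reserve() can
              * reserve them all (plus MAP_RESERVE_COUNT slop) before the copy
              * loop below runs.
              */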
2815         count = 0;
2816         old_entry = old_map->header.next;
2817         while (old_entry != &old_map->header) {
2818                 ++count;
2819                 old_entry = old_entry->next;
2820         }
2821
2822         count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
2823
2824         old_entry = old_map->header.next;
2825         while (old_entry != &old_map->header) {
2826                 if (old_entry->maptype == VM_MAPTYPE_SUBMAP)
2827                         panic("vm_map_fork: encountered a submap");
2828
2829                 switch (old_entry->inheritance) {
2830                 case VM_INHERIT_NONE:
2831                         break;
2832
2833                 case VM_INHERIT_SHARE:
2834                         /*
2835                          * Clone the entry, creating the shared object if
2836                          * necessary.
2837                          */
2838                         object = old_entry->object.vm_object;
2839                         if (object == NULL) {
2840                                 vm_map_entry_allocate_object(old_entry);
2841                                 object = old_entry->object.vm_object;
2842                         }
2843
2844                         /*
2845                          * Add the reference before calling vm_map_entry_shadow
2846                          * to ensure that a shadow object is created.
2847                          */
2848                         vm_object_reference(object);
2849                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
2850                                 vm_map_entry_shadow(old_entry);
2851                                 /* Transfer the second reference too. */
2852                                 vm_object_reference(
2853                                     old_entry->object.vm_object);
2854                                 vm_object_deallocate(object);
2855                                 object = old_entry->object.vm_object;
2856                         }
2857                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
2858
2859                         /*
2860                          * Clone the entry, referencing the shared object.
2861                          */
2862                         new_entry = vm_map_entry_create(new_map, &count);
2863                         *new_entry = *old_entry;
2864                         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2865                         new_entry->wired_count = 0;
2866
2867                         /*
2868                          * Insert the entry into the new map -- we know we're
2869                          * inserting at the end of the new map.
2870                          */
2871
2872                         vm_map_entry_link(new_map, new_map->header.prev,
2873                             new_entry);
2874
2875                         /*
2876                          * Update the physical map
2877                          */
2878
2879                         pmap_copy(new_map->pmap, old_map->pmap,
2880                             new_entry->start,
2881                             (old_entry->end - old_entry->start),
2882                             old_entry->start);
2883                         break;
2884
2885                 case VM_INHERIT_COPY:
2886                         /*
2887                          * Clone the entry and link into the map.
2888                          */
2889                         new_entry = vm_map_entry_create(new_map, &count);
2890                         *new_entry = *old_entry;
2891                         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2892                         new_entry->wired_count = 0;
2893                         new_entry->object.vm_object = NULL;
2894                         vm_map_entry_link(new_map, new_map->header.prev,
2895                             new_entry);
2896                         vm_map_copy_entry(old_map, new_map, old_entry,
2897                             new_entry);
2898                         break;
2899                 }
2900                 old_entry = old_entry->next;
2901         }
2902
2903         new_map->size = old_map->size;
2904         old_map->infork = 0;
2905         vm_map_unlock(old_map);
2906         vm_map_entry_release(count);
2907
2908         return (vm2);
2909 }
2910
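     /*
      *      vm_map_stack:
      *
      *      Reserve max_ssize bytes for a grow-down stack ending at
      *      addrbos + max_ssize, but initially map only init_ssize bytes
      *      at the top of the range.  The remainder is recorded in
      *      aux.avail_ssize so vm_map_growstack() can extend the mapping
      *      downward on demand.
      *
      *      Illustrative call only (a sketch; the variable names are
      *      hypothetical and not taken from an actual call site):
      *
      *              rv = vm_map_stack(map, addrbos, max_ssize,
      *                                VM_PROT_ALL, VM_PROT_ALL, 0);
      */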
2911 int
2912 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
2913               vm_prot_t prot, vm_prot_t max, int cow)
2914 {
2915         vm_map_entry_t prev_entry;
2916         vm_map_entry_t new_stack_entry;
2917         vm_size_t      init_ssize;
2918         int            rv;
2919         int             count;
2920
2921         if (VM_MIN_USER_ADDRESS > 0 && addrbos < VM_MIN_USER_ADDRESS)
2922                 return (KERN_NO_SPACE);
2923
2924         if (max_ssize < sgrowsiz)
2925                 init_ssize = max_ssize;
2926         else
2927                 init_ssize = sgrowsiz;
2928
2929         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2930         vm_map_lock(map);
2931
2932         /* If addr is already mapped, no go */
2933         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
2934                 vm_map_unlock(map);
2935                 vm_map_entry_release(count);
2936                 return (KERN_NO_SPACE);
2937         }
2938
2939         /* If we would blow our VMEM resource limit, no go */
2940         if (map->size + init_ssize >
2941             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
2942                 vm_map_unlock(map);
2943                 vm_map_entry_release(count);
2944                 return (KERN_NO_SPACE);
2945         }
2946
2947         /* If we can't accommodate max_ssize in the current mapping,
2948          * no go.  However, we need to be aware that subsequent user
2949          * mappings might map into the space we have reserved for
2950          * the stack, and currently this space is not protected.
2951          * 
2952          * Hopefully we will at least detect this condition 
2953          * when we try to grow the stack.
2954          */
2955         if ((prev_entry->next != &map->header) &&
2956             (prev_entry->next->start < addrbos + max_ssize)) {
2957                 vm_map_unlock(map);
2958                 vm_map_entry_release(count);
2959                 return (KERN_NO_SPACE);
2960         }
2961
2962         /* We initially map a stack of only init_ssize.  We will
2963          * grow as needed later.  Since this is to be a grow-down
2964          * stack, we map at the top of the range.
2965          *
2966          * Note: we would normally expect prot and max to be
2967          * VM_PROT_ALL, and cow to be 0.  Possibly we should
2968          * eliminate these as input parameters, and just
2969          * pass these values here in the insert call.
2970          */
2971         rv = vm_map_insert(map, &count,
2972                            NULL, 0, addrbos + max_ssize - init_ssize,
2973                            addrbos + max_ssize,
2974                            VM_MAPTYPE_NORMAL,
2975                            prot, max,
2976                            cow);
2977
2978         /* Now set the avail_ssize amount */
2979         if (rv == KERN_SUCCESS) {
2980                 if (prev_entry != &map->header)
2981                         vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
2982                 new_stack_entry = prev_entry->next;
2983                 if (new_stack_entry->end   != addrbos + max_ssize ||
2984                     new_stack_entry->start != addrbos + max_ssize - init_ssize)
2985                         panic ("Bad entry start/end for new stack entry");
2986                 else 
2987                         new_stack_entry->aux.avail_ssize = max_ssize - init_ssize;
2988         }
2989
2990         vm_map_unlock(map);
2991         vm_map_entry_release(count);
2992         return (rv);
2993 }
2994
2995 /* Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
2996  * desired address is already mapped, or if we successfully grow
2997  * the stack.  Also returns KERN_SUCCESS if addr is outside the
2998  * stack range (this is strange, but preserves compatibility with
2999  * the grow function in vm_machdep.c).
3000  */
3001 int
3002 vm_map_growstack (struct proc *p, vm_offset_t addr)
3003 {
3004         vm_map_entry_t prev_entry;
3005         vm_map_entry_t stack_entry;
3006         vm_map_entry_t new_stack_entry;
3007         struct vmspace *vm = p->p_vmspace;
3008         vm_map_t map = &vm->vm_map;
3009         vm_offset_t    end;
3010         int grow_amount;
3011         int rv = KERN_SUCCESS;
3012         int is_procstack;
3013         int use_read_lock = 1;
3014         int count;
3015
3016         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
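             /*
              * Lock strategy: start with a read lock and upgrade to an
              * exclusive lock only when the map must be modified.  If the
              * upgrade fails the lock has been lost, so go back to Retry
              * and reacquire it exclusively.
              */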
3017 Retry:
3018         if (use_read_lock)
3019                 vm_map_lock_read(map);
3020         else
3021                 vm_map_lock(map);
3022
3023         /* If addr is already in the entry range, no need to grow. */
3024         if (vm_map_lookup_entry(map, addr, &prev_entry))
3025                 goto done;
3026
3027         if ((stack_entry = prev_entry->next) == &map->header)
3028                 goto done;
3029         if (prev_entry == &map->header) 
3030                 end = stack_entry->start - stack_entry->aux.avail_ssize;
3031         else
3032                 end = prev_entry->end;
3033
3034         /* This next test mimics the old grow function in vm_machdep.c.
3035          * It really doesn't quite make sense, but we do it anyway
3036          * for compatibility.
3037          *
3038          * If the stack is not growable, return success.  This signals
3039          * the caller to proceed as it normally would with ordinary VM.
3040          */
3041         if (stack_entry->aux.avail_ssize < 1 ||
3042             addr >= stack_entry->start ||
3043             addr <  stack_entry->start - stack_entry->aux.avail_ssize) {
3044                 goto done;
3045         } 
3046         
3047         /* Find the minimum grow amount */
3048         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
3049         if (grow_amount > stack_entry->aux.avail_ssize) {
3050                 rv = KERN_NO_SPACE;
3051                 goto done;
3052         }
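             /*
              * Example with illustrative numbers: given a 4K PAGE_SIZE, a
              * fault at stack_entry->start - 0x2800 yields a minimum
              * grow_amount of 0x3000, i.e. three pages.
              */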
3053
3054         /* If there is no longer enough space between the entries,
3055          * fail and adjust the available space.  Note: this
3056          * should only happen if the user has mapped into the
3057          * stack area after the stack was created, and is
3058          * probably an error.
3059          *
3060          * This also effectively destroys any guard page the user
3061          * might have intended by limiting the stack size.
3062          */
3063         if (grow_amount > stack_entry->start - end) {
3064                 if (use_read_lock && vm_map_lock_upgrade(map)) {
3065                         use_read_lock = 0;
3066                         goto Retry;
3067                 }
3068                 use_read_lock = 0;
3069                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3070                 rv = KERN_NO_SPACE;
3071                 goto done;
3072         }
3073
3074         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
3075
3076         /* If this is the main process stack, see if we're over the 
3077          * stack limit.
3078          */
3079         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3080                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3081                 rv = KERN_NO_SPACE;
3082                 goto done;
3083         }
3084
3085         /* Round up the grow amount to a multiple of sgrowsiz */
3086         grow_amount = roundup (grow_amount, sgrowsiz);
3087         if (grow_amount > stack_entry->aux.avail_ssize) {
3088                 grow_amount = stack_entry->aux.avail_ssize;
3089         }
3090         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3091                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3092                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
3093                               ctob(vm->vm_ssize);
3094         }
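             /*
              * Example with illustrative numbers: if sgrowsiz were 128KB,
              * the three-page minimum from the example above would be
              * rounded up to 128KB, then clamped by avail_ssize and, for
              * the main process stack, by RLIMIT_STACK.
              */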
3095
3096         /* If we would blow our VMEM resource limit, no go */
3097         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3098                 rv = KERN_NO_SPACE;
3099                 goto done;
3100         }
3101
3102         if (use_read_lock && vm_map_lock_upgrade(map)) {
3103                 use_read_lock = 0;
3104                 goto Retry;
3105         }
3106         use_read_lock = 0;
3107
3108         /* Get the preliminary new entry start value */
3109         addr = stack_entry->start - grow_amount;
3110
3111         /* If this puts us into the previous entry, cut back our growth
3112          * to the available space.  Also, see the note above.
3113          */
3114         if (addr < end) {
3115                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3116                 addr = end;
3117         }
3118
3119         rv = vm_map_insert(map, &count,
3120                            NULL, 0, addr, stack_entry->start,
3121                            VM_MAPTYPE_NORMAL,
3122                            VM_PROT_ALL, VM_PROT_ALL,
3123                            0);
3124
3125         /* Adjust the available stack space by the amount we grew. */
3126         if (rv == KERN_SUCCESS) {
3127                 if (prev_entry != &map->header)
3128                         vm_map_clip_end(map, prev_entry, addr, &count);
3129                 new_stack_entry = prev_entry->next;
3130                 if (new_stack_entry->end   != stack_entry->start  ||
3131                     new_stack_entry->start != addr)
3132                         panic ("Bad stack grow start/end in new stack entry");
3133                 else {
3134                         new_stack_entry->aux.avail_ssize =
3135                                 stack_entry->aux.avail_ssize -
3136                                 (new_stack_entry->end - new_stack_entry->start);
3137                         if (is_procstack)
3138                                 vm->vm_ssize += btoc(new_stack_entry->end -
3139                                                      new_stack_entry->start);
3140                 }
3141         }
3142
3143 done:
3144         if (use_read_lock)
3145                 vm_map_unlock_read(map);
3146         else
3147                 vm_map_unlock(map);
3148         vm_map_entry_release(count);
3149         return (rv);
3150 }
3151
3152 /*
3153  * Unshare the specified VM space for exec.  If other processes are
3154  * mapped to it, then create a new one.  The new vmspace is empty unless
3155  * a resident vmspace (vmcopy) is supplied, in which case it is forked.
      */
3156
3157 void
3158 vmspace_exec(struct proc *p, struct vmspace *vmcopy) 
3159 {
3160         struct vmspace *oldvmspace = p->p_vmspace;
3161         struct vmspace *newvmspace;
3162         vm_map_t map = &p->p_vmspace->vm_map;
3163
3164         /*
3165          * If we are execing a resident vmspace we fork it, otherwise
3166          * we create a new vmspace.  Note that exitingcnt and upcalls
3167          * are not copied to the new vmspace.
3168          */
3169         if (vmcopy)  {
3170             newvmspace = vmspace_fork(vmcopy);
3171         } else {
3172             newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
3173             bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
3174                 (caddr_t)&oldvmspace->vm_endcopy - 
3175                     (caddr_t)&oldvmspace->vm_startcopy);
3176         }
3177
3178         /*
3179          * This code is written like this for prototype purposes.  The
3180          * goal is to avoid running down the vmspace here, but let the
3181          * other processes that are still using the vmspace finally
3182          * run it down.  Even though there is little or no chance of blocking
3183          * here, it is a good idea to keep this form for future mods.
3184          */
3185         p->p_vmspace = newvmspace;
3186         pmap_pinit2(vmspace_pmap(newvmspace));
3187         if (p == curproc)
3188                 pmap_activate(p);
3189         vmspace_free(oldvmspace);
3190 }
3191
3192 /*
3193  * Unshare the specified VM space for forcing COW.  This
3194  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3195  *
3196  * The exitingcnt test is not strictly necessary but has been
3197  * included for code sanity (to make the code a bit more deterministic).
3198  */
3199
3200 void
3201 vmspace_unshare(struct proc *p) 
3202 {
3203         struct vmspace *oldvmspace = p->p_vmspace;
3204         struct vmspace *newvmspace;
3205
3206         if (oldvmspace->vm_refcnt == 1 && oldvmspace->vm_exitingcnt == 0)
3207                 return;
3208         newvmspace = vmspace_fork(oldvmspace);
3209         p->p_vmspace = newvmspace;
3210         pmap_pinit2(vmspace_pmap(newvmspace));
3211         if (p == curproc)
3212                 pmap_activate(p);
3213         vmspace_free(oldvmspace);
3214 }
3215
3216 /*
3217  *      vm_map_lookup:
3218  *
3219  *      Finds the VM object, offset, and
3220  *      protection for a given virtual address in the
3221  *      specified map, assuming a page fault of the
3222  *      type specified.
3223  *
3224  *      Leaves the map in question locked for read; return
3225  *      values are guaranteed until a vm_map_lookup_done
3226  *      call is performed.  Note that the map argument
3227  *      is in/out; the returned map must be used in
3228  *      the call to vm_map_lookup_done.
3229  *
3230  *      A handle (out_entry) is returned for use in
3231  *      vm_map_lookup_done, to make that fast.
3232  *
3233  *      If a lookup is requested with "write protection"
3234  *      specified, the map may be changed to perform virtual
3235  *      copying operations, although the data referenced will
3236  *      remain the same.
3237  */
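     /*
      *      Illustrative caller sketch (the variable names are hypothetical
      *      and this is not copied from an actual call site):
      *
      *              result = vm_map_lookup(&map, vaddr, fault_type,
      *                                     &entry, &object, &pindex,
      *                                     &prot, &wired);
      *              ...
      *              vm_map_lookup_done(map, entry, 0);
      */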
3238 int
3239 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
3240               vm_offset_t vaddr,
3241               vm_prot_t fault_typea,
3242               vm_map_entry_t *out_entry,        /* OUT */
3243               vm_object_t *object,              /* OUT */
3244               vm_pindex_t *pindex,              /* OUT */
3245               vm_prot_t *out_prot,              /* OUT */
3246               boolean_t *wired)                 /* OUT */
3247 {
3248         vm_map_entry_t entry;
3249         vm_map_t map = *var_map;
3250         vm_prot_t prot;
3251         vm_prot_t fault_type = fault_typea;
3252         int use_read_lock = 1;
3253         int rv = KERN_SUCCESS;
3254
3255 RetryLookup:
3256         if (use_read_lock)
3257                 vm_map_lock_read(map);
3258         else
3259                 vm_map_lock(map);
3260
3261         /*
3262          * If the map has an interesting hint, try it before calling the
3263          * full-blown lookup routine.
3264          */
3265         entry = map->hint;
3266         *out_entry = entry;
3267
3268         if ((entry == &map->header) ||
3269             (vaddr < entry->start) || (vaddr >= entry->end)) {
3270                 vm_map_entry_t tmp_entry;
3271
3272                 /*
3273                  * Entry was either not a valid hint, or the vaddr was not
3274                  * contained in the entry, so do a full lookup.
3275                  */
3276                 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
3277                         rv = KERN_INVALID_ADDRESS;
3278                         goto done;
3279                 }
3280
3281                 entry = tmp_entry;
3282                 *out_entry = entry;
3283         }
3284         
3285         /*
3286          * Handle submaps.
3287          */
3288         if (entry->maptype == VM_MAPTYPE_SUBMAP) {
3289                 vm_map_t old_map = map;
3290
3291                 *var_map = map = entry->object.sub_map;
3292                 if (use_read_lock)
3293                         vm_map_unlock_read(old_map);
3294                 else
3295                         vm_map_unlock(old_map);
3296                 use_read_lock = 1;
3297                 goto RetryLookup;
3298         }
3299
3300         /*
3301          * Check whether this task is allowed to have this page.
3302          * Note the special case for MAP_ENTRY_COW
3303          * pages with an override.  This is to implement a forced
3304          * COW for debuggers.
3305          */
3306
3307         if (fault_type & VM_PROT_OVERRIDE_WRITE)
3308                 prot = entry->max_protection;
3309         else
3310                 prot = entry->protection;
3311
3312         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
3313         if ((fault_type & prot) != fault_type) {
3314                 rv = KERN_PROTECTION_FAILURE;
3315                 goto done;
3316         }
3317
3318         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3319             (entry->eflags & MAP_ENTRY_COW) &&
3320             (fault_type & VM_PROT_WRITE) &&
3321             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
3322                 rv = KERN_PROTECTION_FAILURE;
3323                 goto done;
3324         }
3325
3326         /*
3327          * If this page is not pageable, we have to get it for all possible
3328          * accesses.
3329          */
3330         *wired = (entry->wired_count != 0);
3331         if (*wired)
3332                 prot = fault_type = entry->protection;
3333
3334         /*
3335          * Virtual page tables may need to update the accessed (A) bit
3336          * in a page table entry.  Upgrade the fault to a write fault for
3337          * that case if the map will support it.  If the map does not support
3338          * it the page table entry simply will not be updated.
3339          */
3340         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
3341                 if (prot & VM_PROT_WRITE)
3342                         fault_type |= VM_PROT_WRITE;
3343         }
3344
3345         /*
3346          * If the entry was copy-on-write, we either ...
3347          */
3348         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3349                 /*
3350                  * If we want to write the page, we may as well handle that
3351                  * now since we've got the map locked.
3352                  *
3353                  * If we don't need to write the page, we just demote the
3354                  * permissions allowed.
3355                  */
3356
3357                 if (fault_type & VM_PROT_WRITE) {
3358                         /*
3359                          * Make a new object, and place it in the object
3360                          * chain.  Note that no new references have appeared
3361                          * -- one just moved from the map to the new
3362                          * object.
3363                          */
3364
3365                         if (use_read_lock && vm_map_lock_upgrade(map)) {
3366                                 use_read_lock = 0;
3367                                 goto RetryLookup;
3368                         }
3369                         use_read_lock = 0;
3370
3371                         vm_map_entry_shadow(entry);
3372                 } else {
3373                         /*
3374                          * We're attempting to read a copy-on-write page --
3375                          * don't allow writes.
3376                          */
3377
3378                         prot &= ~VM_PROT_WRITE;
3379                 }
3380         }
3381
3382         /*
3383          * Create an object if necessary.
3384          */
3385         if (entry->object.vm_object == NULL &&
3386             !map->system_map) {
3387                 if (use_read_lock && vm_map_lock_upgrade(map))  {
3388                         use_read_lock = 0;
3389                         goto RetryLookup;
3390                 }
3391                 use_read_lock = 0;
3392                 vm_map_entry_allocate_object(entry);
3393         }
3394
3395         /*
3396          * Return the object/offset from this entry.  If the entry was
3397          * copy-on-write or empty, it has been fixed up.
3398          */
3399
3400         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
3401         *object = entry->object.vm_object;
3402
3403         /*
3404          * Return the computed protection to the caller.  On
3405          * success we return with a read lock held on the map.  On failure
3406          * we return with the map unlocked.
3407          */
3408         *out_prot = prot;
3409 done:
3410         if (rv == KERN_SUCCESS) {
3411                 if (use_read_lock == 0)
3412                         vm_map_lock_downgrade(map);
3413         } else if (use_read_lock) {
3414                 vm_map_unlock_read(map);
3415         } else {
3416                 vm_map_unlock(map);
3417         }
3418         return (rv);
3419 }
3420
3421 /*
3422  *      vm_map_lookup_done:
3423  *
3424  *      Releases locks acquired by a vm_map_lookup
3425  *      (according to the handle returned by that lookup).
3426  */
3427
3428 void
3429 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
3430 {
3431         /*
3432          * Unlock the main-level map
3433          */
3434         vm_map_unlock_read(map);
3435         if (count)
3436                 vm_map_entry_release(count);
3437 }
3438
3439 #include "opt_ddb.h"
3440 #ifdef DDB
3441 #include <sys/kernel.h>
3442
3443 #include <ddb/ddb.h>
3444
3445 /*
3446  *      vm_map_print:   [ debug ]
3447  */
3448 DB_SHOW_COMMAND(map, vm_map_print)
3449 {
3450         static int nlines;
3451         /* XXX convert args. */
3452         vm_map_t map = (vm_map_t)addr;
3453         boolean_t full = have_addr;
3454
3455         vm_map_entry_t entry;
3456
3457         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
3458             (void *)map,
3459             (void *)map->pmap, map->nentries, map->timestamp);
3460         nlines++;
3461
3462         if (!full && db_indent)
3463                 return;
3464
3465         db_indent += 2;
3466         for (entry = map->header.next; entry != &map->header;
3467             entry = entry->next) {
3468                 db_iprintf("map entry %p: start=%p, end=%p\n",
3469                     (void *)entry, (void *)entry->start, (void *)entry->end);
3470                 nlines++;
3471                 {
3472                         static char *inheritance_name[4] =
3473                         {"share", "copy", "none", "donate_copy"};
3474
3475                         db_iprintf(" prot=%x/%x/%s",
3476                             entry->protection,
3477                             entry->max_protection,
3478                             inheritance_name[(int)(unsigned char)entry->inheritance]);
3479                         if (entry->wired_count != 0)
3480                                 db_printf(", wired");
3481                 }
3482                 if (entry->maptype == VM_MAPTYPE_SUBMAP) {
3483                         /* XXX no %qd in kernel.  Truncate entry->offset. */
3484                         db_printf(", share=%p, offset=0x%lx\n",
3485                             (void *)entry->object.sub_map,
3486                             (long)entry->offset);
3487                         nlines++;
3488                         if ((entry->prev == &map->header) ||
3489                             (entry->prev->object.sub_map !=
3490                                 entry->object.sub_map)) {
3491                                 db_indent += 2;
3492                                 vm_map_print((db_expr_t)(intptr_t)
3493                                              entry->object.sub_map,
3494                                              full, 0, (char *)0);
3495                                 db_indent -= 2;
3496                         }
3497                 } else {
3498                         /* XXX no %qd in kernel.  Truncate entry->offset. */
3499                         db_printf(", object=%p, offset=0x%lx",
3500                             (void *)entry->object.vm_object,
3501                             (long)entry->offset);
3502                         if (entry->eflags & MAP_ENTRY_COW)
3503                                 db_printf(", copy (%s)",
3504                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
3505                         db_printf("\n");
3506                         nlines++;
3507
3508                         if ((entry->prev == &map->header) ||
3509                             (entry->prev->object.vm_object !=
3510                                 entry->object.vm_object)) {
3511                                 db_indent += 2;
3512                                 vm_object_print((db_expr_t)(intptr_t)
3513                                                 entry->object.vm_object,
3514                                                 full, 0, (char *)0);
3515                                 nlines += 4;
3516                                 db_indent -= 2;
3517                         }
3518                 }
3519         }
3520         db_indent -= 2;
3521         if (db_indent == 0)
3522                 nlines = 0;
3523 }
3524
3525
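     /*
      *      procvm: [ debug ]
      *
      *      Print the vmspace, vm_map and pmap pointers for the process
      *      given by the address argument (curproc if no address is
      *      supplied), then dump its map via vm_map_print().
      */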
3526 DB_SHOW_COMMAND(procvm, procvm)
3527 {
3528         struct proc *p;
3529
3530         if (have_addr) {
3531                 p = (struct proc *) addr;
3532         } else {
3533                 p = curproc;
3534         }
3535
3536         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
3537             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
3538             (void *)vmspace_pmap(p->p_vmspace));
3539
3540         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
3541 }
3542
3543 #endif /* DDB */