1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1991, 1993
5  *      The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * The Mach Operating System project at Carnegie-Mellon University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *      This product includes software developed by the University of
21  *      California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
39  *
40  *
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  *
66  * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
67  * $DragonFly: src/sys/vm/vm_map.c,v 1.56 2007/04/29 18:25:41 dillon Exp $
68  */
69
70 /*
71  *      Virtual memory mapping module.
72  */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/kernel.h>
77 #include <sys/proc.h>
78 #include <sys/serialize.h>
79 #include <sys/lock.h>
80 #include <sys/vmmeter.h>
81 #include <sys/mman.h>
82 #include <sys/vnode.h>
83 #include <sys/resourcevar.h>
84 #include <sys/shm.h>
85 #include <sys/tree.h>
86 #include <sys/malloc.h>
87
88 #include <vm/vm.h>
89 #include <vm/vm_param.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_pager.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_extern.h>
97 #include <vm/swap_pager.h>
98 #include <vm/vm_zone.h>
99
100 #include <sys/thread2.h>
101 #include <sys/sysref2.h>
102 #include <sys/random.h>
103 #include <sys/sysctl.h>
104
105 /*
106  * Virtual memory maps provide for the mapping, protection, and sharing
107  * of virtual memory objects.  In addition, this module provides for an
108  * efficient virtual copy of memory from one map to another.
109  *
110  * Synchronization is required prior to most operations.
111  *
112  * Maps consist of an ordered doubly-linked list of simple entries.
113  * A hint and an RB tree are used to speed up lookups.
114  *
115  * Callers looking to modify maps specify start/end addresses which cause
116  * the related map entry to be clipped if necessary, and then later
117  * recombined if the pieces remain compatible.
118  *
119  * Virtual copy operations are performed by copying VM object references
120  * from one map to another, and then marking both regions as copy-on-write.
121  */
122 static void vmspace_terminate(struct vmspace *vm);
123 static void vmspace_lock(struct vmspace *vm);
124 static void vmspace_unlock(struct vmspace *vm);
125 static void vmspace_dtor(void *obj, void *private);
126
127 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
128
129 struct sysref_class vmspace_sysref_class = {
130         .name =         "vmspace",
131         .mtype =        M_VMSPACE,
132         .proto =        SYSREF_PROTO_VMSPACE,
133         .offset =       offsetof(struct vmspace, vm_sysref),
134         .objsize =      sizeof(struct vmspace),
135         .mag_capacity = 32,
136         .flags = SRC_MANAGEDINIT,
137         .dtor = vmspace_dtor,
138         .ops = {
139                 .terminate = (sysref_terminate_func_t)vmspace_terminate,
140                 .lock = (sysref_lock_func_t)vmspace_lock,
141                 .unlock = (sysref_lock_func_t)vmspace_unlock
142         }
143 };
144
145 #define VMEPERCPU       2
146
147 static struct vm_zone mapentzone_store, mapzone_store;
148 static vm_zone_t mapentzone, mapzone;
149 static struct vm_object mapentobj, mapobj;
150
151 static struct vm_map_entry map_entry_init[MAX_MAPENT];
152 static struct vm_map_entry cpu_map_entry_init[MAXCPU][VMEPERCPU];
153 static struct vm_map map_init[MAX_KMAP];
154
155 static int randomize_mmap;
156 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
157     "Randomize mmap offsets");
158
159 static void vm_map_entry_shadow(vm_map_entry_t entry);
160 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
161 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
162 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
163 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
164 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
165 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
166 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
167                 vm_map_entry_t);
168 static void vm_map_split (vm_map_entry_t);
169 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags);
170
171 /*
172  * Initialize the vm_map module.  Must be called before any other vm_map
173  * routines.
174  *
175  * Map and entry structures are allocated from the general purpose
176  * memory pool with some exceptions:
177  *
178  *      - The kernel map is allocated statically.
179  *      - Initial kernel map entries are allocated out of a static pool.
180  *
181  *      These restrictions are necessary since malloc() uses the
182  *      maps and requires map entries.
183  *
184  * Called from the low level boot code only.
185  */
186 void
187 vm_map_startup(void)
188 {
189         mapzone = &mapzone_store;
190         zbootinit(mapzone, "MAP", sizeof (struct vm_map),
191                 map_init, MAX_KMAP);
192         mapentzone = &mapentzone_store;
193         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
194                 map_entry_init, MAX_MAPENT);
195 }
196
197 /*
198  * Called prior to any vmspace allocations.
199  *
200  * Called from the low level boot code only.
201  */
202 void
203 vm_init2(void) 
204 {
205         zinitna(mapentzone, &mapentobj, NULL, 0, 0, 
206                 ZONE_USE_RESERVE | ZONE_SPECIAL, 1);
207         zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1);
208         pmap_init2();
209         vm_object_init2();
210 }
211
212
213 /*
214  * Red black tree functions
215  *
216  * The caller must hold the related map lock.
217  */
218 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
219 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
220
221 /* a->start is the address; it is the only field that has to be initialized */
222 static int
223 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
224 {
225         if (a->start < b->start)
226                 return(-1);
227         else if (a->start > b->start)
228                 return(1);
229         return(0);
230 }
231
232 /*
233  * Allocate a vmspace structure, including a vm_map and pmap.
234  * Initialize numerous fields.  While the initial allocation is zeroed,
235  * subsequent reuse from the objcache leaves elements of the structure
236  * intact (particularly the pmap), so portions must be zeroed.
237  *
238  * The structure is not considered activated until we call sysref_activate().
239  *
240  * No requirements.
241  */
242 struct vmspace *
243 vmspace_alloc(vm_offset_t min, vm_offset_t max)
244 {
245         struct vmspace *vm;
246
247         lwkt_gettoken(&vmspace_token);
248         vm = sysref_alloc(&vmspace_sysref_class);
249         bzero(&vm->vm_startcopy,
250               (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
251         vm_map_init(&vm->vm_map, min, max, NULL);
252         pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
253         vm->vm_map.pmap = vmspace_pmap(vm);             /* XXX */
254         vm->vm_shm = NULL;
255         vm->vm_exitingcnt = 0;
256         cpu_vmspace_alloc(vm);
257         sysref_activate(&vm->vm_sysref);
258         lwkt_reltoken(&vmspace_token);
259
260         return (vm);
261 }
262
263 /*
264  * dtor function - Some elements of the pmap are retained in the
265  * free-cached vmspaces to improve performance.  We have to clean them up
266  * here before returning the vmspace to the memory pool.
267  *
268  * No requirements.
269  */
270 static void
271 vmspace_dtor(void *obj, void *private)
272 {
273         struct vmspace *vm = obj;
274
275         pmap_puninit(vmspace_pmap(vm));
276 }
277
278 /*
279  * Called in two cases: 
280  *
281  * (1) When the last sysref is dropped, but exitingcnt might still be
282  *     non-zero.
283  *
284  * (2) When there are no sysrefs (i.e. refcnt is negative) left and the
285  *     exitingcnt becomes zero
286  *
287  * sysref will not scrap the object until we call sysref_put() once more
288  * after the last ref has been dropped.
289  *
290  * Interlocked by the sysref API.
291  */
292 static void
293 vmspace_terminate(struct vmspace *vm)
294 {
295         int count;
296
297         /*
298          * If exitingcnt is non-zero we can't get rid of the entire vmspace
299          * yet, but we can scrap user memory.
300          */
301         lwkt_gettoken(&vmspace_token);
302         if (vm->vm_exitingcnt) {
303                 shmexit(vm);
304                 pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
305                                   VM_MAX_USER_ADDRESS);
306                 vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
307                               VM_MAX_USER_ADDRESS);
308                 lwkt_reltoken(&vmspace_token);
309                 return;
310         }
311         cpu_vmspace_free(vm);
312
313         /*
314          * Make sure any SysV shm is freed; it might not have been
315          * freed in exit1().
316          */
317         shmexit(vm);
318
319         KKASSERT(vm->vm_upcalls == NULL);
320
321         /*
322          * Lock the map, to wait out all other references to it.
323          * Delete all of the mappings and pages they hold, then call
324          * the pmap module to reclaim anything left.
325          */
326         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
327         vm_map_lock(&vm->vm_map);
328         vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
329                 vm->vm_map.max_offset, &count);
330         vm_map_unlock(&vm->vm_map);
331         vm_map_entry_release(count);
332
333         pmap_release(vmspace_pmap(vm));
334         sysref_put(&vm->vm_sysref);
335         lwkt_reltoken(&vmspace_token);
336 }
337
338 /*
339  * vmspaces are not currently locked.
340  */
341 static void
342 vmspace_lock(struct vmspace *vm __unused)
343 {
344 }
345
346 static void
347 vmspace_unlock(struct vmspace *vm __unused)
348 {
349 }
350
351 /*
352  * This is called during exit indicating that the vmspace is no
353  * longer in use by an exiting process, but the process has not yet
354  * been cleaned up.
355  *
356  * No requirements.
357  */
358 void
359 vmspace_exitbump(struct vmspace *vm)
360 {
361         lwkt_gettoken(&vmspace_token);
362         ++vm->vm_exitingcnt;
363         lwkt_reltoken(&vmspace_token);
364 }
365
366 /*
367  * This is called in the wait*() handling code.  The vmspace can be terminated
368  * after the last wait is finished using it.
369  *
370  * No requirements.
371  */
372 void
373 vmspace_exitfree(struct proc *p)
374 {
375         struct vmspace *vm;
376
377         lwkt_gettoken(&vmspace_token);
378         vm = p->p_vmspace;
379         p->p_vmspace = NULL;
380
381         if (--vm->vm_exitingcnt == 0 && sysref_isinactive(&vm->vm_sysref))
382                 vmspace_terminate(vm);
383         lwkt_reltoken(&vmspace_token);
384 }
385
386 /*
387  * Swap usage is determined by taking the proportional swap used by
388  * VM objects backing the VM map.  To make up for fractional losses,
389  * if the VM object has any swap use at all the associated map entries
390  * count for at least 1 swap page.
391  *
392  * No requirements.
393  */
394 int
395 vmspace_swap_count(struct vmspace *vmspace)
396 {
397         vm_map_t map = &vmspace->vm_map;
398         vm_map_entry_t cur;
399         vm_object_t object;
400         int count = 0;
401         int n;
402
403         lwkt_gettoken(&vmspace_token);
404         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
405                 switch(cur->maptype) {
406                 case VM_MAPTYPE_NORMAL:
407                 case VM_MAPTYPE_VPAGETABLE:
408                         if ((object = cur->object.vm_object) == NULL)
409                                 break;
410                         if (object->swblock_count) {
411                                 n = (cur->end - cur->start) / PAGE_SIZE;
412                                 count += object->swblock_count *
413                                     SWAP_META_PAGES * n / object->size + 1;
414                         }
415                         break;
416                 default:
417                         break;
418                 }
419         }
420         lwkt_reltoken(&vmspace_token);
421         return(count);
422 }
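/*
 * Worked example (editorial illustration, not part of the original
 * source): if a 1024-page object has two populated swap blocks, each
 * covering SWAP_META_PAGES pages, and a map entry spans 256 of the
 * object's pages (n = 256), the loop above charges the entry
 *
 *      2 * SWAP_META_PAGES * 256 / 1024 + 1
 *
 * swap pages.  The trailing "+ 1" guarantees that any object with any
 * swap use at all contributes at least one page to the count.
 */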
423
424 /*
425  * Calculate the approximate number of anonymous pages in use by
426  * this vmspace.  To make up for fractional losses, we count each
427  * VM object as having at least 1 anonymous page.
428  *
429  * No requirements.
430  */
431 int
432 vmspace_anonymous_count(struct vmspace *vmspace)
433 {
434         vm_map_t map = &vmspace->vm_map;
435         vm_map_entry_t cur;
436         vm_object_t object;
437         int count = 0;
438
439         lwkt_gettoken(&vmspace_token);
440         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
441                 switch(cur->maptype) {
442                 case VM_MAPTYPE_NORMAL:
443                 case VM_MAPTYPE_VPAGETABLE:
444                         if ((object = cur->object.vm_object) == NULL)
445                                 break;
446                         if (object->type != OBJT_DEFAULT &&
447                             object->type != OBJT_SWAP) {
448                                 break;
449                         }
450                         count += object->resident_page_count;
451                         break;
452                 default:
453                         break;
454                 }
455         }
456         lwkt_reltoken(&vmspace_token);
457         return(count);
458 }
459
460 /*
461  * Creates and returns a new empty VM map with the given physical map
462  * structure, and having the given lower and upper address bounds.
463  *
464  * No requirements.
465  */
466 vm_map_t
467 vm_map_create(vm_map_t result, pmap_t pmap, vm_offset_t min, vm_offset_t max)
468 {
469         if (result == NULL)
470                 result = zalloc(mapzone);
471         vm_map_init(result, min, max, pmap);
472         return (result);
473 }
474
475 /*
476  * Initialize an existing vm_map structure such as that in the vmspace
477  * structure.  The pmap is initialized elsewhere.
478  *
479  * No requirements.
480  */
481 void
482 vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
483 {
484         map->header.next = map->header.prev = &map->header;
485         RB_INIT(&map->rb_root);
486         map->nentries = 0;
487         map->size = 0;
488         map->system_map = 0;
489         map->infork = 0;
490         map->min_offset = min;
491         map->max_offset = max;
492         map->pmap = pmap;
493         map->first_free = &map->header;
494         map->hint = &map->header;
495         map->timestamp = 0;
496         map->flags = 0;
497         lockinit(&map->lock, "thrd_sleep", 0, 0);
498 }
499
500 /*
501  * Shadow the vm_map_entry's object.  This typically needs to be done when
502  * a write fault is taken on an entry which had previously been cloned by
503  * fork().  The shared object (which might be NULL) must become private so
504  * we add a shadow layer above it.
505  *
506  * Object allocation for anonymous mappings is deferred as long as possible.
507  * When creating a shadow, however, the underlying object must be instantiated
508  * so it can be shared.
509  *
510  * If the map segment is governed by a virtual page table then it is
511  * possible to address offsets beyond the mapped area.  Just allocate
512  * a maximally sized object for this case.
513  *
514  * The vm_map must be exclusively locked.
515  * No other requirements.
516  */
517 static
518 void
519 vm_map_entry_shadow(vm_map_entry_t entry)
520 {
521         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
522                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
523                                  0x7FFFFFFF);   /* XXX */
524         } else {
525                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
526                                  atop(entry->end - entry->start));
527         }
528         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
529 }
530
531 /*
532  * Allocate an object for a vm_map_entry.
533  *
534  * Object allocation for anonymous mappings is deferred as long as possible.
535  * This function is called when we can defer no longer, generally when a map
536  * entry might be split or forked or takes a page fault.
537  *
538  * If the map segment is governed by a virtual page table then it is
539  * possible to address offsets beyond the mapped area.  Just allocate
540  * a maximally sized object for this case.
541  *
542  * The vm_map must be exclusively locked.
543  * No other requirements.
544  */
545 void 
546 vm_map_entry_allocate_object(vm_map_entry_t entry)
547 {
548         vm_object_t obj;
549
550         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
551                 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
552         } else {
553                 obj = vm_object_allocate(OBJT_DEFAULT,
554                                          atop(entry->end - entry->start));
555         }
556         entry->object.vm_object = obj;
557         entry->offset = 0;
558 }
559
560 /*
561  * Set an initial negative count so the first attempt to reserve
562  * space preloads a bunch of vm_map_entry's for this cpu.  Also
563  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
564  * map a new page for vm_map_entry structures.  SMP systems are
565  * particularly sensitive.
566  *
567  * This routine is called in early boot so we cannot just call
568  * vm_map_entry_reserve().
569  *
570  * Called from the low level boot code only (for each cpu)
571  */
572 void
573 vm_map_entry_reserve_cpu_init(globaldata_t gd)
574 {
575         vm_map_entry_t entry;
576         int i;
577
578         gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2;
579         entry = &cpu_map_entry_init[gd->gd_cpuid][0];
580         for (i = 0; i < VMEPERCPU; ++i, ++entry) {
581                 entry->next = gd->gd_vme_base;
582                 gd->gd_vme_base = entry;
583         }
584 }
585
586 /*
587  * Reserves vm_map_entry structures so code later on can manipulate
588  * map_entry structures within a locked map without blocking on the
589  * allocation of a new vm_map_entry.
590  *
591  * No requirements.
592  */
593 int
594 vm_map_entry_reserve(int count)
595 {
596         struct globaldata *gd = mycpu;
597         vm_map_entry_t entry;
598
599         /*
600          * Make sure we have enough structures in gd_vme_base to handle
601          * the reservation request.
602          */
603         crit_enter();
604         while (gd->gd_vme_avail < count) {
605                 entry = zalloc(mapentzone);
606                 entry->next = gd->gd_vme_base;
607                 gd->gd_vme_base = entry;
608                 ++gd->gd_vme_avail;
609         }
610         gd->gd_vme_avail -= count;
611         crit_exit();
612
613         return(count);
614 }
615
616 /*
617  * Releases previously reserved vm_map_entry structures that were not
618  * used.  If we have too much junk in our per-cpu cache clean some of
619  * it out.
620  *
621  * No requirements.
622  */
623 void
624 vm_map_entry_release(int count)
625 {
626         struct globaldata *gd = mycpu;
627         vm_map_entry_t entry;
628
629         crit_enter();
630         gd->gd_vme_avail += count;
631         while (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
632                 entry = gd->gd_vme_base;
633                 KKASSERT(entry != NULL);
634                 gd->gd_vme_base = entry->next;
635                 --gd->gd_vme_avail;
636                 crit_exit();
637                 zfree(mapentzone, entry);
638                 crit_enter();
639         }
640         crit_exit();
641 }
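/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the usual pattern for callers of the reservation API.
 * Entries are reserved before the map is locked so that manipulating
 * the locked map never blocks in zalloc(), and the unused remainder
 * is returned afterwards.  The wrapper function itself is
 * hypothetical.
 */
#if 0
static void
example_remove_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        int count;

        count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
        vm_map_lock(map);
        vm_map_delete(map, start, end, &count);
        vm_map_unlock(map);
        vm_map_entry_release(count);
}
#endif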
642
643 /*
644  * Reserve map entry structures for use in kernel_map itself.  These
645  * entries have *ALREADY* been reserved on a per-cpu basis when the map
646  * was initialized.  This function is used by zalloc() to avoid a recursion
647  * when zalloc() itself needs to allocate additional kernel memory.
648  *
649  * This function works like the normal reserve but does not load the
650  * vm_map_entry cache (because that would result in an infinite
651  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
652  *
653  * Any caller of this function must be sure to renormalize after
654  * potentially eating entries to ensure that the reserve supply
655  * remains intact.
656  *
657  * No requirements.
658  */
659 int
660 vm_map_entry_kreserve(int count)
661 {
662         struct globaldata *gd = mycpu;
663
664         crit_enter();
665         gd->gd_vme_avail -= count;
666         crit_exit();
667         KASSERT(gd->gd_vme_base != NULL,
668                 ("no reserved entries left, gd_vme_avail = %d\n",
669                 gd->gd_vme_avail));
670         return(count);
671 }
672
673 /*
674  * Release previously reserved map entries for kernel_map.  We do not
675  * attempt to clean up like the normal release function as this would
676  * cause an unnecessary (but probably not fatal) deep procedure call.
677  *
678  * No requirements.
679  */
680 void
681 vm_map_entry_krelease(int count)
682 {
683         struct globaldata *gd = mycpu;
684
685         crit_enter();
686         gd->gd_vme_avail += count;
687         crit_exit();
688 }
689
690 /*
691  * Allocates a VM map entry for insertion.  No entry fields are filled in.
692  *
693  * The entries should have previously been reserved.  The reservation count
694  * is tracked in (*countp).
695  *
696  * No requirements.
697  */
698 static vm_map_entry_t
699 vm_map_entry_create(vm_map_t map, int *countp)
700 {
701         struct globaldata *gd = mycpu;
702         vm_map_entry_t entry;
703
704         KKASSERT(*countp > 0);
705         --*countp;
706         crit_enter();
707         entry = gd->gd_vme_base;
708         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
709         gd->gd_vme_base = entry->next;
710         crit_exit();
711
712         return(entry);
713 }
714
715 /*
716  * Dispose of a vm_map_entry that is no longer being referenced.
717  *
718  * No requirements.
719  */
720 static void
721 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
722 {
723         struct globaldata *gd = mycpu;
724
725         KKASSERT(map->hint != entry);
726         KKASSERT(map->first_free != entry);
727
728         ++*countp;
729         crit_enter();
730         entry->next = gd->gd_vme_base;
731         gd->gd_vme_base = entry;
732         crit_exit();
733 }
734
735
736 /*
737  * Insert/remove entries from maps.
738  *
739  * The related map must be exclusively locked.
740  * No other requirements.
741  *
742  * NOTE! We currently acquire the vmspace_token only to avoid races
743  *       against the pageout daemon's calls to vmspace_*_count(), which
744  *       are unable to safely lock the vm_map without potentially
745  *       deadlocking.
746  */
747 static __inline void
748 vm_map_entry_link(vm_map_t map,
749                   vm_map_entry_t after_where,
750                   vm_map_entry_t entry)
751 {
752         ASSERT_VM_MAP_LOCKED(map);
753
754         lwkt_gettoken(&vmspace_token);
755         map->nentries++;
756         entry->prev = after_where;
757         entry->next = after_where->next;
758         entry->next->prev = entry;
759         after_where->next = entry;
760         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
761                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
762         lwkt_reltoken(&vmspace_token);
763 }
764
765 static __inline void
766 vm_map_entry_unlink(vm_map_t map,
767                     vm_map_entry_t entry)
768 {
769         vm_map_entry_t prev;
770         vm_map_entry_t next;
771
772         ASSERT_VM_MAP_LOCKED(map);
773
774         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
775                 panic("vm_map_entry_unlink: attempt to mess with "
776                       "locked entry! %p", entry);
777         }
778         lwkt_gettoken(&vmspace_token);
779         prev = entry->prev;
780         next = entry->next;
781         next->prev = prev;
782         prev->next = next;
783         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
784         map->nentries--;
785         lwkt_reltoken(&vmspace_token);
786 }
787
788 /*
789  * Finds the map entry containing (or immediately preceding) the specified
790  * address in the given map.  The entry is returned in (*entry).
791  *
792  * The boolean result indicates whether the address is actually contained
793  * in the map.
794  *
795  * The related map must be locked.
796  * No other requirements.
797  */
798 boolean_t
799 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
800 {
801         vm_map_entry_t tmp;
802         vm_map_entry_t last;
803
804         ASSERT_VM_MAP_LOCKED(map);
805 #if 0
806         /*
807          * XXX TEMPORARILY DISABLED.  For some reason our attempt to revive
808          * the hint code with the red-black lookup meets with system crashes
809          * and lockups.  We do not yet know why.
810          *
811          * It is possible that the problem is related to the setting
812          * of the hint during map_entry deletion, in the code specified
813          * at the GGG comment later on in this file.
814          */
815         /*
816          * Quickly check the cached hint, there's a good chance of a match.
817          */
818         if (map->hint != &map->header) {
819                 tmp = map->hint;
820                 if (address >= tmp->start && address < tmp->end) {
821                         *entry = tmp;
822                         return(TRUE);
823                 }
824         }
825 #endif
826
827         /*
828          * Locate the record from the top of the tree.  'last' tracks the
829          * closest prior record and is returned if no match is found, which
830          * in binary tree terms means tracking the most recent right-branch
831          * taken.  If there is no prior record, &map->header is returned.
832          */
833         last = &map->header;
834         tmp = RB_ROOT(&map->rb_root);
835
836         while (tmp) {
837                 if (address >= tmp->start) {
838                         if (address < tmp->end) {
839                                 *entry = tmp;
840                                 map->hint = tmp;
841                                 return(TRUE);
842                         }
843                         last = tmp;
844                         tmp = RB_RIGHT(tmp, rb_entry);
845                 } else {
846                         tmp = RB_LEFT(tmp, rb_entry);
847                 }
848         }
849         *entry = last;
850         return (FALSE);
851 }
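/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): how callers typically interpret vm_map_lookup_entry().
 * On TRUE the returned entry contains the address; on FALSE it is the
 * closest preceding entry (possibly &map->header), which is exactly
 * what insertion code wants as the "previous" link.  The map must be
 * locked by the caller; the wrapper itself is hypothetical.
 */
#if 0
static vm_map_entry_t
example_prev_for_insert(vm_map_t map, vm_offset_t addr)
{
        vm_map_entry_t entry;

        if (vm_map_lookup_entry(map, addr, &entry)) {
                /* addr is already mapped by 'entry'; nothing to insert */
                return (NULL);
        }
        /* 'entry' is the closest preceding entry (or &map->header) */
        return (entry);
}
#endif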
852
853 /*
854  * Inserts the given whole VM object into the target map at the specified
855  * address range.  The object's size should match that of the address range.
856  *
857  * The map must be exclusively locked.
858  * The caller must have reserved sufficient vm_map_entry structures.
859  *
860  * If object is non-NULL, ref count must be bumped by caller
861  * prior to making call to account for the new entry.
862  */
863 int
864 vm_map_insert(vm_map_t map, int *countp,
865               vm_object_t object, vm_ooffset_t offset,
866               vm_offset_t start, vm_offset_t end,
867               vm_maptype_t maptype,
868               vm_prot_t prot, vm_prot_t max,
869               int cow)
870 {
871         vm_map_entry_t new_entry;
872         vm_map_entry_t prev_entry;
873         vm_map_entry_t temp_entry;
874         vm_eflags_t protoeflags;
875
876         ASSERT_VM_MAP_LOCKED(map);
877
878         /*
879          * Check that the start and end points are not bogus.
880          */
881         if ((start < map->min_offset) || (end > map->max_offset) ||
882             (start >= end))
883                 return (KERN_INVALID_ADDRESS);
884
885         /*
886          * Find the entry prior to the proposed starting address; if it's part
887          * of an existing entry, this range is bogus.
888          */
889         if (vm_map_lookup_entry(map, start, &temp_entry))
890                 return (KERN_NO_SPACE);
891
892         prev_entry = temp_entry;
893
894         /*
895          * Assert that the next entry doesn't overlap the end point.
896          */
897
898         if ((prev_entry->next != &map->header) &&
899             (prev_entry->next->start < end))
900                 return (KERN_NO_SPACE);
901
902         protoeflags = 0;
903
904         if (cow & MAP_COPY_ON_WRITE)
905                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
906
907         if (cow & MAP_NOFAULT) {
908                 protoeflags |= MAP_ENTRY_NOFAULT;
909
910                 KASSERT(object == NULL,
911                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
912         }
913         if (cow & MAP_DISABLE_SYNCER)
914                 protoeflags |= MAP_ENTRY_NOSYNC;
915         if (cow & MAP_DISABLE_COREDUMP)
916                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
917         if (cow & MAP_IS_STACK)
918                 protoeflags |= MAP_ENTRY_STACK;
919         if (cow & MAP_IS_KSTACK)
920                 protoeflags |= MAP_ENTRY_KSTACK;
921
922         lwkt_gettoken(&vm_token);
923         lwkt_gettoken(&vmobj_token);
924
925         if (object) {
926                 /*
927                  * When object is non-NULL, it could be shared with another
928                  * process.  We have to set or clear OBJ_ONEMAPPING 
929                  * appropriately.
930                  */
931                 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
932                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
933                 }
934         }
935         else if ((prev_entry != &map->header) &&
936                  (prev_entry->eflags == protoeflags) &&
937                  (prev_entry->end == start) &&
938                  (prev_entry->wired_count == 0) &&
939                  prev_entry->maptype == maptype &&
940                  ((prev_entry->object.vm_object == NULL) ||
941                   vm_object_coalesce(prev_entry->object.vm_object,
942                                      OFF_TO_IDX(prev_entry->offset),
943                                      (vm_size_t)(prev_entry->end - prev_entry->start),
944                                      (vm_size_t)(end - prev_entry->end)))) {
945                 /*
946                  * We were able to extend the object.  Determine if we
947                  * can extend the previous map entry to include the 
948                  * new range as well.
949                  */
950                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
951                     (prev_entry->protection == prot) &&
952                     (prev_entry->max_protection == max)) {
953                         lwkt_reltoken(&vmobj_token);
954                         lwkt_reltoken(&vm_token);
955                         map->size += (end - prev_entry->end);
956                         prev_entry->end = end;
957                         vm_map_simplify_entry(map, prev_entry, countp);
958                         return (KERN_SUCCESS);
959                 }
960
961                 /*
962                  * If we can extend the object but cannot extend the
963                  * map entry, we have to create a new map entry.  We
964                  * must bump the ref count on the extended object to
965                  * account for it.  object may be NULL.
966                  */
967                 object = prev_entry->object.vm_object;
968                 offset = prev_entry->offset +
969                         (prev_entry->end - prev_entry->start);
970                 vm_object_reference_locked(object);
971         }
972
973         lwkt_reltoken(&vmobj_token);
974         lwkt_reltoken(&vm_token);
975
976         /*
977          * NOTE: if conditionals fail, object can be NULL here.  This occurs
978          * in things like the buffer map where we manage kva but do not manage
979          * backing objects.
980          */
981
982         /*
983          * Create a new entry
984          */
985
986         new_entry = vm_map_entry_create(map, countp);
987         new_entry->start = start;
988         new_entry->end = end;
989
990         new_entry->maptype = maptype;
991         new_entry->eflags = protoeflags;
992         new_entry->object.vm_object = object;
993         new_entry->offset = offset;
994         new_entry->aux.master_pde = 0;
995
996         new_entry->inheritance = VM_INHERIT_DEFAULT;
997         new_entry->protection = prot;
998         new_entry->max_protection = max;
999         new_entry->wired_count = 0;
1000
1001         /*
1002          * Insert the new entry into the list
1003          */
1004
1005         vm_map_entry_link(map, prev_entry, new_entry);
1006         map->size += new_entry->end - new_entry->start;
1007
1008         /*
1009          * Update the free space hint.  Entries cannot overlap.
1010          * An exact comparison is needed to avoid matching
1011          * against the map->header.
1012          */
1013         if ((map->first_free == prev_entry) &&
1014             (prev_entry->end == new_entry->start)) {
1015                 map->first_free = new_entry;
1016         }
1017
1018 #if 0
1019         /*
1020          * Temporarily removed to avoid MAP_STACK panic, due to
1021          * MAP_STACK being a huge hack.  Will be added back in
1022          * when MAP_STACK (and the user stack mapping) is fixed.
1023          */
1024         /*
1025          * It may be possible to simplify the entry
1026          */
1027         vm_map_simplify_entry(map, new_entry, countp);
1028 #endif
1029
1030         /*
1031          * Try to pre-populate the page table.  Mappings governed by virtual
1032          * page tables cannot be prepopulated without a lot of work, so
1033          * don't try.
1034          */
1035         if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1036             maptype != VM_MAPTYPE_VPAGETABLE) {
1037                 pmap_object_init_pt(map->pmap, start, prot,
1038                                     object, OFF_TO_IDX(offset), end - start,
1039                                     cow & MAP_PREFAULT_PARTIAL);
1040         }
1041
1042         return (KERN_SUCCESS);
1043 }
1044
1045 /*
1046  * Find sufficient space for `length' bytes in the given map, starting at
1047  * `start'.  Returns 0 on success, 1 on no space.
1048  *
1049  * This function will return an arbitrarily aligned pointer.  If no
1050  * particular alignment is required you should pass align as 1.  Note that
1051  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1052  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1053  * argument.
1054  *
1055  * 'align' should be a power of 2 but is not required to be.
1056  *
1057  * The map must be exclusively locked.
1058  * No other requirements.
1059  */
1060 int
1061 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1062                  vm_size_t align, int flags, vm_offset_t *addr)
1063 {
1064         vm_map_entry_t entry, next;
1065         vm_offset_t end;
1066         vm_offset_t align_mask;
1067
1068         if (start < map->min_offset)
1069                 start = map->min_offset;
1070         if (start > map->max_offset)
1071                 return (1);
1072
1073         /*
1074          * If the alignment is not a power of 2 we will have to use
1075          * a mod/division, set align_mask to a special value.
1076          */
1077         if ((align | (align - 1)) + 1 != (align << 1))
1078                 align_mask = (vm_offset_t)-1;
1079         else
1080                 align_mask = align - 1;
1081
1082         /*
1083          * Look for the first possible address; if there's already something
1084          * at this address, we have to start after it.
1085          */
1086         if (start == map->min_offset) {
1087                 if ((entry = map->first_free) != &map->header)
1088                         start = entry->end;
1089         } else {
1090                 vm_map_entry_t tmp;
1091
1092                 if (vm_map_lookup_entry(map, start, &tmp))
1093                         start = tmp->end;
1094                 entry = tmp;
1095         }
1096
1097         /*
1098          * Look through the rest of the map, trying to fit a new region in the
1099          * gap between existing regions, or after the very last region.
1100          */
1101         for (;; start = (entry = next)->end) {
1102                 /*
1103                  * Adjust the proposed start by the requested alignment
1104                  * and be sure that we didn't wrap the address.
1105                  */
1106                 if (align_mask == (vm_offset_t)-1)
1107                         end = ((start + align - 1) / align) * align;
1108                 else
1109                         end = (start + align_mask) & ~align_mask;
1110                 if (end < start)
1111                         return (1);
1112                 start = end;
1113                 /*
1114                  * Find the end of the proposed new region.  Be sure we didn't
1115                  * go beyond the end of the map, or wrap around the address.
1116                  * Then check to see if this is the last entry or if the 
1117                  * proposed end fits in the gap between this and the next
1118                  * entry.
1119                  */
1120                 end = start + length;
1121                 if (end > map->max_offset || end < start)
1122                         return (1);
1123                 next = entry->next;
1124
1125                 /*
1126                  * If the next entry's start address is beyond the desired
1127                  * end address we may have found a good entry.
1128                  *
1129                  * If the next entry is a stack mapping we do not map into
1130                  * the stack's reserved space.
1131                  *
1132                  * XXX continue to allow mapping into the stack's reserved
1133                  * space if doing a MAP_STACK mapping inside a MAP_STACK
1134                  * mapping, for backwards compatibility.  But the caller
1135                  * really should use MAP_STACK | MAP_TRYFIXED if they
1136                  * want to do that.
1137                  */
1138                 if (next == &map->header)
1139                         break;
1140                 if (next->start >= end) {
1141                         if ((next->eflags & MAP_ENTRY_STACK) == 0)
1142                                 break;
1143                         if (flags & MAP_STACK)
1144                                 break;
1145                         if (next->start - next->aux.avail_ssize >= end)
1146                                 break;
1147                 }
1148         }
1149         map->hint = entry;
1150
1151         /*
1152          * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1153          * if it fails.  The kernel_map is locked and nothing can steal
1154          * our address space if pmap_growkernel() blocks.
1155          *
1156          * NOTE: This may be unconditionally called for kldload areas on
1157          *       x86_64 because these do not bump kernel_vm_end (which would
1158          *       fill 128G worth of page tables!).  Therefore we must not
1159          *       retry.
1160          */
1161         if (map == &kernel_map) {
1162                 vm_offset_t kstop;
1163
1164                 kstop = round_page(start + length);
1165                 if (kstop > kernel_vm_end)
1166                         pmap_growkernel(start, kstop);
1167         }
1168         *addr = start;
1169         return (0);
1170 }
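/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the alignment rounding performed inside the loop above,
 * pulled out as a stand-alone helper.  A power-of-2 alignment is
 * handled with a mask; any other alignment falls back to a divide.
 * The helper name is hypothetical.
 */
#if 0
static vm_offset_t
example_align_up(vm_offset_t start, vm_size_t align)
{
        if (((align | (align - 1)) + 1) == (align << 1)) {
                /* power of 2: cheap mask rounding */
                return ((start + align - 1) & ~(vm_offset_t)(align - 1));
        }
        /* arbitrary alignment: divide/multiply rounding */
        return (((start + align - 1) / align) * align);
}
#endif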
1171
1172 /*
1173  * vm_map_find finds an unallocated region in the target address map with
1174  * the given length.  The search is defined to be first-fit from the
1175  * specified address; the region found is returned in the same parameter.
1176  *
1177  * If object is non-NULL, ref count must be bumped by caller
1178  * prior to making call to account for the new entry.
1179  *
1180  * No requirements.  This function will lock the map temporarily.
1181  */
1182 int
1183 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1184             vm_offset_t *addr,  vm_size_t length, vm_size_t align,
1185             boolean_t fitit,
1186             vm_maptype_t maptype,
1187             vm_prot_t prot, vm_prot_t max,
1188             int cow)
1189 {
1190         vm_offset_t start;
1191         int result;
1192         int count;
1193
1194         start = *addr;
1195
1196         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1197         vm_map_lock(map);
1198         if (fitit) {
1199                 if (vm_map_findspace(map, start, length, align, 0, addr)) {
1200                         vm_map_unlock(map);
1201                         vm_map_entry_release(count);
1202                         return (KERN_NO_SPACE);
1203                 }
1204                 start = *addr;
1205         }
1206         result = vm_map_insert(map, &count, object, offset,
1207                                start, start + length,
1208                                maptype,
1209                                prot, max,
1210                                cow);
1211         vm_map_unlock(map);
1212         vm_map_entry_release(count);
1213
1214         return (result);
1215 }
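/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): a typical vm_map_find() call requesting a first-fit,
 * page-aligned, anonymous region.  Passing a NULL object defers
 * object allocation; a non-NULL object would need its reference
 * count bumped by the caller first.  The wrapper itself is
 * hypothetical.
 */
#if 0
static int
example_alloc_region(vm_map_t map, vm_size_t size, vm_offset_t *addrp)
{
        *addrp = vm_map_min(map);
        return (vm_map_find(map, NULL, 0,
                            addrp, round_page(size), PAGE_SIZE,
                            TRUE,               /* fitit: first-fit search */
                            VM_MAPTYPE_NORMAL,
                            VM_PROT_ALL, VM_PROT_ALL,
                            0));
}
#endif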
1216
1217 /*
1218  * Simplify the given map entry by merging with either neighbor.  This
1219  * routine also has the ability to merge with both neighbors.
1220  *
1221  * This routine guarantees that the passed entry remains valid (though
1222  * possibly extended).  When merging, this routine may delete one or
1223  * both neighbors.  No action is taken on entries which have their
1224  * in-transition flag set.
1225  *
1226  * The map must be exclusively locked.
1227  */
1228 void
1229 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1230 {
1231         vm_map_entry_t next, prev;
1232         vm_size_t prevsize, esize;
1233
1234         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1235                 ++mycpu->gd_cnt.v_intrans_coll;
1236                 return;
1237         }
1238
1239         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1240                 return;
1241
1242         prev = entry->prev;
1243         if (prev != &map->header) {
1244                 prevsize = prev->end - prev->start;
1245                 if ( (prev->end == entry->start) &&
1246                      (prev->maptype == entry->maptype) &&
1247                      (prev->object.vm_object == entry->object.vm_object) &&
1248                      (!prev->object.vm_object ||
1249                         (prev->offset + prevsize == entry->offset)) &&
1250                      (prev->eflags == entry->eflags) &&
1251                      (prev->protection == entry->protection) &&
1252                      (prev->max_protection == entry->max_protection) &&
1253                      (prev->inheritance == entry->inheritance) &&
1254                      (prev->wired_count == entry->wired_count)) {
1255                         if (map->first_free == prev)
1256                                 map->first_free = entry;
1257                         if (map->hint == prev)
1258                                 map->hint = entry;
1259                         vm_map_entry_unlink(map, prev);
1260                         entry->start = prev->start;
1261                         entry->offset = prev->offset;
1262                         if (prev->object.vm_object)
1263                                 vm_object_deallocate(prev->object.vm_object);
1264                         vm_map_entry_dispose(map, prev, countp);
1265                 }
1266         }
1267
1268         next = entry->next;
1269         if (next != &map->header) {
1270                 esize = entry->end - entry->start;
1271                 if ((entry->end == next->start) &&
1272                     (next->maptype == entry->maptype) &&
1273                     (next->object.vm_object == entry->object.vm_object) &&
1274                      (!entry->object.vm_object ||
1275                         (entry->offset + esize == next->offset)) &&
1276                     (next->eflags == entry->eflags) &&
1277                     (next->protection == entry->protection) &&
1278                     (next->max_protection == entry->max_protection) &&
1279                     (next->inheritance == entry->inheritance) &&
1280                     (next->wired_count == entry->wired_count)) {
1281                         if (map->first_free == next)
1282                                 map->first_free = entry;
1283                         if (map->hint == next)
1284                                 map->hint = entry;
1285                         vm_map_entry_unlink(map, next);
1286                         entry->end = next->end;
1287                         if (next->object.vm_object)
1288                                 vm_object_deallocate(next->object.vm_object);
1289                         vm_map_entry_dispose(map, next, countp);
1290                 }
1291         }
1292 }
1293
1294 /*
1295  * Asserts that the given entry begins at or after the specified address.
1296  * If necessary, it splits the entry into two.
1297  */
1298 #define vm_map_clip_start(map, entry, startaddr, countp)                \
1299 {                                                                       \
1300         if (startaddr > entry->start)                                   \
1301                 _vm_map_clip_start(map, entry, startaddr, countp);      \
1302 }
1303
1304 /*
1305  * This routine is called only when it is known that the entry must be split.
1306  *
1307  * The map must be exclusively locked.
1308  */
1309 static void
1310 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1311                    int *countp)
1312 {
1313         vm_map_entry_t new_entry;
1314
1315         /*
1316          * Split off the front portion -- note that we must insert the new
1317          * entry BEFORE this one, so that this entry has the specified
1318          * starting address.
1319          */
1320
1321         vm_map_simplify_entry(map, entry, countp);
1322
1323         /*
1324          * If there is no object backing this entry, we might as well create
1325          * one now.  If we defer it, an object can get created after the map
1326          * is clipped, and individual objects will be created for the split-up
1327          * map.  This is a bit of a hack, but is also about the best place to
1328          * put this improvement.
1329          */
1330         if (entry->object.vm_object == NULL && !map->system_map) {
1331                 vm_map_entry_allocate_object(entry);
1332         }
1333
1334         new_entry = vm_map_entry_create(map, countp);
1335         *new_entry = *entry;
1336
1337         new_entry->end = start;
1338         entry->offset += (start - entry->start);
1339         entry->start = start;
1340
1341         vm_map_entry_link(map, entry->prev, new_entry);
1342
1343         switch(entry->maptype) {
1344         case VM_MAPTYPE_NORMAL:
1345         case VM_MAPTYPE_VPAGETABLE:
1346                 vm_object_reference(new_entry->object.vm_object);
1347                 break;
1348         default:
1349                 break;
1350         }
1351 }
1352
1353 /*
1354  * Asserts that the given entry ends at or before the specified address.
1355  * If necessary, it splits the entry into two.
1356  *
1357  * The map must be exclusively locked.
1358  */
1359 #define vm_map_clip_end(map, entry, endaddr, countp)            \
1360 {                                                               \
1361         if (endaddr < entry->end)                               \
1362                 _vm_map_clip_end(map, entry, endaddr, countp);  \
1363 }
1364
1365 /*
1366  * This routine is called only when it is known that the entry must be split.
1367  *
1368  * The map must be exclusively locked.
1369  */
1370 static void
1371 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1372                  int *countp)
1373 {
1374         vm_map_entry_t new_entry;
1375
1376         /*
1377          * If there is no object backing this entry, we might as well create
1378          * one now.  If we defer it, an object can get created after the map
1379          * is clipped, and individual objects will be created for the split-up
1380          * map.  This is a bit of a hack, but is also about the best place to
1381          * put this improvement.
1382          */
1383
1384         if (entry->object.vm_object == NULL && !map->system_map) {
1385                 vm_map_entry_allocate_object(entry);
1386         }
1387
1388         /*
1389          * Create a new entry and insert it AFTER the specified entry
1390          */
1391
1392         new_entry = vm_map_entry_create(map, countp);
1393         *new_entry = *entry;
1394
1395         new_entry->start = entry->end = end;
1396         new_entry->offset += (end - entry->start);
1397
1398         vm_map_entry_link(map, entry, new_entry);
1399
1400         switch(entry->maptype) {
1401         case VM_MAPTYPE_NORMAL:
1402         case VM_MAPTYPE_VPAGETABLE:
1403                 vm_object_reference(new_entry->object.vm_object);
1404                 break;
1405         default:
1406                 break;
1407         }
1408 }
1409
1410 /*
1411  * Asserts that the starting and ending region addresses fall within the
1412  * valid range for the map.
1413  */
1414 #define VM_MAP_RANGE_CHECK(map, start, end)     \
1415 {                                               \
1416         if (start < vm_map_min(map))            \
1417                 start = vm_map_min(map);        \
1418         if (end > vm_map_max(map))              \
1419                 end = vm_map_max(map);          \
1420         if (start > end)                        \
1421                 start = end;                    \
1422 }
1423
1424 /*
1425  * Used to block when an in-transition collision occurs.  The map
1426  * is unlocked for the sleep and relocked before the return.
1427  */
1428 void
1429 vm_map_transition_wait(vm_map_t map)
1430 {
1431         tsleep_interlock(map, 0);
1432         vm_map_unlock(map);
1433         tsleep(map, PINTERLOCKED, "vment", 0);
1434         vm_map_lock(map);
1435 }
1436
1437 /*
1438  * When we do blocking operations with the map lock held it is
1439  * possible that a clip might have occurred on our in-transit entry,
1440  * requiring an adjustment to the entry in our loop.  These macros
1441  * help the pageable and clip_range code deal with the case.  The
1442  * conditional costs virtually nothing if no clipping has occurred.
1443  */
1444
1445 #define CLIP_CHECK_BACK(entry, save_start)              \
1446     do {                                                \
1447             while (entry->start != save_start) {        \
1448                     entry = entry->prev;                \
1449                     KASSERT(entry != &map->header, ("bad entry clip")); \
1450             }                                           \
1451     } while(0)
1452
1453 #define CLIP_CHECK_FWD(entry, save_end)                 \
1454     do {                                                \
1455             while (entry->end != save_end) {            \
1456                     entry = entry->next;                \
1457                     KASSERT(entry != &map->header, ("bad entry clip")); \
1458             }                                           \
1459     } while(0)
1460
1461
1462 /*
1463  * Clip the specified range and return the base entry.  The
1464  * range may cover several entries starting at the returned base
1465  * and the first and last entry in the covering sequence will be
1466  * properly clipped to the requested start and end address.
1467  *
1468  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1469  * flag.
1470  *
1471  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1472  * covered by the requested range.
1473  *
1474  * The map must be exclusively locked on entry and will remain locked
1475  * on return. If no range exists or the range contains holes and you
1476  * specified that no holes were allowed, NULL will be returned.  This
1477  * routine may temporarily unlock the map in order to avoid a deadlock when
1478  * sleeping.
1479  */
1480 static
1481 vm_map_entry_t
1482 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end, 
1483                   int *countp, int flags)
1484 {
1485         vm_map_entry_t start_entry;
1486         vm_map_entry_t entry;
1487
1488         /*
1489          * Locate the entry and effect initial clipping.  The in-transition
1490          * case does not occur very often so do not try to optimize it.
1491          */
1492 again:
1493         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1494                 return (NULL);
1495         entry = start_entry;
1496         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1497                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1498                 ++mycpu->gd_cnt.v_intrans_coll;
1499                 ++mycpu->gd_cnt.v_intrans_wait;
1500                 vm_map_transition_wait(map);
1501                 /*
1502                  * entry and/or start_entry may have been clipped while
1503                  * we slept, or may have gone away entirely.  We have
1504                  * to restart from the lookup.
1505                  */
1506                 goto again;
1507         }
1508
1509         /*
1510          * Since we hold an exclusive map lock we do not have to restart
1511          * after clipping, even though clipping may block in zalloc.
1512          */
1513         vm_map_clip_start(map, entry, start, countp);
1514         vm_map_clip_end(map, entry, end, countp);
1515         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1516
1517         /*
1518          * Scan entries covered by the range.  When working on the next
1519          * entry a restart need only re-loop on the current entry which
1520          * we have already locked, since 'next' may have changed.  Also,
1521          * even though entry is safe, it may have been clipped so we
1522          * have to iterate forwards through the clip after sleeping.
1523          */
1524         while (entry->next != &map->header && entry->next->start < end) {
1525                 vm_map_entry_t next = entry->next;
1526
1527                 if (flags & MAP_CLIP_NO_HOLES) {
1528                         if (next->start > entry->end) {
1529                                 vm_map_unclip_range(map, start_entry,
1530                                         start, entry->end, countp, flags);
1531                                 return(NULL);
1532                         }
1533                 }
1534
1535                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1536                         vm_offset_t save_end = entry->end;
1537                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1538                         ++mycpu->gd_cnt.v_intrans_coll;
1539                         ++mycpu->gd_cnt.v_intrans_wait;
1540                         vm_map_transition_wait(map);
1541
1542                         /*
1543                  * Clips might have occurred while we blocked.
1544                          */
1545                         CLIP_CHECK_FWD(entry, save_end);
1546                         CLIP_CHECK_BACK(start_entry, start);
1547                         continue;
1548                 }
1549                 /*
1550                  * No restart necessary even though clip_end may block, we
1551                  * are holding the map lock.
1552                  */
1553                 vm_map_clip_end(map, next, end, countp);
1554                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1555                 entry = next;
1556         }
1557         if (flags & MAP_CLIP_NO_HOLES) {
1558                 if (entry->end != end) {
1559                         vm_map_unclip_range(map, start_entry,
1560                                 start, entry->end, countp, flags);
1561                         return(NULL);
1562                 }
1563         }
1564         return(start_entry);
1565 }
1566
1567 /*
1568  * Undo the effect of vm_map_clip_range().  You should pass the same
1569  * flags and the same range that you passed to vm_map_clip_range().
1570  * This code will clear the in-transition flag on the entries and
1571  * wake up anyone waiting.  This code will also simplify the sequence
1572  * and attempt to merge it with entries before and after the sequence.
1573  *
1574  * The map must be locked on entry and will remain locked on return.
1575  *
1576  * Note that you should also pass the start_entry returned by
1577  * vm_map_clip_range().  However, if you block between the two calls
1578  * with the map unlocked please be aware that the start_entry may
1579  * have been clipped and you may need to scan it backwards to find
1580  * the entry corresponding with the original start address.  You are
1581  * responsible for this, vm_map_unclip_range() expects the correct
1582  * start_entry to be passed to it and will KASSERT otherwise.
1583  */
1584 static
1585 void
1586 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
1587                     vm_offset_t start, vm_offset_t end,
1588                     int *countp, int flags)
1589 {
1590         vm_map_entry_t entry;
1591
1592         entry = start_entry;
1593
1594         KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1595         while (entry != &map->header && entry->start < end) {
1596                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1597                         ("in-transition flag not set during unclip on: %p",
1598                         entry));
1599                 KASSERT(entry->end <= end,
1600                         ("unclip_range: tail wasn't clipped"));
1601                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1602                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1603                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1604                         wakeup(map);
1605                 }
1606                 entry = entry->next;
1607         }
1608
1609         /*
1610          * Simplification does not block so there is no restart case.
1611          */
1612         entry = start_entry;
1613         while (entry != &map->header && entry->start < end) {
1614                 vm_map_simplify_entry(map, entry, countp);
1615                 entry = entry->next;
1616         }
1617 }
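
/*
 * Usage sketch (illustrative only, not compiled): the clip/unclip pair
 * brackets a blocking operation on a contiguous range.  The caller
 * reserves entries, takes the exclusive map lock, clips the range, works
 * on the in-transition entries, then unclips with the same range and
 * flags.  The variable names are hypothetical; vm_map_unwire() below
 * follows this exact pattern.
 *
 *      count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *      vm_map_lock(map);
 *      start_entry = vm_map_clip_range(map, start, end, &count,
 *                                      MAP_CLIP_NO_HOLES);
 *      if (start_entry == NULL) {
 *              vm_map_unlock(map);
 *              vm_map_entry_release(count);
 *              return (KERN_INVALID_ADDRESS);
 *      }
 *      ... operate on the covered entries, possibly blocking ...
 *      vm_map_unclip_range(map, start_entry, start, end, &count,
 *                          MAP_CLIP_NO_HOLES);
 *      vm_map_unlock(map);
 *      vm_map_entry_release(count);
 */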
1618
1619 /*
1620  * Mark the given range as handled by a subordinate map.
1621  *
1622  * This range must have been created with vm_map_find(), and no other
1623  * operations may have been performed on this range prior to calling
1624  * vm_map_submap().
1625  *
1626  * Submappings cannot be removed.
1627  *
1628  * No requirements.
1629  */
1630 int
1631 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
1632 {
1633         vm_map_entry_t entry;
1634         int result = KERN_INVALID_ARGUMENT;
1635         int count;
1636
1637         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1638         vm_map_lock(map);
1639
1640         VM_MAP_RANGE_CHECK(map, start, end);
1641
1642         if (vm_map_lookup_entry(map, start, &entry)) {
1643                 vm_map_clip_start(map, entry, start, &count);
1644         } else {
1645                 entry = entry->next;
1646         }
1647
1648         vm_map_clip_end(map, entry, end, &count);
1649
1650         if ((entry->start == start) && (entry->end == end) &&
1651             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1652             (entry->object.vm_object == NULL)) {
1653                 entry->object.sub_map = submap;
1654                 entry->maptype = VM_MAPTYPE_SUBMAP;
1655                 result = KERN_SUCCESS;
1656         }
1657         vm_map_unlock(map);
1658         vm_map_entry_release(count);
1659
1660         return (result);
1661 }
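
/*
 * Usage sketch (illustrative only, not compiled): a submap is installed
 * over a range that was previously created with vm_map_find() and left
 * untouched.  The names below are hypothetical.
 *
 *      if (vm_map_submap(parent_map, start, end, submap) != KERN_SUCCESS)
 *              panic("submap installation failed");
 *
 * The covering entry must match [start, end) exactly, must not be COW and
 * must not already have a VM object, otherwise KERN_INVALID_ARGUMENT is
 * returned.
 */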
1662
1663 /*
1664  * Sets the protection of the specified address region in the target map. 
1665  * If "set_max" is specified, the maximum protection is to be set;
1666  * otherwise, only the current protection is affected.
1667  *
1668  * The protection is not applicable to submaps, but is applicable to normal
1669  * maps and maps governed by virtual page tables.  For example, when operating
1670  * on a virtual page table, the protection controls how COW occurs on the
1671  * backing object, whereas the virtual page table itself is an abstraction
1672  * provided for userland.
1673  *
1674  * No requirements.
1675  */
1676 int
1677 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1678                vm_prot_t new_prot, boolean_t set_max)
1679 {
1680         vm_map_entry_t current;
1681         vm_map_entry_t entry;
1682         int count;
1683
1684         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1685         vm_map_lock(map);
1686
1687         VM_MAP_RANGE_CHECK(map, start, end);
1688
1689         if (vm_map_lookup_entry(map, start, &entry)) {
1690                 vm_map_clip_start(map, entry, start, &count);
1691         } else {
1692                 entry = entry->next;
1693         }
1694
1695         /*
1696          * Make a first pass to check for protection violations.
1697          */
1698         current = entry;
1699         while ((current != &map->header) && (current->start < end)) {
1700                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
1701                         vm_map_unlock(map);
1702                         vm_map_entry_release(count);
1703                         return (KERN_INVALID_ARGUMENT);
1704                 }
1705                 if ((new_prot & current->max_protection) != new_prot) {
1706                         vm_map_unlock(map);
1707                         vm_map_entry_release(count);
1708                         return (KERN_PROTECTION_FAILURE);
1709                 }
1710                 current = current->next;
1711         }
1712
1713         /*
1714          * Go back and fix up protections. [Note that clipping is not
1715          * necessary the second time.]
1716          */
1717         current = entry;
1718
1719         while ((current != &map->header) && (current->start < end)) {
1720                 vm_prot_t old_prot;
1721
1722                 vm_map_clip_end(map, current, end, &count);
1723
1724                 old_prot = current->protection;
1725                 if (set_max) {
1726                         current->protection =
1727                             (current->max_protection = new_prot) &
1728                             old_prot;
1729                 } else {
1730                         current->protection = new_prot;
1731                 }
1732
1733                 /*
1734                  * Update physical map if necessary. Worry about copy-on-write
1735                  * here -- CHECK THIS XXX
1736                  */
1737
1738                 if (current->protection != old_prot) {
1739 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1740                                                         VM_PROT_ALL)
1741
1742                         pmap_protect(map->pmap, current->start,
1743                             current->end,
1744                             current->protection & MASK(current));
1745 #undef  MASK
1746                 }
1747
1748                 vm_map_simplify_entry(map, current, &count);
1749
1750                 current = current->next;
1751         }
1752
1753         vm_map_unlock(map);
1754         vm_map_entry_release(count);
1755         return (KERN_SUCCESS);
1756 }
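
/*
 * Usage sketch (illustrative only, not compiled): an mprotect(2)-style
 * caller reducing a range to read-only without touching max_protection
 * would look roughly like this (names are hypothetical):
 *
 *      rv = vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *      if (rv != KERN_SUCCESS)
 *              return (rv);
 *
 * A failure is typically KERN_PROTECTION_FAILURE (new protection exceeds
 * max_protection) or KERN_INVALID_ARGUMENT (a submap lies in the range).
 * Passing TRUE for set_max instead replaces max_protection and reduces the
 * current protection to its intersection with the old protection.
 */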
1757
1758 /*
1759  * This routine traverses a process's map handling the madvise
1760  * system call.  Advisories are classified as either those affecting
1761  * the vm_map_entry structure or those affecting the underlying
1762  * objects.
1763  *
1764  * The <value> argument is used for extended madvise calls.
1765  *
1766  * No requirements.
1767  */
1768 int
1769 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
1770                int behav, off_t value)
1771 {
1772         vm_map_entry_t current, entry;
1773         int modify_map = 0;
1774         int error = 0;
1775         int count;
1776
1777         /*
1778          * Some madvise calls directly modify the vm_map_entry, in which case
1779          * we need to use an exclusive lock on the map and we need to perform 
1780          * various clipping operations.  Otherwise we only need a read-lock
1781          * on the map.
1782          */
1783
1784         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1785
1786         switch(behav) {
1787         case MADV_NORMAL:
1788         case MADV_SEQUENTIAL:
1789         case MADV_RANDOM:
1790         case MADV_NOSYNC:
1791         case MADV_AUTOSYNC:
1792         case MADV_NOCORE:
1793         case MADV_CORE:
1794         case MADV_SETMAP:
1795         case MADV_INVAL:
1796                 modify_map = 1;
1797                 vm_map_lock(map);
1798                 break;
1799         case MADV_WILLNEED:
1800         case MADV_DONTNEED:
1801         case MADV_FREE:
1802                 vm_map_lock_read(map);
1803                 break;
1804         default:
1805                 vm_map_entry_release(count);
1806                 return (EINVAL);
1807         }
1808
1809         /*
1810          * Locate starting entry and clip if necessary.
1811          */
1812
1813         VM_MAP_RANGE_CHECK(map, start, end);
1814
1815         if (vm_map_lookup_entry(map, start, &entry)) {
1816                 if (modify_map)
1817                         vm_map_clip_start(map, entry, start, &count);
1818         } else {
1819                 entry = entry->next;
1820         }
1821
1822         if (modify_map) {
1823                 /*
1824                  * madvise behaviors that are implemented in the vm_map_entry.
1825                  *
1826                  * We clip the vm_map_entry so that behavioral changes are
1827                  * limited to the specified address range.
1828                  */
1829                 for (current = entry;
1830                      (current != &map->header) && (current->start < end);
1831                      current = current->next
1832                 ) {
1833                         if (current->maptype == VM_MAPTYPE_SUBMAP)
1834                                 continue;
1835
1836                         vm_map_clip_end(map, current, end, &count);
1837
1838                         switch (behav) {
1839                         case MADV_NORMAL:
1840                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
1841                                 break;
1842                         case MADV_SEQUENTIAL:
1843                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
1844                                 break;
1845                         case MADV_RANDOM:
1846                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
1847                                 break;
1848                         case MADV_NOSYNC:
1849                                 current->eflags |= MAP_ENTRY_NOSYNC;
1850                                 break;
1851                         case MADV_AUTOSYNC:
1852                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
1853                                 break;
1854                         case MADV_NOCORE:
1855                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
1856                                 break;
1857                         case MADV_CORE:
1858                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
1859                                 break;
1860                         case MADV_INVAL:
1861                                 /*
1862                                  * Invalidate the related pmap entries, used
1863                                  * to flush portions of the real kernel's
1864                                  * pmap when the caller has removed or
1865                                  * modified existing mappings in a virtual
1866                                  * page table.
1867                                  */
1868                                 pmap_remove(map->pmap,
1869                                             current->start, current->end);
1870                                 break;
1871                         case MADV_SETMAP:
1872                                 /*
1873                                  * Set the page directory page for a map
1874                                  * governed by a virtual page table.  Mark
1875                                  * the entry as being governed by a virtual
1876                                  * page table if it is not.
1877                                  *
1878                                  * XXX the page directory page is stored
1879                                  * in the avail_ssize field of the map_entry.
1880                                  *
1881                                  * XXX the map simplification code does not
1882                                  * compare this field so weird things may
1883                                  * happen if you do not apply this function
1884                                  * to the entire mapping governed by the
1885                                  * virtual page table.
1886                                  */
1887                                 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
1888                                         error = EINVAL;
1889                                         break;
1890                                 }
1891                                 current->aux.master_pde = value;
1892                                 pmap_remove(map->pmap,
1893                                             current->start, current->end);
1894                                 break;
1895                         default:
1896                                 error = EINVAL;
1897                                 break;
1898                         }
1899                         vm_map_simplify_entry(map, current, &count);
1900                 }
1901                 vm_map_unlock(map);
1902         } else {
1903                 vm_pindex_t pindex;
1904                 int count;
1905
1906                 /*
1907                  * madvise behaviors that are implemented in the underlying
1908                  * vm_object.
1909                  *
1910                  * Since we don't clip the vm_map_entry, we have to clip
1911                  * the vm_object pindex and count.
1912                  *
1913                  * NOTE!  We currently do not support these functions on
1914                  * virtual page tables.
1915                  */
1916                 for (current = entry;
1917                      (current != &map->header) && (current->start < end);
1918                      current = current->next
1919                 ) {
1920                         vm_offset_t useStart;
1921
1922                         if (current->maptype != VM_MAPTYPE_NORMAL)
1923                                 continue;
1924
1925                         pindex = OFF_TO_IDX(current->offset);
1926                         count = atop(current->end - current->start);
1927                         useStart = current->start;
1928
1929                         if (current->start < start) {
1930                                 pindex += atop(start - current->start);
1931                                 count -= atop(start - current->start);
1932                                 useStart = start;
1933                         }
1934                         if (current->end > end)
1935                                 count -= atop(current->end - end);
1936
1937                         if (count <= 0)
1938                                 continue;
1939
1940                         vm_object_madvise(current->object.vm_object,
1941                                           pindex, count, behav);
1942
1943                         /*
1944                          * Try to populate the page table.  Mappings governed
1945                          * by virtual page tables cannot be pre-populated
1946                          * without a lot of work so don't try.
1947                          */
1948                         if (behav == MADV_WILLNEED &&
1949                             current->maptype != VM_MAPTYPE_VPAGETABLE) {
1950                                 pmap_object_init_pt(
1951                                     map->pmap, 
1952                                     useStart,
1953                                     current->protection,
1954                                     current->object.vm_object,
1955                                     pindex, 
1956                                     (count << PAGE_SHIFT),
1957                                     MAP_PREFAULT_MADVISE
1958                                 );
1959                         }
1960                 }
1961                 vm_map_unlock_read(map);
1962         }
1963         vm_map_entry_release(count);
1964         return(error);
1965 }       
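
/*
 * Usage sketch (illustrative only, not compiled): entry-level advisories
 * such as MADV_NOSYNC take the exclusive-lock path above, while
 * object-level advisories such as MADV_WILLNEED only need the read lock.
 * The names are hypothetical.
 *
 *      error = vm_map_madvise(map, start, end, MADV_WILLNEED, 0);
 *      error = vm_map_madvise(map, start, end, MADV_SETMAP, pd_page);
 *
 * The <value> argument matters only for extended advisories such as
 * MADV_SETMAP, where it supplies the page directory page for a mapping
 * governed by a virtual page table.
 */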
1966
1967
1968 /*
1969  * Sets the inheritance of the specified address range in the target map.
1970  * Inheritance affects how the map will be shared with child maps at the
1971  * time of vm_map_fork.
1972  */
1973 int
1974 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
1975                vm_inherit_t new_inheritance)
1976 {
1977         vm_map_entry_t entry;
1978         vm_map_entry_t temp_entry;
1979         int count;
1980
1981         switch (new_inheritance) {
1982         case VM_INHERIT_NONE:
1983         case VM_INHERIT_COPY:
1984         case VM_INHERIT_SHARE:
1985                 break;
1986         default:
1987                 return (KERN_INVALID_ARGUMENT);
1988         }
1989
1990         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1991         vm_map_lock(map);
1992
1993         VM_MAP_RANGE_CHECK(map, start, end);
1994
1995         if (vm_map_lookup_entry(map, start, &temp_entry)) {
1996                 entry = temp_entry;
1997                 vm_map_clip_start(map, entry, start, &count);
1998         } else
1999                 entry = temp_entry->next;
2000
2001         while ((entry != &map->header) && (entry->start < end)) {
2002                 vm_map_clip_end(map, entry, end, &count);
2003
2004                 entry->inheritance = new_inheritance;
2005
2006                 vm_map_simplify_entry(map, entry, &count);
2007
2008                 entry = entry->next;
2009         }
2010         vm_map_unlock(map);
2011         vm_map_entry_release(count);
2012         return (KERN_SUCCESS);
2013 }
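
/*
 * Usage sketch (illustrative only, not compiled): a minherit(2)-style
 * caller passes one of the VM_INHERIT_* constants accepted above (names
 * are hypothetical):
 *
 *      rv = vm_map_inherit(map, start, end, VM_INHERIT_SHARE);
 *      if (rv != KERN_SUCCESS)
 *              return (rv);
 *
 * Values other than VM_INHERIT_NONE, VM_INHERIT_COPY and VM_INHERIT_SHARE
 * are rejected with KERN_INVALID_ARGUMENT.
 */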
2014
2015 /*
2016  * Implement the semantics of mlock and munlock (user wiring/unwiring).
2017  */
2018 int
2019 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2020               boolean_t new_pageable)
2021 {
2022         vm_map_entry_t entry;
2023         vm_map_entry_t start_entry;
2024         vm_offset_t end;
2025         int rv = KERN_SUCCESS;
2026         int count;
2027
2028         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2029         vm_map_lock(map);
2030         VM_MAP_RANGE_CHECK(map, start, real_end);
2031         end = real_end;
2032
2033         start_entry = vm_map_clip_range(map, start, end, &count,
2034                                         MAP_CLIP_NO_HOLES);
2035         if (start_entry == NULL) {
2036                 vm_map_unlock(map);
2037                 vm_map_entry_release(count);
2038                 return (KERN_INVALID_ADDRESS);
2039         }
2040
2041         if (new_pageable == 0) {
2042                 entry = start_entry;
2043                 while ((entry != &map->header) && (entry->start < end)) {
2044                         vm_offset_t save_start;
2045                         vm_offset_t save_end;
2046
2047                         /*
2048                          * Already user wired or hard wired (trivial cases)
2049                          */
2050                         if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2051                                 entry = entry->next;
2052                                 continue;
2053                         }
2054                         if (entry->wired_count != 0) {
2055                                 entry->wired_count++;
2056                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2057                                 entry = entry->next;
2058                                 continue;
2059                         }
2060
2061                         /*
2062                          * A new wiring requires instantiation of appropriate
2063                          * management structures and the faulting in of the
2064                          * page.
2065                          */
2066                         if (entry->maptype != VM_MAPTYPE_SUBMAP) {
2067                                 int copyflag = entry->eflags &
2068                                                MAP_ENTRY_NEEDS_COPY;
2069                                 if (copyflag && ((entry->protection &
2070                                                   VM_PROT_WRITE) != 0)) {
2071                                         vm_map_entry_shadow(entry);
2072                                 } else if (entry->object.vm_object == NULL &&
2073                                            !map->system_map) {
2074                                         vm_map_entry_allocate_object(entry);
2075                                 }
2076                         }
2077                         entry->wired_count++;
2078                         entry->eflags |= MAP_ENTRY_USER_WIRED;
2079
2080                         /*
2081                          * Now fault in the area.  Note that vm_fault_wire()
2082                          * may release the map lock temporarily, it will be
2083                          * relocked on return.  The in-transition
2084                          * flag protects the entries. 
2085                          */
2086                         save_start = entry->start;
2087                         save_end = entry->end;
2088                         rv = vm_fault_wire(map, entry, TRUE);
2089                         if (rv) {
2090                                 CLIP_CHECK_BACK(entry, save_start);
2091                                 for (;;) {
2092                                         KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2093                                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2094                                         entry->wired_count = 0;
2095                                         if (entry->end == save_end)
2096                                                 break;
2097                                         entry = entry->next;
2098                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2099                                 }
2100                                 end = save_start;       /* unwire the rest */
2101                                 break;
2102                         }
2103                         /*
2104                          * note that even though the entry might have been
2105                          * clipped, the USER_WIRED flag we set prevents
2106                          * duplication so we do not have to do a 
2107                          * clip check.
2108                          */
2109                         entry = entry->next;
2110                 }
2111
2112                 /*
2113                  * If we failed fall through to the unwiring section to
2114                  * unwire what we had wired so far.  'end' has already
2115                  * been adjusted.
2116                  */
2117                 if (rv)
2118                         new_pageable = 1;
2119
2120                 /*
2121                  * start_entry might have been clipped if we unlocked the
2122                  * map and blocked.  No matter how clipped it has gotten
2123                  * there should be a fragment that is on our start boundary.
2124                  */
2125                 CLIP_CHECK_BACK(start_entry, start);
2126         }
2127
2128         /*
2129          * Deal with the unwiring case.
2130          */
2131         if (new_pageable) {
2132                 /*
2133                  * This is the unwiring case.  We must first ensure that the
2134                  * range to be unwired is really wired down.  We know there
2135                  * are no holes.
2136                  */
2137                 entry = start_entry;
2138                 while ((entry != &map->header) && (entry->start < end)) {
2139                         if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2140                                 rv = KERN_INVALID_ARGUMENT;
2141                                 goto done;
2142                         }
2143                         KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
2144                         entry = entry->next;
2145                 }
2146
2147                 /*
2148                  * Now decrement the wiring count for each region. If a region
2149                  * becomes completely unwired, unwire its physical pages and
2150                  * mappings.
2151                  */
2152                 /*
2153                  * NOTE: This second loop must re-initialize "entry" to
2154                  * start_entry rather than picking up the value left by
2155                  * the validation loop above.  If the leftover value were
2156                  * used, the loop below would never be entered and the
2157                  * pages backing the entries would never be unwired,
2158                  * leaking wired pages.
2159                  */
2162                 entry = start_entry;
2163                 while ((entry != &map->header) && (entry->start < end)) {
2164                         KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2165                                 ("expected USER_WIRED on entry %p", entry));
2166                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2167                         entry->wired_count--;
2168                         if (entry->wired_count == 0)
2169                                 vm_fault_unwire(map, entry);
2170                         entry = entry->next;
2171                 }
2172         }
2173 done:
2174         vm_map_unclip_range(map, start_entry, start, real_end, &count,
2175                 MAP_CLIP_NO_HOLES);
2176         map->timestamp++;
2177         vm_map_unlock(map);
2178         vm_map_entry_release(count);
2179         return (rv);
2180 }
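
/*
 * Usage sketch (illustrative only, not compiled): an mlock(2)-style caller
 * passes new_pageable == FALSE to user-wire the range, a munlock(2)-style
 * caller passes TRUE to unwire it.  The names are hypothetical.
 *
 * To wire (mlock):
 *      rv = vm_map_unwire(map, addr, addr + size, FALSE);
 * To unwire (munlock):
 *      rv = vm_map_unwire(map, addr, addr + size, TRUE);
 */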
2181
2182 /*
2183  * Sets the pageability of the specified address range in the target map.
2184  * Regions specified as not pageable require locked-down physical
2185  * memory and physical page maps.
2186  *
2187  * The map must not be locked, but a reference must remain to the map
2188  * throughout the call.
2189  *
2190  * This function may be called via the zalloc path and must properly
2191  * reserve map entries for kernel_map.
2192  *
2193  * No requirements.
2194  */
2195 int
2196 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2197 {
2198         vm_map_entry_t entry;
2199         vm_map_entry_t start_entry;
2200         vm_offset_t end;
2201         int rv = KERN_SUCCESS;
2202         int count;
2203
2204         if (kmflags & KM_KRESERVE)
2205                 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2206         else
2207                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2208         vm_map_lock(map);
2209         VM_MAP_RANGE_CHECK(map, start, real_end);
2210         end = real_end;
2211
2212         start_entry = vm_map_clip_range(map, start, end, &count,
2213                                         MAP_CLIP_NO_HOLES);
2214         if (start_entry == NULL) {
2215                 vm_map_unlock(map);
2216                 rv = KERN_INVALID_ADDRESS;
2217                 goto failure;
2218         }
2219         if ((kmflags & KM_PAGEABLE) == 0) {
2220                 /*
2221                  * Wiring.  
2222                  *
2223                  * 1.  Holding the write lock, we create any shadow or zero-fill
2224                  * objects that need to be created. Then we clip each map
2225                  * entry to the region to be wired and increment its wiring
2226                  * count.  We create objects before clipping the map entries
2227                  * to avoid object proliferation.
2228                  *
2229                  * 2.  We downgrade to a read lock, and call vm_fault_wire to
2230                  * fault in the pages for any newly wired area (wired_count is
2231                  * 1).
2232                  *
2233                  * Downgrading to a read lock for vm_fault_wire avoids a 
2234                  * possible deadlock with another process that may have faulted
2235                  * on one of the pages to be wired (it would mark the page busy,
2236                  * blocking us, then in turn block on the map lock that we
2237                  * hold).  Because of problems in the recursive lock package,
2238                  * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2239                  * any actions that require the write lock must be done
2240                  * beforehand.  Because we keep the read lock on the map, the
2241                  * copy-on-write status of the entries we modify here cannot
2242                  * change.
2243                  */
2244                 entry = start_entry;
2245                 while ((entry != &map->header) && (entry->start < end)) {
2246                         /*
2247                          * Trivial case if the entry is already wired
2248                          */
2249                         if (entry->wired_count) {
2250                                 entry->wired_count++;
2251                                 entry = entry->next;
2252                                 continue;
2253                         }
2254
2255                         /*
2256                          * The entry is being newly wired, we have to setup
2257                          * appropriate management structures.  A shadow 
2258                          * object is required for a copy-on-write region,
2259                          * or a normal object for a zero-fill region.  We
2260                          * do not have to do this for entries that point to sub
2261                          * maps because we won't hold the lock on the sub map.
2262                          */
2263                         if (entry->maptype != VM_MAPTYPE_SUBMAP) {
2264                                 int copyflag = entry->eflags &
2265                                                MAP_ENTRY_NEEDS_COPY;
2266                                 if (copyflag && ((entry->protection &
2267                                                   VM_PROT_WRITE) != 0)) {
2268                                         vm_map_entry_shadow(entry);
2269                                 } else if (entry->object.vm_object == NULL &&
2270                                            !map->system_map) {
2271                                         vm_map_entry_allocate_object(entry);
2272                                 }
2273                         }
2274
2275                         entry->wired_count++;
2276                         entry = entry->next;
2277                 }
2278
2279                 /*
2280                  * Pass 2.
2281                  */
2282
2283                 /*
2284                  * HACK HACK HACK HACK
2285                  *
2286                  * vm_fault_wire() temporarily unlocks the map to avoid
2287                  * deadlocks.  The in-transition flag from vm_map_clip_range
2288                  * call should protect us from changes while the map is
2289                  * unlocked.
2290                  *
2291                  * NOTE: Previously this comment stated that clipping might
2292                  *       still occur while the entry is unlocked, but from
2293                  *       what I can tell it actually cannot.
2294                  *
2295                  *       It is unclear whether the CLIP_CHECK_*() calls
2296                  *       are still needed but we keep them in anyway.
2297                  *
2298                  * HACK HACK HACK HACK
2299                  */
2300
2301                 entry = start_entry;
2302                 while (entry != &map->header && entry->start < end) {
2303                         /*
2304                          * If vm_fault_wire fails for any page we need to undo
2305                          * what has been done.  We decrement the wiring count
2306                          * for those pages which have not yet been wired (now)
2307                          * and unwire those that have (later).
2308                          */
2309                         vm_offset_t save_start = entry->start;
2310                         vm_offset_t save_end = entry->end;
2311
2312                         if (entry->wired_count == 1)
2313                                 rv = vm_fault_wire(map, entry, FALSE);
2314                         if (rv) {
2315                                 CLIP_CHECK_BACK(entry, save_start);
2316                                 for (;;) {
2317                                         KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
2318                                         entry->wired_count = 0;
2319                                         if (entry->end == save_end)
2320                                                 break;
2321                                         entry = entry->next;
2322                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2323                                 }
2324                                 end = save_start;
2325                                 break;
2326                         }
2327                         CLIP_CHECK_FWD(entry, save_end);
2328                         entry = entry->next;
2329                 }
2330
2331                 /*
2332                  * If a failure occurred, undo everything by falling through
2333                  * to the unwiring code.  'end' has already been adjusted
2334                  * appropriately.
2335                  */
2336                 if (rv)
2337                         kmflags |= KM_PAGEABLE;
2338
2339                 /*
2340                  * start_entry is still IN_TRANSITION but may have been 
2341                  * clipped since vm_fault_wire() unlocks and relocks the
2342                  * map.  No matter how clipped it has gotten there should
2343                  * be a fragment that is on our start boundary.
2344                  */
2345                 CLIP_CHECK_BACK(start_entry, start);
2346         }
2347
2348         if (kmflags & KM_PAGEABLE) {
2349                 /*
2350                  * This is the unwiring case.  We must first ensure that the
2351                  * range to be unwired is really wired down.  We know there
2352                  * are no holes.
2353                  */
2354                 entry = start_entry;
2355                 while ((entry != &map->header) && (entry->start < end)) {
2356                         if (entry->wired_count == 0) {
2357                                 rv = KERN_INVALID_ARGUMENT;
2358                                 goto done;
2359                         }
2360                         entry = entry->next;
2361                 }
2362
2363                 /*
2364                  * Now decrement the wiring count for each region. If a region
2365                  * becomes completely unwired, unwire its physical pages and
2366                  * mappings.
2367                  */
2368                 entry = start_entry;
2369                 while ((entry != &map->header) && (entry->start < end)) {
2370                         entry->wired_count--;
2371                         if (entry->wired_count == 0)
2372                                 vm_fault_unwire(map, entry);
2373                         entry = entry->next;
2374                 }
2375         }
2376 done:
2377         vm_map_unclip_range(map, start_entry, start, real_end,
2378                             &count, MAP_CLIP_NO_HOLES);
2379         map->timestamp++;
2380         vm_map_unlock(map);
2381 failure:
2382         if (kmflags & KM_KRESERVE)
2383                 vm_map_entry_krelease(count);
2384         else
2385                 vm_map_entry_release(count);
2386         return (rv);
2387 }
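
/*
 * Usage sketch (illustrative only, not compiled): a kernel caller wires a
 * range by passing kmflags without KM_PAGEABLE and unwires it by passing
 * KM_PAGEABLE; KM_KRESERVE selects the kernel-reserved entry pool so the
 * call remains safe when invoked from the zalloc path.  The names are
 * hypothetical.
 *
 * To wire:
 *      rv = vm_map_wire(map, addr, addr + size, KM_KRESERVE);
 * To unwire:
 *      rv = vm_map_wire(map, addr, addr + size, KM_KRESERVE | KM_PAGEABLE);
 */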
2388
2389 /*
2390  * Mark a newly allocated address range as wired but do not fault in
2391  * the pages.  The caller is expected to load the pages into the object.
2392  *
2393  * The map must be locked on entry and will remain locked on return.
2394  * No other requirements.
2395  */
2396 void
2397 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2398                        int *countp)
2399 {
2400         vm_map_entry_t scan;
2401         vm_map_entry_t entry;
2402
2403         entry = vm_map_clip_range(map, addr, addr + size,
2404                                   countp, MAP_CLIP_NO_HOLES);
2405         for (scan = entry;
2406              scan != &map->header && scan->start < addr + size;
2407              scan = scan->next) {
2408             KKASSERT(scan->wired_count == 0);
2409             scan->wired_count = 1;
2410         }
2411         vm_map_unclip_range(map, entry, addr, addr + size,
2412                             countp, MAP_CLIP_NO_HOLES);
2413 }
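
/*
 * Usage sketch (illustrative only, not compiled): a caller that has just
 * created a kernel mapping, still holds the map lock and an entry
 * reservation, and intends to insert the pages itself might do (names are
 * hypothetical):
 *
 *      vm_map_set_wired_quick(map, addr, size, &count);
 */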
2414
2415 /*
2416  * Push any dirty cached pages in the address range to their pager.
2417  * If syncio is TRUE, dirty pages are written synchronously.
2418  * If invalidate is TRUE, any cached pages are freed as well.
2419  *
2420  * This routine is called by sys_msync()
2421  *
2422  * Returns an error if any part of the specified range is not mapped.
2423  *
2424  * No requirements.
2425  */
2426 int
2427 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2428              boolean_t syncio, boolean_t invalidate)
2429 {
2430         vm_map_entry_t current;
2431         vm_map_entry_t entry;
2432         vm_size_t size;
2433         vm_object_t object;
2434         vm_ooffset_t offset;
2435
2436         vm_map_lock_read(map);
2437         VM_MAP_RANGE_CHECK(map, start, end);
2438         if (!vm_map_lookup_entry(map, start, &entry)) {
2439                 vm_map_unlock_read(map);
2440                 return (KERN_INVALID_ADDRESS);
2441         }
2442         /*
2443          * Make a first pass to check for holes.
2444          */
2445         for (current = entry; current->start < end; current = current->next) {
2446                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2447                         vm_map_unlock_read(map);
2448                         return (KERN_INVALID_ARGUMENT);
2449                 }
2450                 if (end > current->end &&
2451                     (current->next == &map->header ||
2452                         current->end != current->next->start)) {
2453                         vm_map_unlock_read(map);
2454                         return (KERN_INVALID_ADDRESS);
2455                 }
2456         }
2457
2458         if (invalidate)
2459                 pmap_remove(vm_map_pmap(map), start, end);
2460
2461         /*
2462          * Make a second pass, cleaning/uncaching pages from the indicated
2463          * objects as we go.
2464          *
2465          * Hold vm_token to avoid blocking in vm_object_reference()
2466          */
2467         lwkt_gettoken(&vm_token);
2468         lwkt_gettoken(&vmobj_token);
2469
2470         for (current = entry; current->start < end; current = current->next) {
2471                 offset = current->offset + (start - current->start);
2472                 size = (end <= current->end ? end : current->end) - start;
2473                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2474                         vm_map_t smap;
2475                         vm_map_entry_t tentry;
2476                         vm_size_t tsize;
2477
2478                         smap = current->object.sub_map;
2479                         vm_map_lock_read(smap);
2480                         vm_map_lookup_entry(smap, offset, &tentry);
2481                         tsize = tentry->end - offset;
2482                         if (tsize < size)
2483                                 size = tsize;
2484                         object = tentry->object.vm_object;
2485                         offset = tentry->offset + (offset - tentry->start);
2486                         vm_map_unlock_read(smap);
2487                 } else {
2488                         object = current->object.vm_object;
2489                 }
2490                 /*
2491                  * Note that there is absolutely no sense in writing out
2492                  * anonymous objects, so we track down the vnode object
2493                  * to write out.
2494                  * We invalidate (remove) all pages from the address space
2495                  * anyway, for semantic correctness.
2496                  *
2497                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
2498                  * may start out with a NULL object.
2499                  */
2500                 while (object && object->backing_object) {
2501                         offset += object->backing_object_offset;
2502                         object = object->backing_object;
2503                         if (object->size < OFF_TO_IDX( offset + size))
2504                                 size = IDX_TO_OFF(object->size) - offset;
2505                 }
2506                 if (object && (object->type == OBJT_VNODE) && 
2507                     (current->protection & VM_PROT_WRITE) &&
2508                     (object->flags & OBJ_NOMSYNC) == 0) {
2509                         /*
2510                          * Flush pages if writing is allowed, invalidate them
2511                          * if invalidation requested.  Pages undergoing I/O
2512                          * will be ignored by vm_object_page_remove().
2513                          *
2514                          * We cannot lock the vnode and then wait for paging
2515                          * to complete without deadlocking against vm_fault.
2516                          * Instead we simply call vm_object_page_remove() and
2517                          * allow it to block internally on a page-by-page 
2518                          * basis when it encounters pages undergoing async 
2519                          * I/O.
2520                          */
2521                         int flags;
2522
2523                         vm_object_reference_locked(object);
2524                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
2525                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
2526                         flags |= invalidate ? OBJPC_INVAL : 0;
2527
2528                         /*
2529                          * When operating on a virtual page table just
2530                          * flush the whole object.  XXX we probably ought
2531                          * to 
2532                          */
2533                         switch(current->maptype) {
2534                         case VM_MAPTYPE_NORMAL:
2535                                 vm_object_page_clean(object,
2536                                     OFF_TO_IDX(offset),
2537                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2538                                     flags);
2539                                 break;
2540                         case VM_MAPTYPE_VPAGETABLE:
2541                                 vm_object_page_clean(object, 0, 0, flags);
2542                                 break;
2543                         }
2544                         vn_unlock(((struct vnode *)object->handle));
2545                         vm_object_deallocate_locked(object);
2546                 }
2547                 if (object && invalidate &&
2548                    ((object->type == OBJT_VNODE) ||
2549                     (object->type == OBJT_DEVICE))) {
2550                         int clean_only = 
2551                                 (object->type == OBJT_DEVICE) ? FALSE : TRUE;
2552                         vm_object_reference_locked(object);
2553                         switch(current->maptype) {
2554                         case VM_MAPTYPE_NORMAL:
2555                                 vm_object_page_remove(object,
2556                                     OFF_TO_IDX(offset),
2557                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2558                                     clean_only);
2559                                 break;
2560                         case VM_MAPTYPE_VPAGETABLE:
2561                                 vm_object_page_remove(object, 0, 0, clean_only);
2562                                 break;
2563                         }
2564                         vm_object_deallocate_locked(object);
2565                 }
2566                 start += size;
2567         }
2568
2569         lwkt_reltoken(&vmobj_token);
2570         lwkt_reltoken(&vm_token);
2571         vm_map_unlock_read(map);
2572
2573         return (KERN_SUCCESS);
2574 }
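
/*
 * Usage sketch (illustrative only, not compiled): a sys_msync()-style
 * caller translates its MS_* flags into the syncio and invalidate
 * booleans and calls roughly as follows (names are hypothetical):
 *
 *      rv = vm_map_clean(map, addr, addr + size, TRUE, FALSE);
 *
 * KERN_INVALID_ADDRESS indicates part of the range was not mapped and
 * KERN_INVALID_ARGUMENT indicates the range overlapped a submap.
 */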
2575
2576 /*
2577  * Make the region specified by this entry pageable.
2578  *
2579  * The vm_map must be exclusively locked.
2580  */
2581 static void 
2582 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2583 {
2584         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2585         entry->wired_count = 0;
2586         vm_fault_unwire(map, entry);
2587 }
2588
2589 /*
2590  * Deallocate the given entry from the target map.
2591  *
2592  * The vm_map must be exclusively locked.
2593  */
2594 static void
2595 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
2596 {
2597         vm_map_entry_unlink(map, entry);
2598         map->size -= entry->end - entry->start;
2599
2600         switch(entry->maptype) {
2601         case VM_MAPTYPE_NORMAL:
2602         case VM_MAPTYPE_VPAGETABLE:
2603                 vm_object_deallocate(entry->object.vm_object);
2604                 break;
2605         default:
2606                 break;
2607         }
2608
2609         vm_map_entry_dispose(map, entry, countp);
2610 }
2611
2612 /*
2613  * Deallocates the given address range from the target map.
2614  *
2615  * The vm_map must be exclusively locked.
2616  */
2617 int
2618 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
2619 {
2620         vm_object_t object;
2621         vm_map_entry_t entry;
2622         vm_map_entry_t first_entry;
2623
2624         ASSERT_VM_MAP_LOCKED(map);
2625 again:
2626         /*
2627          * Find the start of the region, and clip it.  Set entry to point
2628          * at the first record containing the requested address or, if no
2629          * such record exists, the next record with a greater address.  The
2630          * loop will run from this point until a record beyond the termination
2631          * address is encountered.
2632          *
2633          * map->hint must be adjusted to not point to anything we delete,
2634          * so set it to the entry prior to the one being deleted.
2635          *
2636          * GGG see other GGG comment.
2637          */
2638         if (vm_map_lookup_entry(map, start, &first_entry)) {
2639                 entry = first_entry;
2640                 vm_map_clip_start(map, entry, start, countp);
2641                 map->hint = entry->prev;        /* possible problem XXX */
2642         } else {
2643                 map->hint = first_entry;        /* possible problem XXX */
2644                 entry = first_entry->next;
2645         }
2646
2647         /*
2648          * If a hole opens up prior to the current first_free then
2649          * adjust first_free.  As with map->hint, map->first_free
2650          * cannot be left set to anything we might delete.
2651          */
2652         if (entry == &map->header) {
2653                 map->first_free = &map->header;
2654         } else if (map->first_free->start >= start) {
2655                 map->first_free = entry->prev;
2656         }
2657
2658         /*
2659          * Step through all entries in this region
2660          */
2661         while ((entry != &map->header) && (entry->start < end)) {
2662                 vm_map_entry_t next;
2663                 vm_offset_t s, e;
2664                 vm_pindex_t offidxstart, offidxend, count;
2665
2666                 /*
2667                  * If we hit an in-transition entry we have to sleep and
2668                  * retry.  It's easier (and not really slower) to just retry
2669                  * since this case occurs so rarely and the hint is already
2670                  * pointing at the right place.  We have to reset the
2671                  * start offset so as not to accidentally delete an entry
2672                  * another process just created in vacated space.
2673                  */
2674                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2675                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2676                         start = entry->start;
2677                         ++mycpu->gd_cnt.v_intrans_coll;
2678                         ++mycpu->gd_cnt.v_intrans_wait;
2679                         vm_map_transition_wait(map);
2680                         goto again;
2681                 }
2682                 vm_map_clip_end(map, entry, end, countp);
2683
2684                 s = entry->start;
2685                 e = entry->end;
2686                 next = entry->next;
2687
2688                 offidxstart = OFF_TO_IDX(entry->offset);
2689                 count = OFF_TO_IDX(e - s);
2690                 object = entry->object.vm_object;
2691
2692                 /*
2693                  * Unwire before removing addresses from the pmap; otherwise,
2694                  * unwiring will put the entries back in the pmap.
2695                  */
2696                 if (entry->wired_count != 0)
2697                         vm_map_entry_unwire(map, entry);
2698
2699                 offidxend = offidxstart + count;
2700
2701                 /*
2702                  * Hold vm_token when manipulating vm_objects,
2703                  *
2704                  * Hold vmobj_token when potentially adding or removing
2705                  * objects (collapse requires both).
2706                  */
2707                 lwkt_gettoken(&vm_token);
2708                 lwkt_gettoken(&vmobj_token);
2709
2710                 if (object == &kernel_object) {
2711                         vm_object_page_remove(object, offidxstart,
2712                                               offidxend, FALSE);
2713                 } else {
2714                         pmap_remove(map->pmap, s, e);
2715
2716                         if (object != NULL &&
2717                             object->ref_count != 1 &&
2718                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
2719                              OBJ_ONEMAPPING &&
2720                             (object->type == OBJT_DEFAULT ||
2721                              object->type == OBJT_SWAP)) {
2722                                 vm_object_collapse(object);
2723                                 vm_object_page_remove(object, offidxstart,
2724                                                       offidxend, FALSE);
2725                                 if (object->type == OBJT_SWAP) {
2726                                         swap_pager_freespace(object,
2727                                                              offidxstart,
2728                                                              count);
2729                                 }
2730                                 if (offidxend >= object->size &&
2731                                     offidxstart < object->size) {
2732                                         object->size = offidxstart;
2733                                 }
2734                         }
2735                 }
2736                 lwkt_reltoken(&vmobj_token);
2737                 lwkt_reltoken(&vm_token);
2738
2739                 /*
2740                  * Delete the entry (which may delete the object) only after
2741                  * removing all pmap entries pointing to its pages.
2742                  * (Otherwise, its page frames may be reallocated, and any
2743                  * modify bits will be set in the wrong object!)
2744                  */
2745                 vm_map_entry_delete(map, entry, countp);
2746                 entry = next;
2747         }
2748         return (KERN_SUCCESS);
2749 }
2750
2751 /*
2752  * Remove the given address range from the target map.
2753  * This is the exported form of vm_map_delete.
2754  *
2755  * No requirements.
2756  */
2757 int
2758 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
2759 {
2760         int result;
2761         int count;
2762
2763         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2764         vm_map_lock(map);
2765         VM_MAP_RANGE_CHECK(map, start, end);
2766         result = vm_map_delete(map, start, end, &count);
2767         vm_map_unlock(map);
2768         vm_map_entry_release(count);
2769
2770         return (result);
2771 }
2772
2773 /*
2774  * Assert that the target map allows the specified privilege on the
2775  * entire address region given.  The entire region must be allocated.
2776  *
2777  * The caller must specify whether the vm_map is already locked or not.
2778  */
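/*
 * Illustrative use (added commentary, sketch only): verifying that an
 * unlocked user map permits reads over [start, end) before copying data
 * out of it might look like:
 *
 *	if (!vm_map_check_protection(map, start, end, VM_PROT_READ, FALSE))
 *		return (EFAULT);
 */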
2779 boolean_t
2780 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
2781                         vm_prot_t protection, boolean_t have_lock)
2782 {
2783         vm_map_entry_t entry;
2784         vm_map_entry_t tmp_entry;
2785         boolean_t result;
2786
2787         if (have_lock == FALSE)
2788                 vm_map_lock_read(map);
2789
2790         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
2791                 if (have_lock == FALSE)
2792                         vm_map_unlock_read(map);
2793                 return (FALSE);
2794         }
2795         entry = tmp_entry;
2796
2797         result = TRUE;
2798         while (start < end) {
2799                 if (entry == &map->header) {
2800                         result = FALSE;
2801                         break;
2802                 }
2803                 /*
2804                  * No holes allowed!
2805                  */
2806
2807                 if (start < entry->start) {
2808                         result = FALSE;
2809                         break;
2810                 }
2811                 /*
2812                  * Check protection associated with entry.
2813                  */
2814
2815                 if ((entry->protection & protection) != protection) {
2816                         result = FALSE;
2817                         break;
2818                 }
2819                 /* go to next entry */
2820
2821                 start = entry->end;
2822                 entry = entry->next;
2823         }
2824         if (have_lock == FALSE)
2825                 vm_map_unlock_read(map);
2826         return (result);
2827 }
2828
2829 /*
2830  * Split the pages in a map entry into a new object.  This affords
2831  * easier removal of unused pages, and keeps object inheritance from
2832  * negatively impacting memory usage.
2833  *
2834  * The vm_map must be exclusively locked.
2835  */
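/*
 * Outline (added commentary): the routine allocates a fresh default or
 * swap object sized to the entry, links it under the original object's
 * backing object, renames the resident pages into it, migrates any swap
 * space with swap_pager_copy(), and finally points the entry at the new
 * object with offset 0 while dropping a reference on the original.
 */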
2836 static void
2837 vm_map_split(vm_map_entry_t entry)
2838 {
2839         vm_page_t m;
2840         vm_object_t orig_object, new_object, source;
2841         vm_offset_t s, e;
2842         vm_pindex_t offidxstart, offidxend, idx;
2843         vm_size_t size;
2844         vm_ooffset_t offset;
2845
2846         orig_object = entry->object.vm_object;
2847         if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
2848                 return;
2849         if (orig_object->ref_count <= 1)
2850                 return;
2851
2852         offset = entry->offset;
2853         s = entry->start;
2854         e = entry->end;
2855
2856         offidxstart = OFF_TO_IDX(offset);
2857         offidxend = offidxstart + OFF_TO_IDX(e - s);
2858         size = offidxend - offidxstart;
2859
2860         switch(orig_object->type) {
2861         case OBJT_DEFAULT:
2862                 new_object = default_pager_alloc(NULL, IDX_TO_OFF(size),
2863                                                  VM_PROT_ALL, 0);
2864                 break;
2865         case OBJT_SWAP:
2866                 new_object = swap_pager_alloc(NULL, IDX_TO_OFF(size),
2867                                               VM_PROT_ALL, 0);
2868                 break;
2869         default:
2870                 /* not reached */
2871                 new_object = NULL;
2872                 KKASSERT(0);
2873         }
2874         if (new_object == NULL)
2875                 return;
2876
2877         /*
2878          * vm_token and vmobj_token are required when manipulating vm_objects.
2879          */
2880         lwkt_gettoken(&vm_token);
2881         lwkt_gettoken(&vmobj_token);
2882
2883         source = orig_object->backing_object;
2884         if (source != NULL) {
2885                 /* Referenced by new_object */
2886                 vm_object_reference_locked(source);
2887                 LIST_INSERT_HEAD(&source->shadow_head,
2888                                  new_object, shadow_list);
2889                 vm_object_clear_flag(source, OBJ_ONEMAPPING);
2890                 new_object->backing_object_offset = 
2891                         orig_object->backing_object_offset +
2892                         IDX_TO_OFF(offidxstart);
2893                 new_object->backing_object = source;
2894                 source->shadow_count++;
2895                 source->generation++;
2896         }
2897
2898         for (idx = 0; idx < size; idx++) {
2899                 vm_page_t m;
2900
2901         retry:
2902                 m = vm_page_lookup(orig_object, offidxstart + idx);
2903                 if (m == NULL)
2904                         continue;
2905
2906                 /*
2907                  * We must wait for pending I/O to complete before we can
2908                  * rename the page.
2909                  *
2910                  * We do not have to VM_PROT_NONE the page as mappings should
2911                  * not be changed by this operation.
2912                  */
2913                 if (vm_page_sleep_busy(m, TRUE, "spltwt"))
2914                         goto retry;
2915                 vm_page_busy(m);
2916                 vm_page_rename(m, new_object, idx);
2917                 /* page automatically made dirty by rename and cache handled */
2918                 vm_page_busy(m);
2919         }
2920
2921         if (orig_object->type == OBJT_SWAP) {
2922                 vm_object_pip_add(orig_object, 1);
2923                 /*
2924                  * copy orig_object pages into new_object
2925                  * and destroy unneeded pages in
2926                  * shadow object.
2927                  */
2928                 swap_pager_copy(orig_object, new_object, offidxstart, 0);
2929                 vm_object_pip_wakeup(orig_object);
2930         }
2931
2932         /*
2933          * Wakeup the pages we played with.  No spl protection is needed
2934          * for a simple wakeup.
2935          */
2936         for (idx = 0; idx < size; idx++) {
2937                 m = vm_page_lookup(new_object, idx);
2938                 if (m)
2939                         vm_page_wakeup(m);
2940         }
2941
2942         entry->object.vm_object = new_object;
2943         entry->offset = 0LL;
2944         vm_object_deallocate_locked(orig_object);
2945         lwkt_reltoken(&vmobj_token);
2946         lwkt_reltoken(&vm_token);
2947 }
2948
2949 /*
2950  * Copies the contents of the source entry to the destination
2951  * entry.  The entries *must* be aligned properly.
2952  *
2953  * The vm_map must be exclusively locked.
2954  * vm_token must be held
2955  */
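/*
 * Outline (added commentary): for unwired source entries the copy is
 * deferred; the source range is write-protected in its pmap, both
 * entries are marked MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY so the copy
 * happens lazily at fault time, and pmap_copy() propagates the existing
 * translations.  Wired entries are copied immediately via
 * vm_fault_copy_entry() since wired pages cannot be made copy-on-write.
 */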
2956 static void
2957 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
2958         vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
2959 {
2960         vm_object_t src_object;
2961
2962         if (dst_entry->maptype == VM_MAPTYPE_SUBMAP)
2963                 return;
2964         if (src_entry->maptype == VM_MAPTYPE_SUBMAP)
2965                 return;
2966
2967         ASSERT_LWKT_TOKEN_HELD(&vm_token);
2968         lwkt_gettoken(&vmobj_token);            /* required for collapse */
2969
2970         if (src_entry->wired_count == 0) {
2971                 /*
2972                  * If the source entry is marked needs_copy, it is already
2973                  * write-protected.
2974                  */
2975                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
2976                         pmap_protect(src_map->pmap,
2977                             src_entry->start,
2978                             src_entry->end,
2979                             src_entry->protection & ~VM_PROT_WRITE);
2980                 }
2981
2982                 /*
2983                  * Make a copy of the object.
2984                  */
2985                 if ((src_object = src_entry->object.vm_object) != NULL) {
2986                         if ((src_object->handle == NULL) &&
2987                                 (src_object->type == OBJT_DEFAULT ||
2988                                  src_object->type == OBJT_SWAP)) {
2989                                 vm_object_collapse(src_object);
2990                                 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
2991                                         vm_map_split(src_entry);
2992                                         src_object = src_entry->object.vm_object;
2993                                 }
2994                         }
2995
2996                         vm_object_reference_locked(src_object);
2997                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
2998                         dst_entry->object.vm_object = src_object;
2999                         src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
3000                         dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
3001                         dst_entry->offset = src_entry->offset;
3002                 } else {
3003                         dst_entry->object.vm_object = NULL;
3004                         dst_entry->offset = 0;
3005                 }
3006
3007                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
3008                     dst_entry->end - dst_entry->start, src_entry->start);
3009         } else {
3010                 /*
3011                  * Of course, wired down pages can't be set copy-on-write.
3012                  * Cause wired pages to be copied into the new map by
3013                  * simulating faults (the new pages are pageable)
3014                  */
3015                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3016         }
3017         lwkt_reltoken(&vmobj_token);
3018 }
3019
3020 /*
3021  * vmspace_fork:
3022  * Create a new process vmspace structure and vm_map
3023  * based on those of an existing process.  The new map
3024  * is based on the old map, according to the inheritance
3025  * values on the regions in that map.
3026  *
3027  * The source map must not be locked.
3028  * No requirements.
3029  */
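/*
 * Illustrative caller (added commentary, sketch only; p1/p2 are
 * hypothetical parent/child proc pointers): the process fork path, when
 * the child does not share the parent's address space, typically does
 * something along the lines of:
 *
 *	p2->p_vmspace = vmspace_fork(p1->p_vmspace);
 *
 * VM_INHERIT_SHARE entries keep referencing the same object, while
 * VM_INHERIT_COPY entries are set up copy-on-write via
 * vm_map_copy_entry().
 */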
3030 struct vmspace *
3031 vmspace_fork(struct vmspace *vm1)
3032 {
3033         struct vmspace *vm2;
3034         vm_map_t old_map = &vm1->vm_map;
3035         vm_map_t new_map;
3036         vm_map_entry_t old_entry;
3037         vm_map_entry_t new_entry;
3038         vm_object_t object;
3039         int count;
3040
3041         lwkt_gettoken(&vm_token);
3042         lwkt_gettoken(&vmspace_token);
3043         lwkt_gettoken(&vmobj_token);
3044         vm_map_lock(old_map);
3045         old_map->infork = 1;
3046
3047         /*
3048          * XXX Note: upcalls are not copied.
3049          */
3050         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
3051         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3052             (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3053         new_map = &vm2->vm_map; /* XXX */
3054         new_map->timestamp = 1;
3055
3056         vm_map_lock(new_map);
3057
3058         count = 0;
3059         old_entry = old_map->header.next;
3060         while (old_entry != &old_map->header) {
3061                 ++count;
3062                 old_entry = old_entry->next;
3063         }
3064
3065         count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3066
3067         old_entry = old_map->header.next;
3068         while (old_entry != &old_map->header) {
3069                 if (old_entry->maptype == VM_MAPTYPE_SUBMAP)
3070                         panic("vm_map_fork: encountered a submap");
3071
3072                 switch (old_entry->inheritance) {
3073                 case VM_INHERIT_NONE:
3074                         break;
3075                 case VM_INHERIT_SHARE:
3076                         /*
3077                          * Clone the entry, creating the shared object if
3078                          * necessary.
3079                          */
3080                         object = old_entry->object.vm_object;
3081                         if (object == NULL) {
3082                                 vm_map_entry_allocate_object(old_entry);
3083                                 object = old_entry->object.vm_object;
3084                         }
3085
3086                         /*
3087                          * Add the reference before calling vm_map_entry_shadow
3088                          * to ensure that a shadow object is created.
3089                          */
3090                         vm_object_reference_locked(object);
3091                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3092                                 vm_map_entry_shadow(old_entry);
3093                                 /* Transfer the second reference too. */
3094                                 vm_object_reference_locked(
3095                                     old_entry->object.vm_object);
3096                                 vm_object_deallocate_locked(object);
3097                                 object = old_entry->object.vm_object;
3098                         }
3099                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
3100
3101                         /*
3102                          * Clone the entry, referencing the shared object.
3103                          */
3104                         new_entry = vm_map_entry_create(new_map, &count);
3105                         *new_entry = *old_entry;
3106                         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3107                         new_entry->wired_count = 0;
3108
3109                         /*
3110                          * Insert the entry into the new map -- we know we're
3111                          * inserting at the end of the new map.
3112                          */
3113
3114                         vm_map_entry_link(new_map, new_map->header.prev,
3115                                           new_entry);
3116
3117                         /*
3118                          * Update the physical map
3119                          */
3120                         pmap_copy(new_map->pmap, old_map->pmap,
3121                             new_entry->start,
3122                             (old_entry->end - old_entry->start),
3123                             old_entry->start);
3124                         break;
3125                 case VM_INHERIT_COPY:
3126                         /*
3127                          * Clone the entry and link into the map.
3128                          */
3129                         new_entry = vm_map_entry_create(new_map, &count);
3130                         *new_entry = *old_entry;
3131                         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3132                         new_entry->wired_count = 0;
3133                         new_entry->object.vm_object = NULL;
3134                         vm_map_entry_link(new_map, new_map->header.prev,
3135                                           new_entry);
3136                         vm_map_copy_entry(old_map, new_map, old_entry,
3137                                           new_entry);
3138                         break;
3139                 }
3140                 old_entry = old_entry->next;
3141         }
3142
3143         new_map->size = old_map->size;
3144         old_map->infork = 0;
3145         vm_map_unlock(old_map);
3146         vm_map_unlock(new_map);
3147         vm_map_entry_release(count);
3148
3149         lwkt_reltoken(&vmobj_token);
3150         lwkt_reltoken(&vmspace_token);
3151         lwkt_reltoken(&vm_token);
3152
3153         return (vm2);
3154 }
3155
3156 /*
3157  * Create an auto-grow stack entry
3158  *
3159  * No requirements.
3160  */
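/*
 * Illustrative use (added commentary, sketch only): creating a
 * grow-down stack of at most 'maxssize' bytes ending at 'stacktop'
 * could look like the following; both names are hypothetical, and
 * 'flags' may include MAP_FIXED/MAP_TRYFIXED to pin the address.
 *
 *	rv = vm_map_stack(map, stacktop - maxssize, maxssize, flags,
 *			  VM_PROT_ALL, VM_PROT_ALL, 0);
 *
 * Only the initial sgrowsiz portion is mapped up front; the rest is
 * recorded in aux.avail_ssize and materialized by vm_map_growstack().
 */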
3161 int
3162 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3163               int flags, vm_prot_t prot, vm_prot_t max, int cow)
3164 {
3165         vm_map_entry_t  prev_entry;
3166         vm_map_entry_t  new_stack_entry;
3167         vm_size_t       init_ssize;
3168         int             rv;
3169         int             count;
3170         vm_offset_t     tmpaddr;
3171
3172         cow |= MAP_IS_STACK;
3173
3174         if (max_ssize < sgrowsiz)
3175                 init_ssize = max_ssize;
3176         else
3177                 init_ssize = sgrowsiz;
3178
3179         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3180         vm_map_lock(map);
3181
3182         /*
3183          * Find space for the mapping
3184          */
3185         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3186                 if (vm_map_findspace(map, addrbos, max_ssize, 1,
3187                                      flags, &tmpaddr)) {
3188                         vm_map_unlock(map);
3189                         vm_map_entry_release(count);
3190                         return (KERN_NO_SPACE);
3191                 }
3192                 addrbos = tmpaddr;
3193         }
3194
3195         /* If addr is already mapped, no go */
3196         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
3197                 vm_map_unlock(map);
3198                 vm_map_entry_release(count);
3199                 return (KERN_NO_SPACE);
3200         }
3201
3202 #if 0
3203         /* XXX already handled by kern_mmap() */
3204         /* If we would blow our VMEM resource limit, no go */
3205         if (map->size + init_ssize >
3206             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3207                 vm_map_unlock(map);
3208                 vm_map_entry_release(count);
3209                 return (KERN_NO_SPACE);
3210         }
3211 #endif
3212
3213         /*
3214          * If we can't accommodate max_ssize in the current mapping,
3215          * no go.  However, we need to be aware that subsequent user
3216          * mappings might map into the space we have reserved for
3217          * stack, and currently this space is not protected.  
3218          * 
3219          * Hopefully we will at least detect this condition 
3220          * when we try to grow the stack.
3221          */
3222         if ((prev_entry->next != &map->header) &&
3223             (prev_entry->next->start < addrbos + max_ssize)) {
3224                 vm_map_unlock(map);
3225                 vm_map_entry_release(count);
3226                 return (KERN_NO_SPACE);
3227         }
3228
3229         /*
3230          * We initially map a stack of only init_ssize.  We will
3231          * grow as needed later.  Since this is to be a grow 
3232          * down stack, we map at the top of the range.
3233          *
3234          * Note: we would normally expect prot and max to be
3235          * VM_PROT_ALL, and cow to be 0.  Possibly we should
3236          * eliminate these as input parameters, and just
3237          * pass these values here in the insert call.
3238          */
3239         rv = vm_map_insert(map, &count,
3240                            NULL, 0, addrbos + max_ssize - init_ssize,
3241                            addrbos + max_ssize,
3242                            VM_MAPTYPE_NORMAL,
3243                            prot, max,
3244                            cow);
3245
3246         /* Now set the avail_ssize amount */
3247         if (rv == KERN_SUCCESS) {
3248                 if (prev_entry != &map->header)
3249                         vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
3250                 new_stack_entry = prev_entry->next;
3251                 if (new_stack_entry->end   != addrbos + max_ssize ||
3252                     new_stack_entry->start != addrbos + max_ssize - init_ssize)
3253                         panic ("Bad entry start/end for new stack entry");
3254                 else 
3255                         new_stack_entry->aux.avail_ssize = max_ssize - init_ssize;
3256         }
3257
3258         vm_map_unlock(map);
3259         vm_map_entry_release(count);
3260         return (rv);
3261 }
3262
3263 /*
3264  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3265  * desired address is already mapped, or if we successfully grow
3266  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3267  * stack range (this is strange, but preserves compatibility with
3268  * the grow function in vm_machdep.c).
3269  *
3270  * No requirements.
3271  */
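/*
 * Illustrative caller (added commentary, sketch only): the user page
 * fault path can attempt stack growth before the normal lookup, e.g.
 *
 *	if (map != &kernel_map && curproc != NULL)
 *		vm_map_growstack(curproc, va);
 *
 * where 'va' is the faulting address; the exact call site and guards
 * are an assumption here, not taken from this file.
 */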
3272 int
3273 vm_map_growstack (struct proc *p, vm_offset_t addr)
3274 {
3275         vm_map_entry_t prev_entry;
3276         vm_map_entry_t stack_entry;
3277         vm_map_entry_t new_stack_entry;
3278         struct vmspace *vm = p->p_vmspace;
3279         vm_map_t map = &vm->vm_map;
3280         vm_offset_t    end;
3281         int grow_amount;
3282         int rv = KERN_SUCCESS;
3283         int is_procstack;
3284         int use_read_lock = 1;
3285         int count;
3286
3287         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3288 Retry:
3289         if (use_read_lock)
3290                 vm_map_lock_read(map);
3291         else
3292                 vm_map_lock(map);
3293
3294         /* If addr is already in the entry range, no need to grow.*/
3295         if (vm_map_lookup_entry(map, addr, &prev_entry))
3296                 goto done;
3297
3298         if ((stack_entry = prev_entry->next) == &map->header)
3299                 goto done;
3300         if (prev_entry == &map->header) 
3301                 end = stack_entry->start - stack_entry->aux.avail_ssize;
3302         else
3303                 end = prev_entry->end;
3304
3305         /*
3306          * This next test mimics the old grow function in vm_machdep.c.
3307          * It really doesn't quite make sense, but we do it anyway
3308          * for compatibility.
3309          *
3310          * If the stack is not growable, return success.  This signals
3311          * the caller to proceed as it normally would with ordinary VM.
3312          */
3313         if (stack_entry->aux.avail_ssize < 1 ||
3314             addr >= stack_entry->start ||
3315             addr <  stack_entry->start - stack_entry->aux.avail_ssize) {
3316                 goto done;
3317         } 
3318         
3319         /* Find the minimum grow amount */
3320         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
3321         if (grow_amount > stack_entry->aux.avail_ssize) {
3322                 rv = KERN_NO_SPACE;
3323                 goto done;
3324         }
3325
3326         /*
3327          * If there is no longer enough space between the entries,
3328          * fail and adjust the available space downward.  Note: this
3329          * should only happen if the user has mapped into the
3330          * stack area after the stack was created, and is
3331          * probably an error.
3332          *
3333          * This also effectively destroys any guard page the user
3334          * might have intended by limiting the stack size.
3335          */
3336         if (grow_amount > stack_entry->start - end) {
3337                 if (use_read_lock && vm_map_lock_upgrade(map)) {
3338                         use_read_lock = 0;
3339                         goto Retry;
3340                 }
3341                 use_read_lock = 0;
3342                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3343                 rv = KERN_NO_SPACE;
3344                 goto done;
3345         }
3346
3347         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
3348
3349         /* If this is the main process stack, see if we're over the 
3350          * stack limit.
3351          */
3352         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3353                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3354                 rv = KERN_NO_SPACE;
3355                 goto done;
3356         }
3357
3358         /* Round up the grow amount to a multiple of sgrowsiz */
3359         grow_amount = roundup (grow_amount, sgrowsiz);
3360         if (grow_amount > stack_entry->aux.avail_ssize) {
3361                 grow_amount = stack_entry->aux.avail_ssize;
3362         }
3363         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
3364                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3365                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
3366                               ctob(vm->vm_ssize);
3367         }
3368
3369         /* If we would blow our VMEM resource limit, no go */
3370         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3371                 rv = KERN_NO_SPACE;
3372                 goto done;
3373         }
3374
3375         if (use_read_lock && vm_map_lock_upgrade(map)) {
3376                 use_read_lock = 0;
3377                 goto Retry;
3378         }
3379         use_read_lock = 0;
3380
3381         /* Get the preliminary new entry start value */
3382         addr = stack_entry->start - grow_amount;
3383
3384         /* If this puts us into the previous entry, cut back our growth
3385          * to the available space.  Also, see the note above.
3386          */
3387         if (addr < end) {
3388                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3389                 addr = end;
3390         }
3391
3392         rv = vm_map_insert(map, &count,
3393                            NULL, 0, addr, stack_entry->start,
3394                            VM_MAPTYPE_NORMAL,
3395                            VM_PROT_ALL, VM_PROT_ALL,
3396                            0);
3397
3398         /* Adjust the available stack space by the amount we grew. */
3399         if (rv == KERN_SUCCESS) {
3400                 if (prev_entry != &map->header)
3401                         vm_map_clip_end(map, prev_entry, addr, &count);
3402                 new_stack_entry = prev_entry->next;
3403                 if (new_stack_entry->end   != stack_entry->start  ||
3404                     new_stack_entry->start != addr)
3405                         panic ("Bad stack grow start/end in new stack entry");
3406                 else {
3407                         new_stack_entry->aux.avail_ssize =
3408                                 stack_entry->aux.avail_ssize -
3409                                 (new_stack_entry->end - new_stack_entry->start);
3410                         if (is_procstack)
3411                                 vm->vm_ssize += btoc(new_stack_entry->end -
3412                                                      new_stack_entry->start);
3413                 }
3414
3415                 if (map->flags & MAP_WIREFUTURE)
3416                         vm_map_unwire(map, new_stack_entry->start,
3417                                       new_stack_entry->end, FALSE);
3418         }
3419
3420 done:
3421         if (use_read_lock)
3422                 vm_map_unlock_read(map);
3423         else
3424                 vm_map_unlock(map);
3425         vm_map_entry_release(count);
3426         return (rv);
3427 }
3428
3429 /*
3430  * Unshare the specified VM space for exec.  If other processes are
3431  * mapped to it, then create a new one.  The new vmspace starts out empty.
3432  *
3433  * No requirements.
3434  */
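/*
 * Added commentary: this is intended for the exec path; after it
 * returns, the process runs on a fresh (or forked resident) vmspace and
 * the reference on the old vmspace has been dropped via sysref_put().
 */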
3435 void
3436 vmspace_exec(struct proc *p, struct vmspace *vmcopy) 
3437 {
3438         struct vmspace *oldvmspace = p->p_vmspace;
3439         struct vmspace *newvmspace;
3440         vm_map_t map = &p->p_vmspace->vm_map;
3441
3442         /*
3443          * If we are execing a resident vmspace we fork it, otherwise
3444          * we create a new vmspace.  Note that exitingcnt and upcalls
3445          * are not copied to the new vmspace.
3446          */
3447         lwkt_gettoken(&vmspace_token);
3448         if (vmcopy)  {
3449                 newvmspace = vmspace_fork(vmcopy);
3450         } else {
3451                 newvmspace = vmspace_alloc(map->min_offset, map->max_offset);
3452                 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
3453                       (caddr_t)&oldvmspace->vm_endcopy -
3454                        (caddr_t)&oldvmspace->vm_startcopy);
3455         }
3456
3457         /*
3458          * Finish initializing the vmspace before assigning it
3459          * to the process.  The vmspace will become the current vmspace
3460          * if p == curproc.
3461          */
3462         pmap_pinit2(vmspace_pmap(newvmspace));
3463         pmap_replacevm(p, newvmspace, 0);
3464         sysref_put(&oldvmspace->vm_sysref);
3465         lwkt_reltoken(&vmspace_token);
3466 }
3467
3468 /*
3469  * Unshare the specified VM space for forcing COW.  This
3470  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3471  *
3472  * The exitingcnt test is not strictly necessary but has been
3473  * included for code sanity (to make the code a bit more deterministic).
3474  */
3475 void
3476 vmspace_unshare(struct proc *p) 
3477 {
3478         struct vmspace *oldvmspace = p->p_vmspace;
3479         struct vmspace *newvmspace;
3480
3481         lwkt_gettoken(&vmspace_token);
3482         if (oldvmspace->vm_sysref.refcnt == 1 &&
                 oldvmspace->vm_exitingcnt == 0) {
3483                 /* nothing shared; drop vmspace_token before returning */
                     lwkt_reltoken(&vmspace_token);
                     return;
             }
3484         newvmspace = vmspace_fork(oldvmspace);
3485         pmap_pinit2(vmspace_pmap(newvmspace));
3486         pmap_replacevm(p, newvmspace, 0);
3487         sysref_put(&oldvmspace->vm_sysref);
3488         lwkt_reltoken(&vmspace_token);
3489 }
3490
3491 /*
3492  * vm_map_hint: return the beginning of the best area suitable for
3493  * creating a new mapping with "prot" protection.
3494  *
3495  * No requirements.
3496  */
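/*
 * Added commentary: without randomize_mmap, hints that are zero or fall
 * inside the text/heap region are pushed up to just past
 * vm_daddr + maxdsiz.  With randomization enabled, an explicit hint at
 * or above the data segment is honored; otherwise a page-aligned random
 * offset of up to MIN(256MB, MAXDSIZ) past the data segment is chosen.
 */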
3497 vm_offset_t
3498 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
3499 {
3500         struct vmspace *vms = p->p_vmspace;
3501
3502         if (!randomize_mmap) {
3503                 /*
3504                  * Set a reasonable start point for the hint if it was
3505                  * not specified or if it falls within the heap space.
3506                  * Hinted mmap()s do not allocate out of the heap space.
3507                  */
3508                 if (addr == 0 ||
3509                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
3510                      addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) {
3511                         addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
3512                 }
3513
3514                 return addr;
3515         }
3516
3517         if (addr != 0 && addr >= (vm_offset_t)vms->vm_daddr)
3518                 return addr;
3519
3520 #ifdef notyet
3521 #ifdef __i386__
3522         /*
3523          * If executable skip first two pages, otherwise start
3524          * after data + heap region.
3525          */
3526         if ((prot & VM_PROT_EXECUTE) &&
3527             ((vm_offset_t)vms->vm_daddr >= I386_MAX_EXE_ADDR)) {
3528                 addr = (PAGE_SIZE * 2) +
3529                     (karc4random() & (I386_MAX_EXE_ADDR / 2 - 1));
3530                 return (round_page(addr));
3531         }
3532 #endif /* __i386__ */
3533 #endif /* notyet */
3534
3535         addr = (vm_offset_t)vms->vm_daddr + MAXDSIZ;
3536         addr += karc4random() & (MIN((256 * 1024 * 1024), MAXDSIZ) - 1);
3537
3538         return (round_page(addr));
3539 }
3540
3541 /*
3542  * Finds the VM object, offset, and protection for a given virtual address
3543  * in the specified map, assuming a page fault of the type specified.
3544  *
3545  * Leaves the map in question locked for read; return values are guaranteed
3546  * until a vm_map_lookup_done call is performed.  Note that the map argument
3547  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
3548  *
3549  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
3550  * that fast.
3551  *
3552  * If a lookup is requested with "write protection" specified, the map may
3553  * be changed to perform virtual copying operations, although the data
3554  * referenced will remain the same.
3555  *
3556  * No requirements.
3557  */
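/*
 * Illustrative use (added commentary, sketch only): a fault handler
 * pairs the lookup with vm_map_lookup_done() once it has finished with
 * the returned object and pindex:
 *
 *	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry,
 *			   &object, &pindex, &prot, &wired);
 *	if (rv != KERN_SUCCESS)
 *		return (rv);
 *	... fault in the page at (object, pindex) ...
 *	vm_map_lookup_done(map, entry, 0);
 */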
3558 int
3559 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
3560               vm_offset_t vaddr,
3561               vm_prot_t fault_typea,
3562               vm_map_entry_t *out_entry,        /* OUT */
3563               vm_object_t *object,              /* OUT */
3564               vm_pindex_t *pindex,              /* OUT */
3565               vm_prot_t *out_prot,              /* OUT */
3566               boolean_t *wired)                 /* OUT */
3567 {
3568         vm_map_entry_t entry;
3569         vm_map_t map = *var_map;
3570         vm_prot_t prot;
3571         vm_prot_t fault_type = fault_typea;
3572         int use_read_lock = 1;
3573         int rv = KERN_SUCCESS;
3574
3575 RetryLookup:
3576         if (use_read_lock)
3577                 vm_map_lock_read(map);
3578         else
3579                 vm_map_lock(map);
3580
3581         /*
3582          * If the map has an interesting hint, try it before calling full
3583          * blown lookup routine.
3584          */
3585         entry = map->hint;
3586         *out_entry = entry;
3587
3588         if ((entry == &map->header) ||
3589             (vaddr < entry->start) || (vaddr >= entry->end)) {
3590                 vm_map_entry_t tmp_entry;
3591
3592                 /*
3593                  * Entry was either not a valid hint, or the vaddr was not
3594                  * contained in the entry, so do a full lookup.
3595                  */
3596                 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
3597                         rv = KERN_INVALID_ADDRESS;
3598                         goto done;
3599                 }
3600
3601                 entry = tmp_entry;
3602                 *out_entry = entry;
3603         }
3604         
3605         /*
3606          * Handle submaps.
3607          */
3608         if (entry->maptype == VM_MAPTYPE_SUBMAP) {
3609                 vm_map_t old_map = map;
3610
3611                 *var_map = map = entry->object.sub_map;
3612                 if (use_read_lock)
3613                         vm_map_unlock_read(old_map);
3614                 else
3615                         vm_map_unlock(old_map);
3616                 use_read_lock = 1;
3617                 goto RetryLookup;
3618         }
3619
3620         /*
3621          * Check whether this task is allowed to have this page.
3622          * Note the special case for MAP_ENTRY_COW
3623          * pages with an override.  This is to implement a forced
3624          * COW for debuggers.
3625          */
3626
3627         if (fault_type & VM_PROT_OVERRIDE_WRITE)
3628                 prot = entry->max_protection;
3629         else
3630                 prot = entry->protection;
3631
3632         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
3633         if ((fault_type & prot) != fault_type) {
3634                 rv = KERN_PROTECTION_FAILURE;
3635                 goto done;
3636         }
3637
3638         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3639             (entry->eflags & MAP_ENTRY_COW) &&
3640             (fault_type & VM_PROT_WRITE) &&
3641             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
3642                 rv = KERN_PROTECTION_FAILURE;
3643                 goto done;
3644         }
3645
3646         /*
3647          * If this page is not pageable, we have to get it for all possible
3648          * accesses.
3649          */
3650         *wired = (entry->wired_count != 0);
3651         if (*wired)
3652                 prot = fault_type = entry->protection;
3653
3654         /*
3655          * Virtual page tables may need to update the accessed (A) bit
3656          * in a page table entry.  Upgrade the fault to a write fault for
3657          * that case if the map will support it.  If the map does not support
3658          * it the page table entry simply will not be updated.
3659          */
3660         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
3661                 if (prot & VM_PROT_WRITE)
3662                         fault_type |= VM_PROT_WRITE;
3663         }
3664
3665         /*
3666          * If the entry was copy-on-write, we either copy it now or demote the allowed access.
3667          */
3668         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3669                 /*
3670                  * If we want to write the page, we may as well handle that
3671                  * now since we've got the map locked.
3672                  *
3673                  * If we don't need to write the page, we just demote the
3674                  * permissions allowed.
3675                  */
3676
3677                 if (fault_type & VM_PROT_WRITE) {
3678                         /*
3679                          * Make a new object, and place it in the object
3680                          * chain.  Note that no new references have appeared
3681                          * -- one just moved from the map to the new
3682                          * object.
3683                          */
3684
3685                         if (use_read_lock && vm_map_lock_upgrade(map)) {
3686                                 use_read_lock = 0;
3687                                 goto RetryLookup;
3688                         }
3689                         use_read_lock = 0;
3690
3691                         vm_map_entry_shadow(entry);
3692                 } else {
3693                         /*
3694                          * We're attempting to read a copy-on-write page --
3695                          * don't allow writes.
3696                          */
3697
3698                         prot &= ~VM_PROT_WRITE;
3699                 }
3700         }
3701
3702         /*
3703          * Create an object if necessary.
3704          */
3705         if (entry->object.vm_object == NULL &&
3706             !map->system_map) {
3707                 if (use_read_lock && vm_map_lock_upgrade(map))  {
3708                         use_read_lock = 0;
3709                         goto RetryLookup;
3710                 }
3711                 use_read_lock = 0;
3712                 vm_map_entry_allocate_object(entry);
3713         }
3714
3715         /*
3716          * Return the object/offset from this entry.  If the entry was
3717          * copy-on-write or empty, it has been fixed up.
3718          */
3719
3720         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
3721         *object = entry->object.vm_object;
3722
3723         /*
3724          * Return the protection actually granted.  On
3725          * success we return with a read lock held on the map.  On failure
3726          * we return with the map unlocked.
3727          */
3728         *out_prot = prot;
3729 done:
3730         if (rv == KERN_SUCCESS) {
3731                 if (use_read_lock == 0)
3732                         vm_map_lock_downgrade(map);
3733         } else if (use_read_lock) {
3734                 vm_map_unlock_read(map);
3735         } else {
3736                 vm_map_unlock(map);
3737         }
3738         return (rv);
3739 }
3740
3741 /*
3742  * Releases locks acquired by a vm_map_lookup()
3743  * (according to the handle returned by that lookup).
3744  *
3745  * No other requirements.
3746  */
3747 void
3748 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
3749 {
3750         /*
3751          * Unlock the main-level map
3752          */
3753         vm_map_unlock_read(map);
3754         if (count)
3755                 vm_map_entry_release(count);
3756 }
3757
3758 #include "opt_ddb.h"
3759 #ifdef DDB
3760 #include <sys/kernel.h>
3761
3762 #include <ddb/ddb.h>
3763
3764 /*
3765  * Debugging only
3766  */
3767 DB_SHOW_COMMAND(map, vm_map_print)
3768 {
3769         static int nlines;
3770         /* XXX convert args. */
3771         vm_map_t map = (vm_map_t)addr;
3772         boolean_t full = have_addr;
3773
3774         vm_map_entry_t entry;
3775
3776         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
3777             (void *)map,
3778             (void *)map->pmap, map->nentries, map->timestamp);
3779         nlines++;
3780
3781         if (!full && db_indent)
3782                 return;
3783
3784         db_indent += 2;
3785         for (entry = map->header.next; entry != &map->header;
3786             entry = entry->next) {
3787                 db_iprintf("map entry %p: start=%p, end=%p\n",
3788                     (void *)entry, (void *)entry->start, (void *)entry->end);
3789                 nlines++;
3790                 {
3791                         static char *inheritance_name[4] =
3792                         {"share", "copy", "none", "donate_copy"};
3793
3794                         db_iprintf(" prot=%x/%x/%s",
3795                             entry->protection,
3796                             entry->max_protection,
3797                             inheritance_name[(int)(unsigned char)entry->inheritance]);
3798                         if (entry->wired_count != 0)
3799                                 db_printf(", wired");
3800                 }
3801                 if (entry->maptype == VM_MAPTYPE_SUBMAP) {
3802                         /* XXX no %qd in kernel.  Truncate entry->offset. */
3803                         db_printf(", share=%p, offset=0x%lx\n",
3804                             (void *)entry->object.sub_map,
3805                             (long)entry->offset);
3806                         nlines++;
3807                         if ((entry->prev == &map->header) ||
3808                             (entry->prev->object.sub_map !=
3809                                 entry->object.sub_map)) {
3810                                 db_indent += 2;
3811                                 vm_map_print((db_expr_t)(intptr_t)
3812                                              entry->object.sub_map,
3813                                              full, 0, NULL);
3814                                 db_indent -= 2;
3815                         }
3816                 } else {
3817                         /* XXX no %qd in kernel.  Truncate entry->offset. */
3818                         db_printf(", object=%p, offset=0x%lx",
3819                             (void *)entry->object.vm_object,
3820                             (long)entry->offset);
3821                         if (entry->eflags & MAP_ENTRY_COW)
3822                                 db_printf(", copy (%s)",
3823                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
3824                         db_printf("\n");
3825                         nlines++;
3826
3827                         if ((entry->prev == &map->header) ||
3828                             (entry->prev->object.vm_object !=
3829                                 entry->object.vm_object)) {
3830                                 db_indent += 2;
3831                                 vm_object_print((db_expr_t)(intptr_t)
3832                                                 entry->object.vm_object,
3833                                                 full, 0, NULL);
3834                                 nlines += 4;
3835                                 db_indent -= 2;
3836                         }
3837                 }
3838         }
3839         db_indent -= 2;
3840         if (db_indent == 0)
3841                 nlines = 0;
3842 }
3843
3844 /*
3845  * Debugging only
3846  */
3847 DB_SHOW_COMMAND(procvm, procvm)
3848 {
3849         struct proc *p;
3850
3851         if (have_addr) {
3852                 p = (struct proc *) addr;
3853         } else {
3854                 p = curproc;
3855         }
3856
3857         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
3858             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
3859             (void *)vmspace_pmap(p->p_vmspace));
3860
3861         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
3862 }
3863
3864 #endif /* DDB */