1 /*
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003-2017 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
65  */
66
67 /*
68  *      Virtual memory mapping module.
69  */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/proc.h>
75 #include <sys/serialize.h>
76 #include <sys/lock.h>
77 #include <sys/vmmeter.h>
78 #include <sys/mman.h>
79 #include <sys/vnode.h>
80 #include <sys/resourcevar.h>
81 #include <sys/shm.h>
82 #include <sys/tree.h>
83 #include <sys/malloc.h>
84 #include <sys/objcache.h>
85 #include <sys/kern_syscall.h>
86
87 #include <vm/vm.h>
88 #include <vm/vm_param.h>
89 #include <vm/pmap.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_page.h>
92 #include <vm/vm_object.h>
93 #include <vm/vm_pager.h>
94 #include <vm/vm_kern.h>
95 #include <vm/vm_extern.h>
96 #include <vm/swap_pager.h>
97 #include <vm/vm_zone.h>
98
99 #include <sys/random.h>
100 #include <sys/sysctl.h>
101 #include <sys/spinlock.h>
102
103 #include <sys/thread2.h>
104 #include <sys/spinlock2.h>
105
106 /*
107  * Virtual memory maps provide for the mapping, protection, and sharing
108  * of virtual memory objects.  In addition, this module provides for an
109  * efficient virtual copy of memory from one map to another.
110  *
111  * Synchronization is required prior to most operations.
112  *
113  * Maps consist of an ordered doubly-linked list of simple entries.
114  * A hint and an RB tree are used to speed up lookups.
115  *
116  * Callers looking to modify maps specify start/end addresses which cause
117  * the related map entry to be clipped if necessary, and then later
118  * recombined if the pieces remain compatible.
119  *
120  * Virtual copy operations are performed by copying VM object references
121  * from one map to another, and then marking both regions as copy-on-write.
122  */
123 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
124 static void vmspace_dtor(void *obj, void *privdata);
125 static void vmspace_terminate(struct vmspace *vm, int final);
126
127 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
128 MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
129 static struct objcache *vmspace_cache;
130
131 /*
132  * per-cpu page table cross mappings are initialized in early boot
133  * and might require a considerable number of vm_map_entry structures.
134  */
135 #define MAPENTRYBSP_CACHE       (MAXCPU+1)
136 #define MAPENTRYAP_CACHE        8
137
138 /*
139  * Partitioning threaded programs with large anonymous memory areas can
140  * improve concurrent fault performance.
141  */
142 #define MAP_ENTRY_PARTITION_SIZE        ((vm_offset_t)(32 * 1024 * 1024))
143 #define MAP_ENTRY_PARTITION_MASK        (MAP_ENTRY_PARTITION_SIZE - 1)
144
145 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)    \
146         ((((entry)->start ^ (entry)->end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
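
/*
 * Worked example for the macro above (illustration only, using the
 * definitions MAP_ENTRY_PARTITION_SIZE = 0x02000000 and
 * MAP_ENTRY_PARTITION_MASK = 0x01FFFFFF):
 *
 *      start = 0x02100000, end = 0x02180000
 *          start ^ end      = 0x00080000
 *          ... & ~MASK      = 0           -> entry within one partition
 *
 *      start = 0x01F00000, end = 0x02100000
 *          start ^ end      = 0x03E00000
 *          ... & ~MASK      = 0x02000000  -> entry spans a partition boundary
 */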
147
148 static struct vm_zone mapentzone_store;
149 static vm_zone_t mapentzone;
150
151 static struct vm_map_entry map_entry_init[MAX_MAPENT];
152 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
153 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
154
155 static int randomize_mmap;
156 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
157     "Randomize mmap offsets");
158 static int vm_map_relock_enable = 1;
159 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
160            &vm_map_relock_enable, 0, "insert pop pgtable optimization");
161 static int vm_map_partition_enable = 1;
162 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
163            &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
164 static int vm_map_backing_limit = 5;
165 SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
166            &vm_map_backing_limit, 0, "ba.backing_ba link depth");
167
168 static void vmspace_drop_notoken(struct vmspace *vm);
169 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
170 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
171 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
172 static void vm_map_entry_dispose_ba (vm_map_backing_t ba);
173 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
174 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
175 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
176 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
177 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
178                 vm_map_entry_t);
179 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
180                 vm_offset_t start, vm_offset_t end, int *countp, int flags);
181 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
182                 vm_offset_t vaddr, int *countp);
183
184 /*
185  * Initialize the vm_map module.  Must be called before any other vm_map
186  * routines.
187  *
188  * Map and entry structures are allocated from the general purpose
189  * memory pool with some exceptions:
190  *
191  *      - The kernel map is allocated statically.
192  *      - Initial kernel map entries are allocated out of a static pool.
193  *      - We must set ZONE_SPECIAL here or the early boot code can get
194  *        stuck if there are >63 cores.
195  *
196  *      These restrictions are necessary since malloc() uses the
197  *      maps and requires map entries.
198  *
199  * Called from the low level boot code only.
200  */
201 void
202 vm_map_startup(void)
203 {
204         mapentzone = &mapentzone_store;
205         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
206                   map_entry_init, MAX_MAPENT);
207         mapentzone_store.zflags |= ZONE_SPECIAL;
208 }
209
210 /*
211  * Called prior to any vmspace allocations.
212  *
213  * Called from the low level boot code only.
214  */
215 void
216 vm_init2(void) 
217 {
218         vmspace_cache = objcache_create_mbacked(M_VMSPACE,
219                                                 sizeof(struct vmspace),
220                                                 0, ncpus * 4,
221                                                 vmspace_ctor, vmspace_dtor,
222                                                 NULL);
223         zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
224         pmap_init2();
225         vm_object_init2();
226 }
227
228 /*
229  * objcache support.  We leave the pmap root cached as long as possible
230  * for performance reasons.
231  */
232 static
233 boolean_t
234 vmspace_ctor(void *obj, void *privdata, int ocflags)
235 {
236         struct vmspace *vm = obj;
237
238         bzero(vm, sizeof(*vm));
239         vm->vm_refcnt = VM_REF_DELETED;
240
241         return 1;
242 }
243
244 static
245 void
246 vmspace_dtor(void *obj, void *privdata)
247 {
248         struct vmspace *vm = obj;
249
250         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
251         pmap_puninit(vmspace_pmap(vm));
252 }
253
254 /*
255  * Red black tree functions
256  *
257  * The caller must hold the related map lock.
258  */
259 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
260 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
261
262 /* a->start is address, and the only field which must be initialized */
263 static int
264 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
265 {
266         if (a->start < b->start)
267                 return(-1);
268         else if (a->start > b->start)
269                 return(1);
270         return(0);
271 }
272
273 /*
274  * Initialize vmspace ref/hold counts (also used for vmspace0).  There is a holdcnt for
275  * every refcnt.
276  */
277 void
278 vmspace_initrefs(struct vmspace *vm)
279 {
280         vm->vm_refcnt = 1;
281         vm->vm_holdcnt = 1;
282 }
283
284 /*
285  * Allocate a vmspace structure, including a vm_map and pmap.
286  * Initialize numerous fields.  While the initial allocation is zeroed,
287  * subsequent reuse from the objcache leaves elements of the structure
288  * intact (particularly the pmap), so portions must be zeroed.
289  *
290  * Returns a referenced vmspace.
291  *
292  * No requirements.
293  */
294 struct vmspace *
295 vmspace_alloc(vm_offset_t min, vm_offset_t max)
296 {
297         struct vmspace *vm;
298
299         vm = objcache_get(vmspace_cache, M_WAITOK);
300
301         bzero(&vm->vm_startcopy,
302               (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
303         vm_map_init(&vm->vm_map, min, max, NULL);       /* initializes token */
304
305         /*
306          * NOTE: the hold acquires the token for safety.
307          *
308          * On return vmspace is referenced (refs=1, hold=1).  That is,
309          * each refcnt also has a holdcnt.  There can be additional holds
310          * (holdcnt) above and beyond the refcnt.  Finalization is handled in
311          * two stages, one on refs 1->0, and the second on hold 1->0.
312          */
313         KKASSERT(vm->vm_holdcnt == 0);
314         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
315         vmspace_initrefs(vm);
316         vmspace_hold(vm);
317         pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
318         vm->vm_map.pmap = vmspace_pmap(vm);     /* XXX */
319         vm->vm_shm = NULL;
320         vm->vm_flags = 0;
321         cpu_vmspace_alloc(vm);
322         vmspace_drop(vm);
323
324         return (vm);
325 }
326
327 /*
328  * NOTE: Can return 0 if the vmspace is exiting.
329  */
330 int
331 vmspace_getrefs(struct vmspace *vm)
332 {
333         int32_t n;
334
335         n = vm->vm_refcnt;
336         cpu_ccfence();
337         if (n & VM_REF_DELETED)
338                 n = -1;
339         return n;
340 }
341
342 void
343 vmspace_hold(struct vmspace *vm)
344 {
345         atomic_add_int(&vm->vm_holdcnt, 1);
346         lwkt_gettoken(&vm->vm_map.token);
347 }
348
349 /*
350  * Drop with final termination interlock.
351  */
352 void
353 vmspace_drop(struct vmspace *vm)
354 {
355         lwkt_reltoken(&vm->vm_map.token);
356         vmspace_drop_notoken(vm);
357 }
358
359 static void
360 vmspace_drop_notoken(struct vmspace *vm)
361 {
362         if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
363                 if (vm->vm_refcnt & VM_REF_DELETED)
364                         vmspace_terminate(vm, 1);
365         }
366 }
367
368 /*
369  * A vmspace object must not be in a terminated state to be able to obtain
370  * additional refs on it.
371  *
372  * These are official references to the vmspace, the count is used to check
373  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
374  *
375  * XXX we need to combine hold & ref together into one 64-bit field to allow
376  * holds to prevent stage-1 termination.
377  */
378 void
379 vmspace_ref(struct vmspace *vm)
380 {
381         uint32_t n;
382
383         atomic_add_int(&vm->vm_holdcnt, 1);
384         n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
385         KKASSERT((n & VM_REF_DELETED) == 0);
386 }
387
388 /*
389  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
390  * termination of the vmspace.  Then, on the final drop of the hold we
391  * will do stage-2 final termination.
392  */
393 void
394 vmspace_rel(struct vmspace *vm)
395 {
396         uint32_t n;
397
398         /*
399          * Drop refs.  Each ref also has a hold which is also dropped.
400          *
401          * When refs hits 0, compete to set the VM_REF_DELETED flag (holds
402          * prevent finalization) to start termination processing.
403          * Finalization occurs when the last hold count drops to 0.
404          */
405         n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
406         while (n == 0) {
407                 if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
408                         vmspace_terminate(vm, 0);
409                         break;
410                 }
411                 n = vm->vm_refcnt;
412                 cpu_ccfence();
413         }
414         vmspace_drop_notoken(vm);
415 }
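
/*
 * Sketch of the intended usage pattern (illustration only, based on the
 * comments above): a process sharing the vmspace takes an official ref,
 * while a foreign accessor that merely needs the vmspace to stay intact
 * takes a hold.
 *
 *      vmspace_ref(vm);        official reference (sharing)
 *      ...
 *      vmspace_rel(vm);        last rel may trigger stage-1 termination
 *
 *      vmspace_hold(vm);       foreign accessor, also acquires the map token
 *      ... inspect vm->vm_map ...
 *      vmspace_drop(vm);       last drop may trigger stage-2 (final) termination
 */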
416
417 /*
418  * This is called during exit indicating that the vmspace is no
419  * longer in use by an exiting process, but the process has not yet
420  * been reaped.
421  *
422  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
423  * to prevent stage-2 until the process is reaped.  Note the order of
424  * operation, we must hold first.
425  *
426  * No requirements.
427  */
428 void
429 vmspace_relexit(struct vmspace *vm)
430 {
431         atomic_add_int(&vm->vm_holdcnt, 1);
432         vmspace_rel(vm);
433 }
434
435 /*
436  * Called during reap to disconnect the remainder of the vmspace from
437  * the process.  When the last hold is dropped the vmspace termination is finalized.
438  *
439  * No requirements.
440  */
441 void
442 vmspace_exitfree(struct proc *p)
443 {
444         struct vmspace *vm;
445
446         vm = p->p_vmspace;
447         p->p_vmspace = NULL;
448         vmspace_drop_notoken(vm);
449 }
450
451 /*
452  * Called in two cases:
453  *
454  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
455  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
456  *     and holdcnt will still be non-zero.
457  *
458  * (2) When holdcnt becomes 0, called with final == 1.  There should no
459  *     longer be anyone with access to the vmspace.
460  *
461  * VMSPACE_EXIT1 flags the primary deactivation
462  * VMSPACE_EXIT2 flags the last reap
463  */
464 static void
465 vmspace_terminate(struct vmspace *vm, int final)
466 {
467         int count;
468
469         lwkt_gettoken(&vm->vm_map.token);
470         if (final == 0) {
471                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
472                 vm->vm_flags |= VMSPACE_EXIT1;
473
474                 /*
475                  * Get rid of most of the resources.  Leave the kernel pmap
476                  * intact.
477                  *
478                  * If the pmap does not contain wired pages we can bulk-delete
479                  * the pmap as a performance optimization before removing the
480                  * related mappings.
481                  *
482                  * If the pmap contains wired pages we cannot do this
483                  * pre-optimization because currently vm_fault_unwire()
484                  * expects the pmap pages to exist and will not decrement
485                  * p->wire_count if they do not.
486                  */
487                 shmexit(vm);
488                 if (vmspace_pmap(vm)->pm_stats.wired_count) {
489                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
490                                       VM_MAX_USER_ADDRESS);
491                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
492                                           VM_MAX_USER_ADDRESS);
493                 } else {
494                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
495                                           VM_MAX_USER_ADDRESS);
496                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
497                                       VM_MAX_USER_ADDRESS);
498                 }
499                 lwkt_reltoken(&vm->vm_map.token);
500         } else {
501                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
502                 KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
503
504                 /*
505                  * Get rid of remaining basic resources.
506                  */
507                 vm->vm_flags |= VMSPACE_EXIT2;
508                 shmexit(vm);
509
510                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
511                 vm_map_lock(&vm->vm_map);
512                 cpu_vmspace_free(vm);
513
514                 /*
515                  * Lock the map, to wait out all other references to it.
516                  * Delete all of the mappings and pages they hold, then call
517                  * the pmap module to reclaim anything left.
518                  */
519                 vm_map_delete(&vm->vm_map,
520                               vm_map_min(&vm->vm_map),
521                               vm_map_max(&vm->vm_map),
522                               &count);
523                 vm_map_unlock(&vm->vm_map);
524                 vm_map_entry_release(count);
525
526                 pmap_release(vmspace_pmap(vm));
527                 lwkt_reltoken(&vm->vm_map.token);
528                 objcache_put(vmspace_cache, vm);
529         }
530 }
531
532 /*
533  * Swap usage is determined by taking the proportional swap used by
534  * VM objects backing the VM map.  To make up for fractional losses,
535  * if the VM object has any swap use at all the associated map entries
536  * count for at least 1 swap page.
537  *
538  * No requirements.
539  */
540 vm_offset_t
541 vmspace_swap_count(struct vmspace *vm)
542 {
543         vm_map_t map = &vm->vm_map;
544         vm_map_entry_t cur;
545         vm_object_t object;
546         vm_offset_t count = 0;
547         vm_offset_t n;
548
549         vmspace_hold(vm);
550
551         RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
552                 switch(cur->maptype) {
553                 case VM_MAPTYPE_NORMAL:
554                 case VM_MAPTYPE_VPAGETABLE:
555                         if ((object = cur->ba.object) == NULL)
556                                 break;
557                         if (object->swblock_count) {
558                                 n = (cur->end - cur->start) / PAGE_SIZE;
559                                 count += object->swblock_count *
560                                     SWAP_META_PAGES * n / object->size + 1;
561                         }
562                         break;
563                 default:
564                         break;
565                 }
566         }
567         vmspace_drop(vm);
568
569         return(count);
570 }
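
/*
 * Worked example of the proportional charge above (illustration only):
 * an object of size 1024 pages with 8 swblocks backs a map entry covering
 * 256 pages (n = 256).  The entry is charged
 *
 *      8 * SWAP_META_PAGES * 256 / 1024 + 1
 *
 * swap pages, i.e. roughly a quarter of the object's swap-backed pages
 * plus the minimum charge of one page for any object with swap use.
 */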
571
572 /*
573  * Calculate the approximate number of anonymous pages in use by
574  * this vmspace.  To make up for fractional losses, we count each
575  * VM object as having at least 1 anonymous page.
576  *
577  * No requirements.
578  */
579 vm_offset_t
580 vmspace_anonymous_count(struct vmspace *vm)
581 {
582         vm_map_t map = &vm->vm_map;
583         vm_map_entry_t cur;
584         vm_object_t object;
585         vm_offset_t count = 0;
586
587         vmspace_hold(vm);
588         RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
589                 switch(cur->maptype) {
590                 case VM_MAPTYPE_NORMAL:
591                 case VM_MAPTYPE_VPAGETABLE:
592                         if ((object = cur->ba.object) == NULL)
593                                 break;
594                         if (object->type != OBJT_DEFAULT &&
595                             object->type != OBJT_SWAP) {
596                                 break;
597                         }
598                         count += object->resident_page_count;
599                         break;
600                 default:
601                         break;
602                 }
603         }
604         vmspace_drop(vm);
605
606         return(count);
607 }
608
609 /*
610  * Initialize an existing vm_map structure such as that in the vmspace
611  * structure.  The pmap is initialized elsewhere.
612  *
613  * No requirements.
614  */
615 void
616 vm_map_init(struct vm_map *map, vm_offset_t min_addr, vm_offset_t max_addr,
617             pmap_t pmap)
618 {
619         RB_INIT(&map->rb_root);
620         spin_init(&map->ilock_spin, "ilock");
621         map->ilock_base = NULL;
622         map->nentries = 0;
623         map->size = 0;
624         map->system_map = 0;
625         vm_map_min(map) = min_addr;
626         vm_map_max(map) = max_addr;
627         map->pmap = pmap;
628         map->timestamp = 0;
629         map->flags = 0;
630         bzero(&map->freehint, sizeof(map->freehint));
631         lwkt_token_init(&map->token, "vm_map");
632         lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
633 }
634
635 /*
636  * Find the first possible free address for the specified request length.
637  * Returns 0 if we don't have one cached.
638  */
639 static
640 vm_offset_t
641 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
642 {
643         vm_map_freehint_t *scan;
644
645         scan = &map->freehint[0];
646         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
647                 if (scan->length == length && scan->align == align)
648                         return(scan->start);
649                 ++scan;
650         }
651         return 0;
652 }
653
654 /*
655  * Unconditionally set the freehint.  Called by vm_map_findspace() after
656  * it finds an address.  This will help us iterate optimally on the next
657  * similar findspace.
658  */
659 static
660 void
661 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
662                        vm_size_t length, vm_size_t align)
663 {
664         vm_map_freehint_t *scan;
665
666         scan = &map->freehint[0];
667         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
668                 if (scan->length == length && scan->align == align) {
669                         scan->start = start;
670                         return;
671                 }
672                 ++scan;
673         }
674         scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
675         scan->start = start;
676         scan->align = align;
677         scan->length = length;
678         ++map->freehint_newindex;
679 }
680
681 /*
682  * Update any existing freehints (for any alignment), for the hole we just
683  * added.
684  */
685 static
686 void
687 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
688 {
689         vm_map_freehint_t *scan;
690
691         scan = &map->freehint[0];
692         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
693                 if (scan->length <= length && scan->start > start)
694                         scan->start = start;
695                 ++scan;
696         }
697 }
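
/*
 * Summary of how the freehint helpers above cooperate (illustration only):
 *
 *      start = vm_map_freehint_find(map, length, align);
 *              begin the findspace scan at the cached hint (0 if none)
 *      ... scan the map for a suitable hole starting at 'start' ...
 *      vm_map_freehint_update(map, found, length, align);
 *              remember the result for the next similar request
 *
 * and when address space is released:
 *
 *      vm_map_freehint_hole(map, hole_start, hole_length);
 *              pull matching hints back so the new hole is not skipped
 */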
698
699 /*
700  * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
701  * object in the entry for COW faults.
702  *
703  * The entire chain including entry->ba (prior to inserting the fronting
704  * object) essentially becomes set in stone... elements of it can be paged
705  * in or out, but cannot be further modified.
706  *
707  * NOTE: If we do not optimize the backing chain then a unique copy is not
708  *       needed.  Note, however, that because portions of the chain are
709  *       shared across pmaps we cannot make any changes to the vm_map_backing
710  *       elements themselves.
711  *
712  * If the map segment is governed by a virtual page table then it is
713  * possible to address offsets beyond the mapped area.  Just allocate
714  * a maximally sized object for this case.
715  *
716  * If addref is non-zero an additional reference is added to the returned
717  * entry.  This mechanic exists because the additional reference might have
718  * to be added atomically and not after return to prevent a premature
719  * collapse.  XXX currently there is no collapse code.
720  *
721  * The vm_map must be exclusively locked.
722  * No other requirements.
723  */
724 static
725 void
726 vm_map_entry_shadow(vm_map_entry_t entry, int addref)
727 {
728         vm_map_backing_t ba;
729         vm_size_t length;
730         vm_object_t source;
731         vm_object_t result;
732         int drop_source;
733
734         if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
735                 length = 0x7FFFFFFF;
736         else
737                 length = atop(entry->end - entry->start);
738         ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
739
740         /*
741          * The ref on source is inherited when we move it into the ba.
742          */
743         source = entry->ba.object;
744
745         /*
746          * Don't create the new object if the old object isn't shared.
747          *
748          * If addref is non-zero additional ref(s) are being added (probably
749          * for map entry fork purposes), so clear OBJ_ONEMAPPING.
750          *
751          * WARNING! Checking ref_count == 1 only works because we are testing
752          *          the object embedded in the entry (entry->ba.object).
753          *          This test DOES NOT WORK if checking an object hanging off
754          *          the backing chain (entry->ba.backing_ba list) because the
755          *          vm_map_backing might be shared, or part of a chain that
756          *          is shared.  Checking ba->refs is worthless.
757          */
758         drop_source = 0;
759         if (source) {
760                 if (source->type != OBJT_VNODE) {
761                         vm_object_hold(source);
762                         if (source->ref_count == 1 &&
763                             source->handle == NULL &&
764                             (source->type == OBJT_DEFAULT ||
765                              source->type == OBJT_SWAP)) {
766                                 if (addref) {
767                                         vm_object_reference_locked(source);
768                                         vm_object_clear_flag(source,
769                                                              OBJ_ONEMAPPING);
770                                 }
771                                 vm_object_drop(source);
772                                 kfree(ba, M_MAP_BACKING);
773                                 goto done;
774                         }
775                         /*vm_object_reference_locked(source);*/
776                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
777                         drop_source = 1;        /* drop source at end */
778                 } else {
779                         /*vm_object_reference_quick(source);*/
780                         vm_object_clear_flag(source, OBJ_ONEMAPPING);
781                 }
782         }
783
784         /*
785          * Allocate a new object with the given length.  The new object
786          * is returned referenced but we may have to add another one.
787          * If we are adding a second reference we must clear OBJ_ONEMAPPING.
788          * (typically because the caller is about to clone a vm_map_entry).
789          *
790          * The source object currently has an extra reference to prevent
791          * collapses into it while we mess with its shadow list, which
792          * we will remove later in this routine.
793          *
794          * The target object may require a second reference if asked for one
795          * by the caller.
796          */
797         result = vm_object_allocate(OBJT_DEFAULT, length);
798         if (result == NULL)
799                 panic("vm_object_shadow: no object for shadowing");
800         vm_object_hold(result);
801         if (addref) {
802                 vm_object_reference_locked(result);
803                 vm_object_clear_flag(result, OBJ_ONEMAPPING);
804         }
805
806         /*
807          * The new object shadows the source object.
808          *
809          * Try to optimize the result object's page color when shadowing
810          * in order to maintain page coloring consistency in the combined
811          * shadowed object.
812          *
813          * The source object is moved to ba, retaining its existing ref-count.
814          * No additional ref is needed.
815          *
816          * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
817          */
818         *ba = entry->ba;                /* previous ba */
819         ba->refs = 1;                   /* initialize ref count */
820         entry->ba.object = result;      /* new ba (at head of entry) */
821         entry->ba.backing_ba = ba;
822         entry->ba.backing_count = ba->backing_count + 1;
823         entry->ba.offset = 0;
824         entry->ba.refs = 0;
825
826         if (source) {
827 #if 0
828                 /* shadowing no longer messes with generation count */
829                 if (drop_source) {
830                         atomic_add_int(&source->generation, 1);
831                         vm_object_set_flag(result, OBJ_ONSHADOW);
832                 }
833 #endif
834                 /* cpu localization twist */
835                 result->pg_color = vm_quickcolor();
836         }
837
838         /*
839          * Adjust the return storage.  Drop the ref on source before
840          * returning.
841          */
842         vm_object_drop(result);
843         if (source) {
844                 if (drop_source) {
845                         /*vm_object_deallocate_locked(source);*/
846                         vm_object_drop(source);
847                 } else {
848                         /*vm_object_deallocate(source);*/
849                 }
850         }
851
852 done:
853         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
854 }
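
/*
 * Illustration of the chain manipulation performed above: before shadowing
 * the entry points directly at the source object, afterwards a fresh
 * OBJT_DEFAULT object fronts it and the old vm_map_backing is pushed onto
 * the backing_ba list:
 *
 *      before: entry->ba.object     -> source
 *
 *      after:  entry->ba.object     -> result (new, offset 0)
 *              entry->ba.backing_ba -> ba (copy of the old entry->ba)
 *                                      ba->object -> source
 *
 * COW faults are then satisfied from 'result' while untouched pages
 * continue to be found in 'source' further down the chain.
 */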
855
856 /*
857  * Allocate an object for a vm_map_entry.
858  *
859  * Object allocation for anonymous mappings is deferred as long as possible.
860  * This function is called when we can defer no longer, generally when a map
861  * entry might be split or forked or takes a page fault.
862  *
863  * If the map segment is governed by a virtual page table then it is
864  * possible to address offsets beyond the mapped area.  Just allocate
865  * a maximally sized object for this case.
866  *
867  * The vm_map must be exclusively locked.
868  * No other requirements.
869  */
870 void 
871 vm_map_entry_allocate_object(vm_map_entry_t entry)
872 {
873         vm_object_t obj;
874
875         /*
876          * ba.offset is added cumulatively in the backing_ba scan, so we
877          * can only reset it to zero if ba.backing_ba is NULL.  We reset
878          * it to 0 only for debugging convenience.
879          *
880          * ba.offset cannot otherwise be modified because it affects
881          * the offsets for the entire backing_ba chain.
882          */
883         if (entry->ba.backing_ba == NULL)
884                 entry->ba.offset = 0;
885
886         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
887                 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
888         } else {
889                 obj = vm_object_allocate(OBJT_DEFAULT,
890                                          atop(entry->end - entry->start) +
891                                          entry->ba.offset);
892         }
893         entry->ba.object = obj;
894 }
895
896 /*
897  * Set an initial negative count so the first attempt to reserve
898  * space preloads a bunch of vm_map_entry's for this cpu.  Also
899  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
900  * map a new page for vm_map_entry structures.  SMP systems are
901  * particularly sensitive.
902  *
903  * This routine is called in early boot so we cannot just call
904  * vm_map_entry_reserve().
905  *
906  * Called from the low level boot code only (for each cpu)
907  *
908  * WARNING! Take care not to have too-big a static/BSS structure here
909  *          as MAXCPU can be 256+, otherwise the loader's 64MB heap
910  *          can get blown out by the kernel plus the initrd image.
911  */
912 void
913 vm_map_entry_reserve_cpu_init(globaldata_t gd)
914 {
915         vm_map_entry_t entry;
916         int count;
917         int i;
918
919         atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
920         if (gd->gd_cpuid == 0) {
921                 entry = &cpu_map_entry_init_bsp[0];
922                 count = MAPENTRYBSP_CACHE;
923         } else {
924                 entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
925                 count = MAPENTRYAP_CACHE;
926         }
927         for (i = 0; i < count; ++i, ++entry) {
928                 MAPENT_FREELIST(entry) = gd->gd_vme_base;
929                 gd->gd_vme_base = entry;
930         }
931 }
932
933 /*
934  * Reserves vm_map_entry structures so code later-on can manipulate
935  * map_entry structures within a locked map without blocking trying
936  * to allocate a new vm_map_entry.
937  *
938  * No requirements.
939  *
940  * WARNING!  We must not decrement gd_vme_avail until after we have
941  *           ensured that sufficient entries exist, otherwise we can
942  *           get into an endless call recursion in the zalloc code
943  *           itself.
944  */
945 int
946 vm_map_entry_reserve(int count)
947 {
948         struct globaldata *gd = mycpu;
949         vm_map_entry_t entry;
950
951         /*
952          * Make sure we have enough structures in gd_vme_base to handle
953          * the reservation request.
954          *
955          * Use a critical section to protect against VM faults.  It might
956          * not be needed, but we have to be careful here.
957          */
958         if (gd->gd_vme_avail < count) {
959                 crit_enter();
960                 while (gd->gd_vme_avail < count) {
961                         entry = zalloc(mapentzone);
962                         MAPENT_FREELIST(entry) = gd->gd_vme_base;
963                         gd->gd_vme_base = entry;
964                         atomic_add_int(&gd->gd_vme_avail, 1);
965                 }
966                 crit_exit();
967         }
968         atomic_add_int(&gd->gd_vme_avail, -count);
969
970         return(count);
971 }
972
973 /*
974  * Releases previously reserved vm_map_entry structures that were not
975  * used.  If we have too much junk in our per-cpu cache clean some of
976  * it out.
977  *
978  * No requirements.
979  */
980 void
981 vm_map_entry_release(int count)
982 {
983         struct globaldata *gd = mycpu;
984         vm_map_entry_t entry;
985         vm_map_entry_t efree;
986
987         count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
988         if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
989                 efree = NULL;
990                 crit_enter();
991                 while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
992                         entry = gd->gd_vme_base;
993                         KKASSERT(entry != NULL);
994                         gd->gd_vme_base = MAPENT_FREELIST(entry);
995                         atomic_add_int(&gd->gd_vme_avail, -1);
996                         MAPENT_FREELIST(entry) = efree;
997                         efree = entry;
998                 }
999                 crit_exit();
1000                 while ((entry = efree) != NULL) {
1001                         efree = MAPENT_FREELIST(efree);
1002                         zfree(mapentzone, entry);
1003                 }
1004         }
1005 }
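
/*
 * Typical usage of the reserve/release pair (sketch of the pattern used
 * elsewhere in this file, e.g. by vmspace_terminate()):
 *
 *      count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *      vm_map_lock(map);
 *      ... clip / insert / delete entries, passing &count down ...
 *      vm_map_unlock(map);
 *      vm_map_entry_release(count);
 *
 * The reservation guarantees that entry manipulation inside the locked
 * section never has to block in zalloc() for a new vm_map_entry.
 */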
1006
1007 /*
1008  * Reserve map entry structures for use in kernel_map itself.  These
1009  * entries have *ALREADY* been reserved on a per-cpu basis when the map
1010  * was inited.  This function is used by zalloc() to avoid a recursion
1011  * when zalloc() itself needs to allocate additional kernel memory.
1012  *
1013  * This function works like the normal reserve but does not load the
1014  * vm_map_entry cache (because that would result in an infinite
1015  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
1016  *
1017  * Any caller of this function must be sure to renormalize after
1018  * potentially eating entries to ensure that the reserve supply
1019  * remains intact.
1020  *
1021  * No requirements.
1022  */
1023 int
1024 vm_map_entry_kreserve(int count)
1025 {
1026         struct globaldata *gd = mycpu;
1027
1028         atomic_add_int(&gd->gd_vme_avail, -count);
1029         KASSERT(gd->gd_vme_base != NULL,
1030                 ("no reserved entries left, gd_vme_avail = %d",
1031                 gd->gd_vme_avail));
1032         return(count);
1033 }
1034
1035 /*
1036  * Release previously reserved map entries for kernel_map.  We do not
1037  * attempt to clean up like the normal release function as this would
1038  * cause an unnecessary (but probably not fatal) deep procedure call.
1039  *
1040  * No requirements.
1041  */
1042 void
1043 vm_map_entry_krelease(int count)
1044 {
1045         struct globaldata *gd = mycpu;
1046
1047         atomic_add_int(&gd->gd_vme_avail, count);
1048 }
1049
1050 /*
1051  * Allocates a VM map entry for insertion.  No entry fields are filled in.
1052  *
1053  * The entries should have previously been reserved.  The reservation count
1054  * is tracked in (*countp).
1055  *
1056  * No requirements.
1057  */
1058 static vm_map_entry_t
1059 vm_map_entry_create(vm_map_t map, int *countp)
1060 {
1061         struct globaldata *gd = mycpu;
1062         vm_map_entry_t entry;
1063
1064         KKASSERT(*countp > 0);
1065         --*countp;
1066         crit_enter();
1067         entry = gd->gd_vme_base;
1068         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
1069         gd->gd_vme_base = MAPENT_FREELIST(entry);
1070         crit_exit();
1071
1072         return(entry);
1073 }
1074
1075 /*
1076  * Dispose of the dynamically allocated backing_ba chain associated
1077  * with a vm_map_entry.
1078  *
1079  * We decrement the (possibly shared) element and kfree() on the
1080  * 1->0 transition.  We only iterate to the next backing_ba when
1081  * the previous one went through a 1->0 transition.
1082  */
1083 static void
1084 vm_map_entry_dispose_ba(vm_map_backing_t ba)
1085 {
1086         vm_map_backing_t next;
1087         long refs;
1088
1089         while (ba) {
1090                 refs = atomic_fetchadd_long(&ba->refs, -1);
1091                 if (refs > 1)
1092                         break;
1093                 KKASSERT(refs == 1);    /* transitioned 1->0 */
1094                 if (ba->object)
1095                         vm_object_deallocate(ba->object);
1096                 next = ba->backing_ba;
1097                 kfree(ba, M_MAP_BACKING);
1098                 ba = next;
1099         }
1100 }
1101
1102 /*
1103  * Dispose of a vm_map_entry that is no longer being referenced.
1104  *
1105  * No requirements.
1106  */
1107 static void
1108 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
1109 {
1110         struct globaldata *gd = mycpu;
1111
1112         /*
1113          * Dispose of the base object and the backing link.
1114          */
1115         switch(entry->maptype) {
1116         case VM_MAPTYPE_NORMAL:
1117         case VM_MAPTYPE_VPAGETABLE:
1118         case VM_MAPTYPE_SUBMAP:
1119                 if (entry->ba.object)
1120                         vm_object_deallocate(entry->ba.object);
1121                 break;
1122         case VM_MAPTYPE_UKSMAP:
1123                 /* XXX TODO */
1124                 break;
1125         default:
1126                 break;
1127         }
1128         vm_map_entry_dispose_ba(entry->ba.backing_ba);
1129
1130         /*
1131          * Cleanup for safety.
1132          */
1133         entry->ba.backing_ba = NULL;
1134         entry->ba.object = NULL;
1135         entry->ba.offset = 0;
1136
1137         ++*countp;
1138         crit_enter();
1139         MAPENT_FREELIST(entry) = gd->gd_vme_base;
1140         gd->gd_vme_base = entry;
1141         crit_exit();
1142 }
1143
1144
1145 /*
1146  * Insert/remove entries from maps.
1147  *
1148  * The related map must be exclusively locked.
1149  * The caller must hold map->token
1150  * No other requirements.
1151  */
1152 static __inline void
1153 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1154 {
1155         ASSERT_VM_MAP_LOCKED(map);
1156
1157         map->nentries++;
1158         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
1159                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
1160 }
1161
1162 static __inline void
1163 vm_map_entry_unlink(vm_map_t map,
1164                     vm_map_entry_t entry)
1165 {
1166         ASSERT_VM_MAP_LOCKED(map);
1167
1168         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1169                 panic("vm_map_entry_unlink: attempt to mess with "
1170                       "locked entry! %p", entry);
1171         }
1172         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
1173         map->nentries--;
1174 }
1175
1176 /*
1177  * Finds the map entry containing (or immediately preceding) the specified
1178  * address in the given map.  The entry is returned in (*entry).
1179  *
1180  * The boolean result indicates whether the address is actually contained
1181  * in the map.
1182  *
1183  * The related map must be locked.
1184  * No other requirements.
1185  */
1186 boolean_t
1187 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1188 {
1189         vm_map_entry_t tmp;
1190         vm_map_entry_t last;
1191
1192         ASSERT_VM_MAP_LOCKED(map);
1193
1194         /*
1195          * Locate the record from the top of the tree.  'last' tracks the
1196          * closest prior record and is returned if no match is found, which
1197          * in binary tree terms means tracking the most recent right-branch
1198          * taken.  If there is no prior record, *entry is set to NULL.
1199          */
1200         last = NULL;
1201         tmp = RB_ROOT(&map->rb_root);
1202
1203         while (tmp) {
1204                 if (address >= tmp->start) {
1205                         if (address < tmp->end) {
1206                                 *entry = tmp;
1207                                 return(TRUE);
1208                         }
1209                         last = tmp;
1210                         tmp = RB_RIGHT(tmp, rb_entry);
1211                 } else {
1212                         tmp = RB_LEFT(tmp, rb_entry);
1213                 }
1214         }
1215         *entry = last;
1216         return (FALSE);
1217 }
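
/*
 * Usage sketch (illustration only, mirroring how vm_map_insert() below
 * calls this function):
 *
 *      vm_map_entry_t entry;
 *
 *      if (vm_map_lookup_entry(map, addr, &entry)) {
 *              ... addr falls inside 'entry' ...
 *      } else if (entry) {
 *              ... 'entry' is the closest entry preceding addr ...
 *      } else {
 *              ... addr precedes every entry in the map ...
 *      }
 */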
1218
1219 /*
1220  * Inserts the given whole VM object into the target map at the specified
1221  * address range.  The object's size should match that of the address range.
1222  *
1223  * The map must be exclusively locked.
1224  * The object must be held.
1225  * The caller must have reserved sufficient vm_map_entry structures.
1226  *
1227  * If object is non-NULL, ref count must be bumped by caller prior to
1228  * making call to account for the new entry.
1229  */
1230 int
1231 vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
1232               vm_ooffset_t offset, vm_offset_t start, vm_offset_t end,
1233               vm_maptype_t maptype, vm_subsys_t id,
1234               vm_prot_t prot, vm_prot_t max, int cow)
1235 {
1236         vm_map_entry_t new_entry;
1237         vm_map_entry_t prev_entry;
1238         vm_map_entry_t next;
1239         vm_map_entry_t temp_entry;
1240         vm_eflags_t protoeflags;
1241         vm_object_t object;
1242         int must_drop = 0;
1243
1244         if (maptype == VM_MAPTYPE_UKSMAP)
1245                 object = NULL;
1246         else
1247                 object = map_object;
1248
1249         ASSERT_VM_MAP_LOCKED(map);
1250         if (object)
1251                 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1252
1253         /*
1254          * Check that the start and end points are not bogus.
1255          */
1256         if ((start < vm_map_min(map)) || (end > vm_map_max(map)) ||
1257             (start >= end)) {
1258                 return (KERN_INVALID_ADDRESS);
1259         }
1260
1261         /*
1262          * Find the entry prior to the proposed starting address; if it's part
1263          * of an existing entry, this range is bogus.
1264          */
1265         if (vm_map_lookup_entry(map, start, &temp_entry))
1266                 return (KERN_NO_SPACE);
1267         prev_entry = temp_entry;
1268
1269         /*
1270          * Assert that the next entry doesn't overlap the end point.
1271          */
1272         if (prev_entry)
1273                 next = vm_map_rb_tree_RB_NEXT(prev_entry);
1274         else
1275                 next = RB_MIN(vm_map_rb_tree, &map->rb_root);
1276         if (next && next->start < end)
1277                 return (KERN_NO_SPACE);
1278
1279         protoeflags = 0;
1280
1281         if (cow & MAP_COPY_ON_WRITE)
1282                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1283
1284         if (cow & MAP_NOFAULT) {
1285                 protoeflags |= MAP_ENTRY_NOFAULT;
1286
1287                 KASSERT(object == NULL,
1288                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1289         }
1290         if (cow & MAP_DISABLE_SYNCER)
1291                 protoeflags |= MAP_ENTRY_NOSYNC;
1292         if (cow & MAP_DISABLE_COREDUMP)
1293                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1294         if (cow & MAP_IS_STACK)
1295                 protoeflags |= MAP_ENTRY_STACK;
1296         if (cow & MAP_IS_KSTACK)
1297                 protoeflags |= MAP_ENTRY_KSTACK;
1298
1299         lwkt_gettoken(&map->token);
1300
1301         if (object) {
1302                 /*
1303                  * When object is non-NULL, it could be shared with another
1304                  * process.  We have to set or clear OBJ_ONEMAPPING 
1305                  * appropriately.
1306                  *
1307                  * NOTE: This flag is only applicable to DEFAULT and SWAP
1308                  *       objects and will already be clear in other types
1309                  *       of objects, so a shared object lock is ok for
1310                  *       VNODE objects.
1311                  */
1312                 if (object->ref_count > 1)
1313                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
1314         }
1315         else if (prev_entry &&
1316                  (prev_entry->eflags == protoeflags) &&
1317                  (prev_entry->end == start) &&
1318                  (prev_entry->wired_count == 0) &&
1319                  (prev_entry->id == id) &&
1320                  prev_entry->maptype == maptype &&
1321                  maptype == VM_MAPTYPE_NORMAL &&
1322                  prev_entry->ba.backing_ba == NULL &&   /* not backed */
1323                  ((prev_entry->ba.object == NULL) ||
1324                   vm_object_coalesce(prev_entry->ba.object,
1325                                      OFF_TO_IDX(prev_entry->ba.offset),
1326                                      (vm_size_t)(prev_entry->end - prev_entry->start),
1327                                      (vm_size_t)(end - prev_entry->end)))) {
1328                 /*
1329                  * We were able to extend the object.  Determine if we
1330                  * can extend the previous map entry to include the 
1331                  * new range as well.
1332                  */
1333                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1334                     (prev_entry->protection == prot) &&
1335                     (prev_entry->max_protection == max)) {
1336                         map->size += (end - prev_entry->end);
1337                         prev_entry->end = end;
1338                         vm_map_simplify_entry(map, prev_entry, countp);
1339                         lwkt_reltoken(&map->token);
1340                         return (KERN_SUCCESS);
1341                 }
1342
1343                 /*
1344                  * If we can extend the object but cannot extend the
1345                  * map entry, we have to create a new map entry.  We
1346                  * must bump the ref count on the extended object to
1347                  * account for it.  object may be NULL.
1348                  */
1349                 object = prev_entry->ba.object;
1350                 offset = prev_entry->ba.offset +
1351                         (prev_entry->end - prev_entry->start);
1352                 if (object) {
1353                         vm_object_hold(object);
1354                         vm_object_lock_swap(); /* map->token order */
1355                         vm_object_reference_locked(object);
1356                         map_object = object;
1357                         must_drop = 1;
1358                 }
1359         }
1360
1361         /*
1362          * NOTE: if conditionals fail, object can be NULL here.  This occurs
1363          * in things like the buffer map where we manage kva but do not manage
1364          * backing objects.
1365          */
1366
1367         /*
1368          * Create a new entry
1369          */
1370         new_entry = vm_map_entry_create(map, countp);
1371         new_entry->start = start;
1372         new_entry->end = end;
1373         new_entry->id = id;
1374
1375         new_entry->maptype = maptype;
1376         new_entry->eflags = protoeflags;
1377         new_entry->aux.master_pde = 0;          /* in case size is different */
1378         new_entry->aux.map_aux = map_aux;
1379         new_entry->ba.map_object = map_object;
1380         new_entry->ba.backing_ba = NULL;
1381         new_entry->ba.backing_count = 0;
1382         new_entry->ba.offset = offset;
1383         new_entry->ba.refs = 0;
1384         new_entry->ba.flags = 0;
1385
1386         new_entry->inheritance = VM_INHERIT_DEFAULT;
1387         new_entry->protection = prot;
1388         new_entry->max_protection = max;
1389         new_entry->wired_count = 0;
1390
1391         /*
1392          * Insert the new entry into the list
1393          */
1394
1395         vm_map_entry_link(map, new_entry);
1396         map->size += new_entry->end - new_entry->start;
1397
1398         /*
1399          * Don't worry about updating freehint[] when inserting, allow
1400          * addresses to be lower than the actual first free spot.
1401          */
1402 #if 0
1403         /*
1404          * Temporarily removed to avoid MAP_STACK panic, due to
1405          * MAP_STACK being a huge hack.  Will be added back in
1406          * when MAP_STACK (and the user stack mapping) is fixed.
1407          */
1408         /*
1409          * It may be possible to simplify the entry
1410          */
1411         vm_map_simplify_entry(map, new_entry, countp);
1412 #endif
1413
1414         /*
1415          * Try to pre-populate the page table.  Mappings governed by virtual
1416          * page tables cannot be prepopulated without a lot of work, so
1417          * don't try.
1418          */
1419         if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1420             maptype != VM_MAPTYPE_VPAGETABLE &&
1421             maptype != VM_MAPTYPE_UKSMAP) {
1422                 int dorelock = 0;
1423                 if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1424                         dorelock = 1;
1425                         vm_object_lock_swap();
1426                         vm_object_drop(object);
1427                 }
1428                 pmap_object_init_pt(map->pmap, start, prot,
1429                                     object, OFF_TO_IDX(offset), end - start,
1430                                     cow & MAP_PREFAULT_PARTIAL);
1431                 if (dorelock) {
1432                         vm_object_hold(object);
1433                         vm_object_lock_swap();
1434                 }
1435         }
1436         lwkt_reltoken(&map->token);
1437         if (must_drop)
1438                 vm_object_drop(object);
1439
1440         return (KERN_SUCCESS);
1441 }
1442
1443 /*
1444  * Find sufficient space for `length' bytes in the given map, starting at
1445  * `start'.  Returns 0 on success, 1 on no space.
1446  *
1447  * This function will return an arbitrarily aligned pointer.  If no
1448  * particular alignment is required you should pass align as 1.  Note that
1449  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1450  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1451  * argument.
1452  *
1453  * 'align' should be a power of 2 but is not required to be.
1454  *
1455  * The map must be exclusively locked.
1456  * No other requirements.
1457  */
1458 int
1459 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1460                  vm_size_t align, int flags, vm_offset_t *addr)
1461 {
1462         vm_map_entry_t entry;
1463         vm_map_entry_t tmp;
1464         vm_offset_t hole_start;
1465         vm_offset_t end;
1466         vm_offset_t align_mask;
1467
1468         if (start < vm_map_min(map))
1469                 start = vm_map_min(map);
1470         if (start > vm_map_max(map))
1471                 return (1);
1472
1473         /*
1474          * If the alignment is not a power of 2 we will have to use
1475          * a mod/division, so set align_mask to a special value.
1476          */
1477         if ((align | (align - 1)) + 1 != (align << 1))
1478                 align_mask = (vm_offset_t)-1;
1479         else
1480                 align_mask = align - 1;
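
        /*
         * Worked example of the test above (illustration only): for
         * align = 0x1000, (0x1000 | 0x0fff) + 1 == 0x2000 == align << 1,
         * so align is a power of 2 and the cheap mask 0x0fff is used.
         * For align = 0x3000, (0x3000 | 0x2fff) + 1 == 0x4000 != 0x6000,
         * so align_mask becomes -1 and roundup() is used below instead.
         */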
1481
1482         /*
1483          * Use freehint to adjust the start point, hopefully reducing
1484          * the iteration to O(1).
1485          */
1486         hole_start = vm_map_freehint_find(map, length, align);
1487         if (start < hole_start)
1488                 start = hole_start;
1489         if (vm_map_lookup_entry(map, start, &tmp))
1490                 start = tmp->end;
1491         entry = tmp;    /* may be NULL */
1492
1493         /*
1494          * Look through the rest of the map, trying to fit a new region in the
1495          * gap between existing regions, or after the very last region.
1496          */
1497         for (;;) {
1498                 /*
1499                  * Adjust the proposed start by the requested alignment,
1500                  * making sure that we didn't wrap the address.
1501                  */
1502                 if (align_mask == (vm_offset_t)-1)
1503                         end = roundup(start, align);
1504                 else
1505                         end = (start + align_mask) & ~align_mask;
1506                 if (end < start)
1507                         return (1);
1508                 start = end;
1509
1510                 /*
1511                  * Find the end of the proposed new region.  Be sure we didn't
1512                  * go beyond the end of the map, or wrap around the address.
1513                  * Then check to see if this is the last entry or if the 
1514                  * proposed end fits in the gap between this and the next
1515                  * entry.
1516                  */
1517                 end = start + length;
1518                 if (end > vm_map_max(map) || end < start)
1519                         return (1);
1520
1521                 /*
1522                  * Locate the next entry, we can stop if this is the
1523                  * last entry (we know we are in-bounds so that would
1524                  * be a success).
1525                  */
1526                 if (entry)
1527                         entry = vm_map_rb_tree_RB_NEXT(entry);
1528                 else
1529                         entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
1530                 if (entry == NULL)
1531                         break;
1532
1533                 /*
1534                  * Determine if the proposed area would overlap the
1535                  * next entry.
1536                  *
1537                  * When matching against a STACK entry, only allow the
1538                  * memory map to intrude on the ungrown portion of the
1539                  * STACK entry when MAP_TRYFIXED is set.
1540                  */
1541                 if (entry->start >= end) {
1542                         if ((entry->eflags & MAP_ENTRY_STACK) == 0)
1543                                 break;
1544                         if (flags & MAP_TRYFIXED)
1545                                 break;
1546                         if (entry->start - entry->aux.avail_ssize >= end)
1547                                 break;
1548                 }
1549                 start = entry->end;
1550         }
1551
1552         /*
1553          * Update the freehint
1554          */
1555         vm_map_freehint_update(map, start, length, align);
1556
1557         /*
1558          * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1559          * if it fails.  The kernel_map is locked and nothing can steal
1560          * our address space if pmap_growkernel() blocks.
1561          *
1562          * NOTE: This may be unconditionally called for kldload areas on
1563          *       x86_64 because these do not bump kernel_vm_end (which would
1564          *       fill 128G worth of page tables!).  Therefore we must not
1565          *       retry.
1566          */
1567         if (map == &kernel_map) {
1568                 vm_offset_t kstop;
1569
1570                 kstop = round_page(start + length);
1571                 if (kstop > kernel_vm_end)
1572                         pmap_growkernel(start, kstop);
1573         }
1574         *addr = start;
1575         return (0);
1576 }
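
/*
 * Example (illustrative sketch only): a caller holding the map exclusively
 * locked might look for a page-aligned hole like this.  The helper name is
 * hypothetical and not part of this file.
 */
#if 0
static int
example_find_hole(vm_map_t map, vm_size_t length, vm_offset_t *addrp)
{
        /* first-fit search from the bottom of the map, PAGE_SIZE aligned */
        if (vm_map_findspace(map, vm_map_min(map), length,
                             PAGE_SIZE, 0, addrp)) {
                return (KERN_NO_SPACE);         /* 1 was returned: no hole */
        }
        return (KERN_SUCCESS);                  /* *addrp holds the start */
}
#endif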
1577
1578 /*
1579  * vm_map_find finds an unallocated region in the target address map with
1580  * the given length and allocates it.  The search is defined to be first-fit
1581  * from the specified address; the region found is returned in the same
1582  * parameter.
1583  *
1584  * If object is non-NULL, ref count must be bumped by caller
1585  * prior to making the call to account for the new entry.
1586  *
1587  * No requirements.  This function will lock the map temporarily.
1588  */
1589 int
1590 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1591             vm_ooffset_t offset, vm_offset_t *addr,
1592             vm_size_t length, vm_size_t align, boolean_t fitit,
1593             vm_maptype_t maptype, vm_subsys_t id,
1594             vm_prot_t prot, vm_prot_t max, int cow)
1595 {
1596         vm_offset_t start;
1597         vm_object_t object;
1598         int result;
1599         int count;
1600
1601         if (maptype == VM_MAPTYPE_UKSMAP)
1602                 object = NULL;
1603         else
1604                 object = map_object;
1605
1606         start = *addr;
1607
1608         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1609         vm_map_lock(map);
1610         if (object)
1611                 vm_object_hold_shared(object);
1612         if (fitit) {
1613                 if (vm_map_findspace(map, start, length, align, 0, addr)) {
1614                         if (object)
1615                                 vm_object_drop(object);
1616                         vm_map_unlock(map);
1617                         vm_map_entry_release(count);
1618                         return (KERN_NO_SPACE);
1619                 }
1620                 start = *addr;
1621         }
1622         result = vm_map_insert(map, &count, map_object, map_aux,
1623                                offset, start, start + length,
1624                                maptype, id, prot, max, cow);
1625         if (object)
1626                 vm_object_drop(object);
1627         vm_map_unlock(map);
1628         vm_map_entry_release(count);
1629
1630         return (result);
1631 }
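
/*
 * Example (illustrative sketch only): a typical anonymous mapping created
 * through vm_map_find().  The helper name is hypothetical, and
 * VM_SUBSYS_UNKNOWN merely stands in for whatever subsystem id the real
 * caller would use.
 */
#if 0
static int
example_anon_map(vm_map_t map, vm_offset_t *addrp, vm_size_t size)
{
        return (vm_map_find(map, NULL, NULL,            /* no object/aux yet */
                            0,                          /* offset */
                            addrp, size,
                            PAGE_SIZE,                  /* alignment */
                            TRUE,                       /* fitit: pick address */
                            VM_MAPTYPE_NORMAL, VM_SUBSYS_UNKNOWN,
                            VM_PROT_ALL, VM_PROT_ALL, 0));
}
#endif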
1632
1633 /*
1634  * Simplify the given map entry by merging with either neighbor.  This
1635  * routine also has the ability to merge with both neighbors.
1636  *
1637  * This routine guarantees that the passed entry remains valid (though
1638  * possibly extended).  When merging, this routine may delete one or
1639  * both neighbors.  No action is taken on entries which have their
1640  * in-transition flag set.
1641  *
1642  * The map must be exclusively locked.
1643  */
1644 void
1645 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1646 {
1647         vm_map_entry_t next, prev;
1648         vm_size_t prevsize, esize;
1649
1650         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1651                 ++mycpu->gd_cnt.v_intrans_coll;
1652                 return;
1653         }
1654
1655         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1656                 return;
1657         if (entry->maptype == VM_MAPTYPE_UKSMAP)
1658                 return;
1659
1660         prev = vm_map_rb_tree_RB_PREV(entry);
1661         if (prev) {
1662                 prevsize = prev->end - prev->start;
1663                 if ( (prev->end == entry->start) &&
1664                      (prev->maptype == entry->maptype) &&
1665                      (prev->ba.object == entry->ba.object) &&
1666                      (prev->ba.backing_ba == entry->ba.backing_ba) &&
1667                      (!prev->ba.object ||
1668                         (prev->ba.offset + prevsize == entry->ba.offset)) &&
1669                      (prev->eflags == entry->eflags) &&
1670                      (prev->protection == entry->protection) &&
1671                      (prev->max_protection == entry->max_protection) &&
1672                      (prev->inheritance == entry->inheritance) &&
1673                      (prev->id == entry->id) &&
1674                      (prev->wired_count == entry->wired_count)) {
1675                         vm_map_entry_unlink(map, prev);
1676                         entry->start = prev->start;
1677                         entry->ba.offset = prev->ba.offset;
1678                         vm_map_entry_dispose(map, prev, countp);
1679                 }
1680         }
1681
1682         next = vm_map_rb_tree_RB_NEXT(entry);
1683         if (next) {
1684                 esize = entry->end - entry->start;
1685                 if ((entry->end == next->start) &&
1686                     (next->maptype == entry->maptype) &&
1687                     (next->ba.object == entry->ba.object) &&
1688                      (next->ba.backing_ba == entry->ba.backing_ba) &&
1689                      (!entry->ba.object ||
1690                         (entry->ba.offset + esize == next->ba.offset)) &&
1691                     (next->eflags == entry->eflags) &&
1692                     (next->protection == entry->protection) &&
1693                     (next->max_protection == entry->max_protection) &&
1694                     (next->inheritance == entry->inheritance) &&
1695                     (next->id == entry->id) &&
1696                     (next->wired_count == entry->wired_count)) {
1697                         vm_map_entry_unlink(map, next);
1698                         entry->end = next->end;
1699                         vm_map_entry_dispose(map, next, countp);
1700                 }
1701         }
1702 }
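
/*
 * Example of the merge rules above: adjacent entries [A,B) and [B,C) backed
 * by the same object merge only when the second entry's ba.offset equals
 * the first entry's ba.offset plus (B - A), i.e. the object ranges are
 * contiguous as well as the virtual ranges, and all other attributes match.
 */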
1703
1704 /*
1705  * Asserts that the given entry begins at or after the specified address.
1706  * If necessary, it splits the entry into two.
1707  */
1708 #define vm_map_clip_start(map, entry, startaddr, countp)                \
1709 {                                                                       \
1710         if (startaddr > entry->start)                                   \
1711                 _vm_map_clip_start(map, entry, startaddr, countp);      \
1712 }
1713
1714 /*
1715  * This routine is called only when it is known that the entry must be split.
1716  *
1717  * The map must be exclusively locked.
1718  */
1719 static void
1720 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1721                    int *countp)
1722 {
1723         vm_map_entry_t new_entry;
1724
1725         /*
1726          * Split off the front portion -- note that we must insert the new
1727          * entry BEFORE this one, so that this entry has the specified
1728          * starting address.
1729          */
1730
1731         vm_map_simplify_entry(map, entry, countp);
1732
1733         /*
1734          * If there is no object backing this entry, we might as well create
1735          * one now.  If we defer it, an object can get created after the map
1736          * is clipped, and individual objects will be created for the split-up
1737          * map.  This is a bit of a hack, but is also about the best place to
1738          * put this improvement.
1739          */
1740         if (entry->ba.object == NULL && !map->system_map &&
1741             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1742                 vm_map_entry_allocate_object(entry);
1743         }
1744
1745         new_entry = vm_map_entry_create(map, countp);
1746         *new_entry = *entry;
1747
1748         new_entry->end = start;
1749         entry->ba.offset += (start - entry->start);
1750         entry->start = start;
1751         if (new_entry->ba.backing_ba)
1752                 atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
1753
1754         vm_map_entry_link(map, new_entry);
1755
1756         switch(entry->maptype) {
1757         case VM_MAPTYPE_NORMAL:
1758         case VM_MAPTYPE_VPAGETABLE:
1759                 if (new_entry->ba.object) {
1760                         vm_object_hold(new_entry->ba.object);
1761                         vm_object_reference_locked(new_entry->ba.object);
1762                         vm_object_drop(new_entry->ba.object);
1763                 }
1764                 break;
1765         default:
1766                 break;
1767         }
1768 }
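
/*
 * Example: clipping an entry covering [0x1000, 0x5000) at 0x3000 links a
 * new entry [0x1000, 0x3000) in front of it, while the original entry
 * becomes [0x3000, 0x5000) with its ba.offset advanced by 0x2000.
 */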
1769
1770 /*
1771  * Asserts that the given entry ends at or before the specified address.
1772  * If necessary, it splits the entry into two.
1773  *
1774  * The map must be exclusively locked.
1775  */
1776 #define vm_map_clip_end(map, entry, endaddr, countp)            \
1777 {                                                               \
1778         if (endaddr < entry->end)                               \
1779                 _vm_map_clip_end(map, entry, endaddr, countp);  \
1780 }
1781
1782 /*
1783  * This routine is called only when it is known that the entry must be split.
1784  *
1785  * The map must be exclusively locked.
1786  */
1787 static void
1788 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1789                  int *countp)
1790 {
1791         vm_map_entry_t new_entry;
1792
1793         /*
1794          * If there is no object backing this entry, we might as well create
1795          * one now.  If we defer it, an object can get created after the map
1796          * is clipped, and individual objects will be created for the split-up
1797          * map.  This is a bit of a hack, but is also about the best place to
1798          * put this improvement.
1799          */
1800
1801         if (entry->ba.object == NULL && !map->system_map &&
1802             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1803                 vm_map_entry_allocate_object(entry);
1804         }
1805
1806         /*
1807          * Create a new entry and insert it AFTER the specified entry
1808          */
1809         new_entry = vm_map_entry_create(map, countp);
1810         *new_entry = *entry;
1811
1812         new_entry->start = entry->end = end;
1813         new_entry->ba.offset += (end - entry->start);
1814         if (new_entry->ba.backing_ba)
1815                 atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
1816
1817         vm_map_entry_link(map, new_entry);
1818
1819         switch(entry->maptype) {
1820         case VM_MAPTYPE_NORMAL:
1821         case VM_MAPTYPE_VPAGETABLE:
1822                 if (new_entry->ba.object) {
1823                         vm_object_hold(new_entry->ba.object);
1824                         vm_object_reference_locked(new_entry->ba.object);
1825                         vm_object_drop(new_entry->ba.object);
1826                 }
1827                 break;
1828         default:
1829                 break;
1830         }
1831 }
1832
1833 /*
1834  * Asserts that the starting and ending region addresses fall within the
1835  * valid range for the map.
1836  */
1837 #define VM_MAP_RANGE_CHECK(map, start, end)     \
1838 {                                               \
1839         if (start < vm_map_min(map))            \
1840                 start = vm_map_min(map);        \
1841         if (end > vm_map_max(map))              \
1842                 end = vm_map_max(map);          \
1843         if (start > end)                        \
1844                 start = end;                    \
1845 }
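
/*
 * Example: with a map spanning [min, max), a request for [min - X, max + Y)
 * is clamped to [min, max), and an inverted request collapses to an empty
 * range at 'end'.
 */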
1846
1847 /*
1848  * Used to block when an in-transition collision occurs.  The map
1849  * is unlocked for the sleep and relocked before the return.
1850  */
1851 void
1852 vm_map_transition_wait(vm_map_t map, int relock)
1853 {
1854         tsleep_interlock(map, 0);
1855         vm_map_unlock(map);
1856         tsleep(map, PINTERLOCKED, "vment", 0);
1857         if (relock)
1858                 vm_map_lock(map);
1859 }
1860
1861 /*
1862  * When we do blocking operations with the map lock held it is
1863  * possible that a clip might have occurred on our in-transit entry,
1864  * requiring an adjustment to the entry in our loop.  These macros
1865  * help the pageable and clip_range code deal with the case.  The
1866  * conditional costs virtually nothing if no clipping has occurred.
1867  */
1868
1869 #define CLIP_CHECK_BACK(entry, save_start)                      \
1870     do {                                                        \
1871             while (entry->start != save_start) {                \
1872                     entry = vm_map_rb_tree_RB_PREV(entry);      \
1873                     KASSERT(entry, ("bad entry clip"));         \
1874             }                                                   \
1875     } while(0)
1876
1877 #define CLIP_CHECK_FWD(entry, save_end)                         \
1878     do {                                                        \
1879             while (entry->end != save_end) {                    \
1880                     entry = vm_map_rb_tree_RB_NEXT(entry);      \
1881                     KASSERT(entry, ("bad entry clip"));         \
1882             }                                                   \
1883     } while(0)
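
/*
 * Example: if an entry spanning [A,C) was clipped into [A,B) and [B,C)
 * while we slept, a stale pointer into the sequence is re-synchronized by
 * walking backwards until entry->start matches the saved start (A), or
 * forwards until entry->end matches the saved end (C).
 */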
1884
1885
1886 /*
1887  * Clip the specified range and return the base entry.  The
1888  * range may cover several entries starting at the returned base
1889  * and the first and last entry in the covering sequence will be
1890  * properly clipped to the requested start and end address.
1891  *
1892  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1893  * flag.
1894  *
1895  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1896  * covered by the requested range.
1897  *
1898  * The map must be exclusively locked on entry and will remain locked
1899  * on return. If no range exists or the range contains holes and you
1900  * specified that no holes were allowed, NULL will be returned.  This
1901  * routine may temporarily unlock the map in order to avoid a deadlock when
1902  * sleeping.
1903  */
1904 static
1905 vm_map_entry_t
1906 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end, 
1907                   int *countp, int flags)
1908 {
1909         vm_map_entry_t start_entry;
1910         vm_map_entry_t entry;
1911         vm_map_entry_t next;
1912
1913         /*
1914          * Locate the entry and effect initial clipping.  The in-transition
1915          * case does not occur very often so do not try to optimize it.
1916          */
1917 again:
1918         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1919                 return (NULL);
1920         entry = start_entry;
1921         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1922                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1923                 ++mycpu->gd_cnt.v_intrans_coll;
1924                 ++mycpu->gd_cnt.v_intrans_wait;
1925                 vm_map_transition_wait(map, 1);
1926                 /*
1927                  * entry and/or start_entry may have been clipped while
1928                  * we slept, or may have gone away entirely.  We have
1929                  * to restart from the lookup.
1930                  */
1931                 goto again;
1932         }
1933
1934         /*
1935          * Since we hold an exclusive map lock we do not have to restart
1936          * after clipping, even though clipping may block in zalloc.
1937          */
1938         vm_map_clip_start(map, entry, start, countp);
1939         vm_map_clip_end(map, entry, end, countp);
1940         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1941
1942         /*
1943          * Scan entries covered by the range.  When working on the next
1944          * entry a restart need only re-loop on the current entry which
1945          * we have already locked, since 'next' may have changed.  Also,
1946          * even though entry is safe, it may have been clipped so we
1947          * have to iterate forwards through the clip after sleeping.
1948          */
1949         for (;;) {
1950                 next = vm_map_rb_tree_RB_NEXT(entry);
1951                 if (next == NULL || next->start >= end)
1952                         break;
1953                 if (flags & MAP_CLIP_NO_HOLES) {
1954                         if (next->start > entry->end) {
1955                                 vm_map_unclip_range(map, start_entry,
1956                                         start, entry->end, countp, flags);
1957                                 return(NULL);
1958                         }
1959                 }
1960
1961                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1962                         vm_offset_t save_end = entry->end;
1963                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1964                         ++mycpu->gd_cnt.v_intrans_coll;
1965                         ++mycpu->gd_cnt.v_intrans_wait;
1966                         vm_map_transition_wait(map, 1);
1967
1968                         /*
1969                          * clips might have occurred while we blocked.
1970                          */
1971                         CLIP_CHECK_FWD(entry, save_end);
1972                         CLIP_CHECK_BACK(start_entry, start);
1973                         continue;
1974                 }
1975
1976                 /*
1977                  * No restart necessary even though clip_end may block, we
1978                  * are holding the map lock.
1979                  */
1980                 vm_map_clip_end(map, next, end, countp);
1981                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1982                 entry = next;
1983         }
1984         if (flags & MAP_CLIP_NO_HOLES) {
1985                 if (entry->end != end) {
1986                         vm_map_unclip_range(map, start_entry,
1987                                 start, entry->end, countp, flags);
1988                         return(NULL);
1989                 }
1990         }
1991         return(start_entry);
1992 }
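
/*
 * Typical usage pattern (see vm_map_unwire() below): clip the range with
 * MAP_CLIP_NO_HOLES, operate on the in-transition entries covering
 * [start, end), then undo the clip with vm_map_unclip_range() using the
 * same range and flags.
 */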
1993
1994 /*
1995  * Undo the effect of vm_map_clip_range().  You should pass the same
1996  * flags and the same range that you passed to vm_map_clip_range().
1997  * This code will clear the in-transition flag on the entries and
1998  * wake up anyone waiting.  This code will also simplify the sequence
1999  * and attempt to merge it with entries before and after the sequence.
2000  *
2001  * The map must be locked on entry and will remain locked on return.
2002  *
2003  * Note that you should also pass the start_entry returned by
2004  * vm_map_clip_range().  However, if you block between the two calls
2005  * with the map unlocked please be aware that the start_entry may
2006  * have been clipped and you may need to scan it backwards to find
2007  * the entry corresponding with the original start address.  You are
2008  * responsible for this, vm_map_unclip_range() expects the correct
2009  * start_entry to be passed to it and will KASSERT otherwise.
2010  */
2011 static
2012 void
2013 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
2014                     vm_offset_t start, vm_offset_t end,
2015                     int *countp, int flags)
2016 {
2017         vm_map_entry_t entry;
2018
2019         entry = start_entry;
2020
2021         KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
2022         while (entry && entry->start < end) {
2023                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2024                         ("in-transition flag not set during unclip on: %p",
2025                         entry));
2026                 KASSERT(entry->end <= end,
2027                         ("unclip_range: tail wasn't clipped"));
2028                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2029                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2030                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2031                         wakeup(map);
2032                 }
2033                 entry = vm_map_rb_tree_RB_NEXT(entry);
2034         }
2035
2036         /*
2037          * Simplification does not block so there is no restart case.
2038          */
2039         entry = start_entry;
2040         while (entry && entry->start < end) {
2041                 vm_map_simplify_entry(map, entry, countp);
2042                 entry = vm_map_rb_tree_RB_NEXT(entry);
2043         }
2044 }
2045
2046 /*
2047  * Mark the given range as handled by a subordinate map.
2048  *
2049  * This range must have been created with vm_map_find(), and no other
2050  * operations may have been performed on this range prior to calling
2051  * vm_map_submap().
2052  *
2053  * Submappings cannot be removed.
2054  *
2055  * No requirements.
2056  */
2057 int
2058 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
2059 {
2060         vm_map_entry_t entry;
2061         int result = KERN_INVALID_ARGUMENT;
2062         int count;
2063
2064         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2065         vm_map_lock(map);
2066
2067         VM_MAP_RANGE_CHECK(map, start, end);
2068
2069         if (vm_map_lookup_entry(map, start, &entry)) {
2070                 vm_map_clip_start(map, entry, start, &count);
2071         } else if (entry) {
2072                 entry = vm_map_rb_tree_RB_NEXT(entry);
2073         } else {
2074                 entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2075         }
2076
2077         vm_map_clip_end(map, entry, end, &count);
2078
2079         if ((entry->start == start) && (entry->end == end) &&
2080             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2081             (entry->ba.object == NULL)) {
2082                 entry->ba.sub_map = submap;
2083                 entry->maptype = VM_MAPTYPE_SUBMAP;
2084                 result = KERN_SUCCESS;
2085         }
2086         vm_map_unlock(map);
2087         vm_map_entry_release(count);
2088
2089         return (result);
2090 }
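
/*
 * Example: a typical caller first obtains the region with vm_map_find()
 * and then installs the submap over exactly that range, so that later
 * lookups within the range are redirected to the subordinate map.
 */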
2091
2092 /*
2093  * Sets the protection of the specified address region in the target map. 
2094  * If "set_max" is specified, the maximum protection is to be set;
2095  * otherwise, only the current protection is affected.
2096  *
2097  * The protection is not applicable to submaps, but is applicable to normal
2098  * maps and maps governed by virtual page tables.  For example, when operating
2099  * on a virtual page table our protection basically controls how COW occurs
2100  * on the backing object, whereas the virtual page table abstraction itself
2101  * is an abstraction for userland.
2102  *
2103  * No requirements.
2104  */
2105 int
2106 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2107                vm_prot_t new_prot, boolean_t set_max)
2108 {
2109         vm_map_entry_t current;
2110         vm_map_entry_t entry;
2111         int count;
2112
2113         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2114         vm_map_lock(map);
2115
2116         VM_MAP_RANGE_CHECK(map, start, end);
2117
2118         if (vm_map_lookup_entry(map, start, &entry)) {
2119                 vm_map_clip_start(map, entry, start, &count);
2120         } else if (entry) {
2121                 entry = vm_map_rb_tree_RB_NEXT(entry);
2122         } else {
2123                 entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2124         }
2125
2126         /*
2127          * Make a first pass to check for protection violations.
2128          */
2129         current = entry;
2130         while (current && current->start < end) {
2131                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2132                         vm_map_unlock(map);
2133                         vm_map_entry_release(count);
2134                         return (KERN_INVALID_ARGUMENT);
2135                 }
2136                 if ((new_prot & current->max_protection) != new_prot) {
2137                         vm_map_unlock(map);
2138                         vm_map_entry_release(count);
2139                         return (KERN_PROTECTION_FAILURE);
2140                 }
2141
2142                 /*
2143                  * When making a SHARED+RW file mmap writable, update
2144                  * v_lastwrite_ts.
2145                  */
2146                 if (new_prot & PROT_WRITE &&
2147                     (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
2148                     (current->maptype == VM_MAPTYPE_NORMAL ||
2149                      current->maptype == VM_MAPTYPE_VPAGETABLE) &&
2150                     current->ba.object &&
2151                     current->ba.object->type == OBJT_VNODE) {
2152                         struct vnode *vp;
2153
2154                         vp = current->ba.object->handle;
2155                         if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
2156                                 vfs_timestamp(&vp->v_lastwrite_ts);
2157                                 vsetflags(vp, VLASTWRITETS);
2158                                 vn_unlock(vp);
2159                         }
2160                 }
2161                 current = vm_map_rb_tree_RB_NEXT(current);
2162         }
2163
2164         /*
2165          * Go back and fix up protections. [Note that clipping is not
2166          * necessary the second time.]
2167          */
2168         current = entry;
2169
2170         while (current && current->start < end) {
2171                 vm_prot_t old_prot;
2172
2173                 vm_map_clip_end(map, current, end, &count);
2174
2175                 old_prot = current->protection;
2176                 if (set_max) {
2177                         current->max_protection = new_prot;
2178                         current->protection = new_prot & old_prot;
2179                 } else {
2180                         current->protection = new_prot;
2181                 }
2182
2183                 /*
2184                  * Update physical map if necessary. Worry about copy-on-write
2185                  * here -- CHECK THIS XXX
2186                  */
2187                 if (current->protection != old_prot) {
2188 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2189                                                         VM_PROT_ALL)
2190
2191                         pmap_protect(map->pmap, current->start,
2192                             current->end,
2193                             current->protection & MASK(current));
2194 #undef  MASK
2195                 }
2196
2197                 vm_map_simplify_entry(map, current, &count);
2198
2199                 current = vm_map_rb_tree_RB_NEXT(current);
2200         }
2201         vm_map_unlock(map);
2202         vm_map_entry_release(count);
2203         return (KERN_SUCCESS);
2204 }
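
/*
 * Example (illustrative sketch only): lowering the current protection of a
 * range to read-only while leaving max_protection untouched.  The helper
 * name is hypothetical.
 */
#if 0
static int
example_make_readonly(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        /* set_max == FALSE: only the active protection is changed */
        return (vm_map_protect(map, start, end, VM_PROT_READ, FALSE));
}
#endif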
2205
2206 /*
2207  * This routine traverses a process's map handling the madvise
2208  * system call.  Advisories are classified as either those affecting
2209  * the vm_map_entry structure, or those affecting the underlying
2210  * objects.
2211  *
2212  * The <value> argument is used for extended madvise calls.
2213  *
2214  * No requirements.
2215  */
2216 int
2217 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2218                int behav, off_t value)
2219 {
2220         vm_map_entry_t current, entry;
2221         int modify_map = 0;
2222         int error = 0;
2223         int count;
2224
2225         /*
2226          * Some madvise calls directly modify the vm_map_entry, in which case
2227          * we need to use an exclusive lock on the map and we need to perform 
2228          * various clipping operations.  Otherwise we only need a read-lock
2229          * on the map.
2230          */
2231         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2232
2233         switch(behav) {
2234         case MADV_NORMAL:
2235         case MADV_SEQUENTIAL:
2236         case MADV_RANDOM:
2237         case MADV_NOSYNC:
2238         case MADV_AUTOSYNC:
2239         case MADV_NOCORE:
2240         case MADV_CORE:
2241         case MADV_SETMAP:
2242                 modify_map = 1;
2243                 vm_map_lock(map);
2244                 break;
2245         case MADV_INVAL:
2246         case MADV_WILLNEED:
2247         case MADV_DONTNEED:
2248         case MADV_FREE:
2249                 vm_map_lock_read(map);
2250                 break;
2251         default:
2252                 vm_map_entry_release(count);
2253                 return (EINVAL);
2254         }
2255
2256         /*
2257          * Locate starting entry and clip if necessary.
2258          */
2259
2260         VM_MAP_RANGE_CHECK(map, start, end);
2261
2262         if (vm_map_lookup_entry(map, start, &entry)) {
2263                 if (modify_map)
2264                         vm_map_clip_start(map, entry, start, &count);
2265         } else if (entry) {
2266                 entry = vm_map_rb_tree_RB_NEXT(entry);
2267         } else {
2268                 entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2269         }
2270
2271         if (modify_map) {
2272                 /*
2273                  * madvise behaviors that are implemented in the vm_map_entry.
2274                  *
2275                  * We clip the vm_map_entry so that behavioral changes are
2276                  * limited to the specified address range.
2277                  */
2278                 for (current = entry;
2279                      current && current->start < end;
2280                      current = vm_map_rb_tree_RB_NEXT(current)) {
2281                         /*
2282                          * Ignore submaps
2283                          */
2284                         if (current->maptype == VM_MAPTYPE_SUBMAP)
2285                                 continue;
2286
2287                         vm_map_clip_end(map, current, end, &count);
2288
2289                         switch (behav) {
2290                         case MADV_NORMAL:
2291                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2292                                 break;
2293                         case MADV_SEQUENTIAL:
2294                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2295                                 break;
2296                         case MADV_RANDOM:
2297                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2298                                 break;
2299                         case MADV_NOSYNC:
2300                                 current->eflags |= MAP_ENTRY_NOSYNC;
2301                                 break;
2302                         case MADV_AUTOSYNC:
2303                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2304                                 break;
2305                         case MADV_NOCORE:
2306                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2307                                 break;
2308                         case MADV_CORE:
2309                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2310                                 break;
2311                         case MADV_SETMAP:
2312                                 /*
2313                                  * Set the page directory page for a map
2314                                  * governed by a virtual page table.  Mark
2315                                  * the entry as being governed by a virtual
2316                                  * page table if it is not.
2317                                  *
2318                                  * XXX the page directory page is stored
2319                                  * in the avail_ssize field of the map_entry.
2320                                  *
2321                                  * XXX the map simplification code does not
2322                                  * compare this field so weird things may
2323                                  * happen if you do not apply this function
2324                                  * to the entire mapping governed by the
2325                                  * virtual page table.
2326                                  */
2327                                 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
2328                                         error = EINVAL;
2329                                         break;
2330                                 }
2331                                 current->aux.master_pde = value;
2332                                 pmap_remove(map->pmap,
2333                                             current->start, current->end);
2334                                 break;
2335                         case MADV_INVAL:
2336                                 /*
2337                                  * Invalidate the related pmap entries, used
2338                                  * to flush portions of the real kernel's
2339                                  * pmap when the caller has removed or
2340                                  * modified existing mappings in a virtual
2341                                  * page table.
2342                                  *
2343                                  * (exclusive locked map version does not
2344                                  * need the range interlock).
2345                                  */
2346                                 pmap_remove(map->pmap,
2347                                             current->start, current->end);
2348                                 break;
2349                         default:
2350                                 error = EINVAL;
2351                                 break;
2352                         }
2353                         vm_map_simplify_entry(map, current, &count);
2354                 }
2355                 vm_map_unlock(map);
2356         } else {
2357                 vm_pindex_t pindex;
2358                 vm_pindex_t delta;
2359
2360                 /*
2361                  * madvise behaviors that are implemented in the underlying
2362                  * vm_object.
2363                  *
2364                  * Since we don't clip the vm_map_entry, we have to clip
2365                  * the vm_object pindex and count.
2366                  *
2367                  * NOTE!  These functions are only supported on normal maps,
2368                  *        except MADV_INVAL which is also supported on
2369                  *        virtual page tables.
2370                  *
2371                  * NOTE!  These functions only apply to the top-most object.
2372                  *        It is not applicable to backing objects.
2373                  */
2374                 for (current = entry;
2375                      current && current->start < end;
2376                      current = vm_map_rb_tree_RB_NEXT(current)) {
2377                         vm_offset_t useStart;
2378
2379                         if (current->maptype != VM_MAPTYPE_NORMAL &&
2380                             (current->maptype != VM_MAPTYPE_VPAGETABLE ||
2381                              behav != MADV_INVAL)) {
2382                                 continue;
2383                         }
2384
2385                         pindex = OFF_TO_IDX(current->ba.offset);
2386                         delta = atop(current->end - current->start);
2387                         useStart = current->start;
2388
2389                         if (current->start < start) {
2390                                 pindex += atop(start - current->start);
2391                                 delta -= atop(start - current->start);
2392                                 useStart = start;
2393                         }
2394                         if (current->end > end)
2395                                 delta -= atop(current->end - end);
2396
2397                         if ((vm_spindex_t)delta <= 0)
2398                                 continue;
2399
2400                         if (behav == MADV_INVAL) {
2401                                 /*
2402                                  * Invalidate the related pmap entries, used
2403                                  * to flush portions of the real kernel's
2404                                  * pmap when the caller has removed or
2405                                  * modified existing mappings in a virtual
2406                                  * page table.
2407                                  *
2408                                  * (shared locked map version needs the
2409                                  * interlock, see vm_fault()).
2410                                  */
2411                                 struct vm_map_ilock ilock;
2412
2413                                 KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2414                                             useStart + ptoa(delta) <=
2415                                             VM_MAX_USER_ADDRESS,
2416                                          ("Bad range %016jx-%016jx (%016jx)",
2417                                          useStart, useStart + ptoa(delta),
2418                                          delta));
2419                                 vm_map_interlock(map, &ilock,
2420                                                  useStart,
2421                                                  useStart + ptoa(delta));
2422                                 pmap_remove(map->pmap,
2423                                             useStart,
2424                                             useStart + ptoa(delta));
2425                                 vm_map_deinterlock(map, &ilock);
2426                         } else {
2427                                 vm_object_madvise(current->ba.object,
2428                                                   pindex, delta, behav);
2429                         }
2430
2431                         /*
2432                          * Try to populate the page table.  Mappings governed
2433                          * by virtual page tables cannot be pre-populated
2434                          * without a lot of work so don't try.
2435                          */
2436                         if (behav == MADV_WILLNEED &&
2437                             current->maptype != VM_MAPTYPE_VPAGETABLE) {
2438                                 pmap_object_init_pt(
2439                                     map->pmap, 
2440                                     useStart,
2441                                     current->protection,
2442                                     current->ba.object,
2443                                     pindex, 
2444                                     (delta << PAGE_SHIFT),
2445                                     MAP_PREFAULT_MADVISE
2446                                 );
2447                         }
2448                 }
2449                 vm_map_unlock_read(map);
2450         }
2451         vm_map_entry_release(count);
2452         return(error);
2453 }
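
/*
 * Example (illustrative sketch only): requesting prefault of a range.
 * MADV_WILLNEED takes the shared-lock path above; the extended 'value'
 * argument is only consumed by MADV_SETMAP.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_willneed(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        return (vm_map_madvise(map, start, end, MADV_WILLNEED, 0));
}
#endif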
2454
2455
2456 /*
2457  * Sets the inheritance of the specified address range in the target map.
2458  * Inheritance affects how the map will be shared with child maps at the
2459  * time of vm_map_fork.
2460  */
2461 int
2462 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2463                vm_inherit_t new_inheritance)
2464 {
2465         vm_map_entry_t entry;
2466         vm_map_entry_t temp_entry;
2467         int count;
2468
2469         switch (new_inheritance) {
2470         case VM_INHERIT_NONE:
2471         case VM_INHERIT_COPY:
2472         case VM_INHERIT_SHARE:
2473                 break;
2474         default:
2475                 return (KERN_INVALID_ARGUMENT);
2476         }
2477
2478         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2479         vm_map_lock(map);
2480
2481         VM_MAP_RANGE_CHECK(map, start, end);
2482
2483         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2484                 entry = temp_entry;
2485                 vm_map_clip_start(map, entry, start, &count);
2486         } else if (temp_entry) {
2487                 entry = vm_map_rb_tree_RB_NEXT(temp_entry);
2488         } else {
2489                 entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2490         }
2491
2492         while (entry && entry->start < end) {
2493                 vm_map_clip_end(map, entry, end, &count);
2494
2495                 entry->inheritance = new_inheritance;
2496
2497                 vm_map_simplify_entry(map, entry, &count);
2498
2499                 entry = vm_map_rb_tree_RB_NEXT(entry);
2500         }
2501         vm_map_unlock(map);
2502         vm_map_entry_release(count);
2503         return (KERN_SUCCESS);
2504 }
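
/*
 * Example (illustrative sketch only): marking a range so that a child map
 * created at fork time shares these pages with the parent instead of
 * copying them.  The helper name is hypothetical.
 */
#if 0
static int
example_share_on_fork(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        return (vm_map_inherit(map, start, end, VM_INHERIT_SHARE));
}
#endif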
2505
2506 /*
2507  * Implement the semantics of mlock
2508  */
2509 int
2510 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2511               boolean_t new_pageable)
2512 {
2513         vm_map_entry_t entry;
2514         vm_map_entry_t start_entry;
2515         vm_offset_t end;
2516         int rv = KERN_SUCCESS;
2517         int count;
2518
2519         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2520         vm_map_lock(map);
2521         VM_MAP_RANGE_CHECK(map, start, real_end);
2522         end = real_end;
2523
2524         start_entry = vm_map_clip_range(map, start, end, &count,
2525                                         MAP_CLIP_NO_HOLES);
2526         if (start_entry == NULL) {
2527                 vm_map_unlock(map);
2528                 vm_map_entry_release(count);
2529                 return (KERN_INVALID_ADDRESS);
2530         }
2531
2532         if (new_pageable == 0) {
2533                 entry = start_entry;
2534                 while (entry && entry->start < end) {
2535                         vm_offset_t save_start;
2536                         vm_offset_t save_end;
2537
2538                         /*
2539                          * Already user wired or hard wired (trivial cases)
2540                          */
2541                         if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2542                                 entry = vm_map_rb_tree_RB_NEXT(entry);
2543                                 continue;
2544                         }
2545                         if (entry->wired_count != 0) {
2546                                 entry->wired_count++;
2547                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2548                                 entry = vm_map_rb_tree_RB_NEXT(entry);
2549                                 continue;
2550                         }
2551
2552                         /*
2553                          * A new wiring requires instantiation of appropriate
2554                          * management structures and the faulting in of the
2555                          * page.
2556                          */
2557                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2558                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2559                                 int copyflag = entry->eflags &
2560                                                MAP_ENTRY_NEEDS_COPY;
2561                                 if (copyflag && ((entry->protection &
2562                                                   VM_PROT_WRITE) != 0)) {
2563                                         vm_map_entry_shadow(entry, 0);
2564                                 } else if (entry->ba.object == NULL &&
2565                                            !map->system_map) {
2566                                         vm_map_entry_allocate_object(entry);
2567                                 }
2568                         }
2569                         entry->wired_count++;
2570                         entry->eflags |= MAP_ENTRY_USER_WIRED;
2571
2572                         /*
2573                          * Now fault in the area.  Note that vm_fault_wire()
2574                          * may release the map lock temporarily; it will be
2575                          * relocked on return.  The in-transition
2576                          * flag protects the entries. 
2577                          */
2578                         save_start = entry->start;
2579                         save_end = entry->end;
2580                         rv = vm_fault_wire(map, entry, TRUE, 0);
2581                         if (rv) {
2582                                 CLIP_CHECK_BACK(entry, save_start);
2583                                 for (;;) {
2584                                         KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2585                                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2586                                         entry->wired_count = 0;
2587                                         if (entry->end == save_end)
2588                                                 break;
2589                                         entry = vm_map_rb_tree_RB_NEXT(entry);
2590                                         KASSERT(entry,
2591                                              ("bad entry clip during backout"));
2592                                 }
2593                                 end = save_start;       /* unwire the rest */
2594                                 break;
2595                         }
2596                         /*
2597                          * note that even though the entry might have been
2598                          * clipped, the USER_WIRED flag we set prevents
2599                          * duplication so we do not have to do a 
2600                          * clip check.
2601                          */
2602                         entry = vm_map_rb_tree_RB_NEXT(entry);
2603                 }
2604
2605                 /*
2606                  * If we failed fall through to the unwiring section to
2607                  * unwire what we had wired so far.  'end' has already
2608                  * been adjusted.
2609                  */
2610                 if (rv)
2611                         new_pageable = 1;
2612
2613                 /*
2614                  * start_entry might have been clipped if we unlocked the
2615                  * map and blocked.  No matter how clipped it has gotten
2616                  * there should be a fragment that is on our start boundary.
2617                  */
2618                 CLIP_CHECK_BACK(start_entry, start);
2619         }
2620
2621         /*
2622          * Deal with the unwiring case.
2623          */
2624         if (new_pageable) {
2625                 /*
2626                  * This is the unwiring case.  We must first ensure that the
2627                  * range to be unwired is really wired down.  We know there
2628                  * are no holes.
2629                  */
2630                 entry = start_entry;
2631                 while (entry && entry->start < end) {
2632                         if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2633                                 rv = KERN_INVALID_ARGUMENT;
2634                                 goto done;
2635                         }
2636                         KASSERT(entry->wired_count != 0,
2637                                 ("wired count was 0 with USER_WIRED set! %p",
2638                                  entry));
2639                         entry = vm_map_rb_tree_RB_NEXT(entry);
2640                 }
2641
2642                 /*
2643                  * Now decrement the wiring count for each region. If a region
2644                  * becomes completely unwired, unwire its physical pages and
2645                  * mappings.
2646                  */
2647                 /*
2648                  * Note that the loop variable is reset to start_entry
2649                  * here rather than being left where the validation
2650                  * loop above finished.  If it were not reset, this
2651                  * second loop would never be entered and the pages
2652                  * backing the entries would never be unwired, leaking
2653                  * wired pages.  With the reset, each entry in the
2654                  * clipped range has its user wiring dropped and is
2655                  * unwired once its wired count reaches zero.
2656                  */
2657                 entry = start_entry;
2658                 while (entry && entry->start < end) {
2659                         KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2660                                 ("expected USER_WIRED on entry %p", entry));
2661                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2662                         entry->wired_count--;
2663                         if (entry->wired_count == 0)
2664                                 vm_fault_unwire(map, entry);
2665                         entry = vm_map_rb_tree_RB_NEXT(entry);
2666                 }
2667         }
2668 done:
2669         vm_map_unclip_range(map, start_entry, start, real_end, &count,
2670                 MAP_CLIP_NO_HOLES);
2671         vm_map_unlock(map);
2672         vm_map_entry_release(count);
2673
2674         return (rv);
2675 }
2676
2677 /*
2678  * Sets the pageability of the specified address range in the target map.
2679  * Regions specified as not pageable require locked-down physical
2680  * memory and physical page maps.
2681  *
2682  * The map must not be locked, but a reference must remain to the map
2683  * throughout the call.
2684  *
2685  * This function may be called via the zalloc path and must properly
2686  * reserve map entries for kernel_map.
2687  *
2688  * No requirements.
2689  */
2690 int
2691 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2692 {
2693         vm_map_entry_t entry;
2694         vm_map_entry_t start_entry;
2695         vm_offset_t end;
2696         int rv = KERN_SUCCESS;
2697         int count;
2698
2699         if (kmflags & KM_KRESERVE)
2700                 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2701         else
2702                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2703         vm_map_lock(map);
2704         VM_MAP_RANGE_CHECK(map, start, real_end);
2705         end = real_end;
2706
2707         start_entry = vm_map_clip_range(map, start, end, &count,
2708                                         MAP_CLIP_NO_HOLES);
2709         if (start_entry == NULL) {
2710                 vm_map_unlock(map);
2711                 rv = KERN_INVALID_ADDRESS;
2712                 goto failure;
2713         }
2714         if ((kmflags & KM_PAGEABLE) == 0) {
2715                 /*
2716                  * Wiring.  
2717                  *
2718                  * 1.  Holding the write lock, we create any shadow or zero-fill
2719                  * objects that need to be created. Then we clip each map
2720                  * entry to the region to be wired and increment its wiring
2721                  * count.  We create objects before clipping the map entries
2722                  * to avoid object proliferation.
2723                  *
2724                  * 2.  We downgrade to a read lock, and call vm_fault_wire to
2725                  * fault in the pages for any newly wired area (wired_count is
2726                  * 1).
2727                  *
2728                  * Downgrading to a read lock for vm_fault_wire avoids a 
2729                  * possible deadlock with another process that may have faulted
2730                  * on one of the pages to be wired (it would mark the page busy,
2731                  * blocking us, then in turn block on the map lock that we
2732                  * hold).  Because of problems in the recursive lock package,
2733                  * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2734                  * any actions that require the write lock must be done
2735                  * beforehand.  Because we keep the read lock on the map, the
2736                  * copy-on-write status of the entries we modify here cannot
2737                  * change.
2738                  */
2739                 entry = start_entry;
2740                 while (entry && entry->start < end) {
2741                         /*
2742                          * Trivial case if the entry is already wired
2743                          */
2744                         if (entry->wired_count) {
2745                                 entry->wired_count++;
2746                                 entry = vm_map_rb_tree_RB_NEXT(entry);
2747                                 continue;
2748                         }
2749
2750                         /*
2751                          * The entry is being newly wired, we have to setup
2752                          * appropriate management structures.  A shadow 
2753                          * object is required for a copy-on-write region,
2754                          * or a normal object for a zero-fill region.  We
2755                          * do not have to do this for entries that point to sub
2756                          * maps because we won't hold the lock on the sub map.
2757                          */
2758                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2759                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2760                                 int copyflag = entry->eflags &
2761                                                MAP_ENTRY_NEEDS_COPY;
2762                                 if (copyflag && ((entry->protection &
2763                                                   VM_PROT_WRITE) != 0)) {
2764                                         vm_map_entry_shadow(entry, 0);
2765                                 } else if (entry->ba.object == NULL &&
2766                                            !map->system_map) {
2767                                         vm_map_entry_allocate_object(entry);
2768                                 }
2769                         }
2770                         entry->wired_count++;
2771                         entry = vm_map_rb_tree_RB_NEXT(entry);
2772                 }
2773
2774                 /*
2775                  * Pass 2.
2776                  */
2777
2778                 /*
2779                  * HACK HACK HACK HACK
2780                  *
2781                  * vm_fault_wire() temporarily unlocks the map to avoid
2782                  * deadlocks.  The in-transition flag from vm_map_clip_range
2783                  * call should protect us from changes while the map is
2784                  * unlocked.
2785                  *
2786                  * NOTE: Previously this comment stated that clipping might
2787                  *       still occur while the entry is unlocked, but from
2788                  *       what I can tell it actually cannot.
2789                  *
2790                  *       It is unclear whether the CLIP_CHECK_*() calls
2791                  *       are still needed but we keep them in anyway.
2792                  *
2793                  * HACK HACK HACK HACK
2794                  */
2795
2796                 entry = start_entry;
2797                 while (entry && entry->start < end) {
2798                         /*
2799                          * If vm_fault_wire fails for any page we need to undo
2800                          * what has been done.  We decrement the wiring count
2801                          * for those pages which have not yet been wired (now)
2802                          * and unwire those that have (later).
2803                          */
2804                         vm_offset_t save_start = entry->start;
2805                         vm_offset_t save_end = entry->end;
2806
2807                         if (entry->wired_count == 1)
2808                                 rv = vm_fault_wire(map, entry, FALSE, kmflags);
2809                         if (rv) {
2810                                 CLIP_CHECK_BACK(entry, save_start);
2811                                 for (;;) {
2812                                         KASSERT(entry->wired_count == 1,
2813                                           ("wired_count changed unexpectedly"));
2814                                         entry->wired_count = 0;
2815                                         if (entry->end == save_end)
2816                                                 break;
2817                                         entry = vm_map_rb_tree_RB_NEXT(entry);
2818                                         KASSERT(entry,
2819                                           ("bad entry clip during backout"));
2820                                 }
2821                                 end = save_start;
2822                                 break;
2823                         }
2824                         CLIP_CHECK_FWD(entry, save_end);
2825                         entry = vm_map_rb_tree_RB_NEXT(entry);
2826                 }
2827
2828                 /*
2829                  * If a failure occurred, undo everything by falling through
2830                  * to the unwiring code.  'end' has already been adjusted
2831                  * appropriately.
2832                  */
2833                 if (rv)
2834                         kmflags |= KM_PAGEABLE;
2835
2836                 /*
2837                  * start_entry is still IN_TRANSITION but may have been 
2838                  * clipped since vm_fault_wire() unlocks and relocks the
2839          * map.  No matter how clipped it has gotten, there should
2840                  * be a fragment that is on our start boundary.
2841                  */
2842                 CLIP_CHECK_BACK(start_entry, start);
2843         }
2844
2845         if (kmflags & KM_PAGEABLE) {
2846                 /*
2847                  * This is the unwiring case.  We must first ensure that the
2848                  * range to be unwired is really wired down.  We know there
2849                  * are no holes.
2850                  */
2851                 entry = start_entry;
2852                 while (entry && entry->start < end) {
2853                         if (entry->wired_count == 0) {
2854                                 rv = KERN_INVALID_ARGUMENT;
2855                                 goto done;
2856                         }
2857                         entry = vm_map_rb_tree_RB_NEXT(entry);
2858                 }
2859
2860                 /*
2861                  * Now decrement the wiring count for each region. If a region
2862                  * becomes completely unwired, unwire its physical pages and
2863                  * mappings.
2864                  */
2865                 entry = start_entry;
2866                 while (entry && entry->start < end) {
2867                         entry->wired_count--;
2868                         if (entry->wired_count == 0)
2869                                 vm_fault_unwire(map, entry);
2870                         entry = vm_map_rb_tree_RB_NEXT(entry);
2871                 }
2872         }
2873 done:
2874         vm_map_unclip_range(map, start_entry, start, real_end,
2875                             &count, MAP_CLIP_NO_HOLES);
2876         vm_map_unlock(map);
2877 failure:
2878         if (kmflags & KM_KRESERVE)
2879                 vm_map_entry_krelease(count);
2880         else
2881                 vm_map_entry_release(count);
2882         return (rv);
2883 }
2884
2885 /*
2886  * Mark a newly allocated address range as wired but do not fault in
2887  * the pages.  The caller is expected to load the pages into the object.
2888  *
2889  * The map must be locked on entry and will remain locked on return.
2890  * No other requirements.
2891  */
2892 void
2893 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2894                        int *countp)
2895 {
2896         vm_map_entry_t scan;
2897         vm_map_entry_t entry;
2898
2899         entry = vm_map_clip_range(map, addr, addr + size,
2900                                   countp, MAP_CLIP_NO_HOLES);
2901         scan = entry;
2902         while (scan && scan->start < addr + size) {
2903                 KKASSERT(scan->wired_count == 0);
2904                 scan->wired_count = 1;
2905                 scan = vm_map_rb_tree_RB_NEXT(scan);
2906         }
2907         vm_map_unclip_range(map, entry, addr, addr + size,
2908                             countp, MAP_CLIP_NO_HOLES);
2909 }
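
/*
 * Hypothetical usage sketch for vm_map_set_wired_quick() (caller-side
 * names are illustrative, not taken from this file):
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	...insert and populate the mapping at addr/size...
 *	vm_map_set_wired_quick(map, addr, size, &count);
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */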
2910
2911 /*
2912  * Push any dirty cached pages in the address range to their pager.
2913  * If syncio is TRUE, dirty pages are written synchronously.
2914  * If invalidate is TRUE, any cached pages are freed as well.
2915  *
2916  * This routine is called by sys_msync()
2917  *
2918  * Returns an error if any part of the specified range is not mapped.
2919  *
2920  * No requirements.
2921  */
2922 int
2923 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2924              boolean_t syncio, boolean_t invalidate)
2925 {
2926         vm_map_entry_t current;
2927         vm_map_entry_t next;
2928         vm_map_entry_t entry;
2929         vm_map_backing_t ba;
2930         vm_size_t size;
2931         vm_object_t object;
2932         vm_ooffset_t offset;
2933
2934         vm_map_lock_read(map);
2935         VM_MAP_RANGE_CHECK(map, start, end);
2936         if (!vm_map_lookup_entry(map, start, &entry)) {
2937                 vm_map_unlock_read(map);
2938                 return (KERN_INVALID_ADDRESS);
2939         }
2940         lwkt_gettoken(&map->token);
2941
2942         /*
2943          * Make a first pass to check for holes.
2944          */
2945         current = entry;
2946         while (current && current->start < end) {
2947                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2948                         lwkt_reltoken(&map->token);
2949                         vm_map_unlock_read(map);
2950                         return (KERN_INVALID_ARGUMENT);
2951                 }
2952                 next = vm_map_rb_tree_RB_NEXT(current);
2953                 if (end > current->end &&
2954                     (next == NULL ||
2955                      current->end != next->start)) {
2956                         lwkt_reltoken(&map->token);
2957                         vm_map_unlock_read(map);
2958                         return (KERN_INVALID_ADDRESS);
2959                 }
2960                 current = next;
2961         }
2962
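        /*
         * When invalidating, remove the range from the pmap up front so
         * that no stale hardware translations survive the clean/remove
         * pass below.
         */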
2963         if (invalidate)
2964                 pmap_remove(vm_map_pmap(map), start, end);
2965
2966         /*
2967          * Make a second pass, cleaning/uncaching pages from the indicated
2968          * objects as we go.
2969          */
2970         current = entry;
2971         while (current && current->start < end) {
2972                 offset = current->ba.offset + (start - current->start);
2973                 size = (end <= current->end ? end : current->end) - start;
2974
2975                 switch(current->maptype) {
2976                 case VM_MAPTYPE_SUBMAP:
2977                 {
2978                         vm_map_t smap;
2979                         vm_map_entry_t tentry;
2980                         vm_size_t tsize;
2981
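                        /*
                         * Translate the offset into the submap backing this
                         * entry and clip the size to the submap entry (if
                         * any) found at that offset; the code below then
                         * operates on that entry's backing store.
                         */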
2982                         smap = current->ba.sub_map;
2983                         vm_map_lock_read(smap);
2984                         vm_map_lookup_entry(smap, offset, &tentry);
2985                         if (tentry == NULL) {
2986                                 tsize = vm_map_max(smap) - offset;
2987                                 ba = NULL;
2988                                 offset = 0 + (offset - vm_map_min(smap));
2989                         } else {
2990                                 tsize = tentry->end - offset;
2991                                 ba = &tentry->ba;
2992                                 offset = tentry->ba.offset +
2993                                          (offset - tentry->start);
2994                         }
2995                         vm_map_unlock_read(smap);
2996                         if (tsize < size)
2997                                 size = tsize;
2998                         break;
2999                 }
3000                 case VM_MAPTYPE_NORMAL:
3001                 case VM_MAPTYPE_VPAGETABLE:
3002                         ba = &current->ba;
3003                         break;
3004                 default:
3005                         ba = NULL;
3006                         break;
3007                 }
3008                 if (ba) {
3009                         object = ba->object;
3010                         if (object)
3011                                 vm_object_hold(object);
3012                 } else {
3013                         object = NULL;
3014                 }
3015
3016                 /*
3017                  * Note that there is absolutely no sense in writing out
3018                  * anonymous objects, so we track down the vnode object
3019                  * to write out.
3020                  * We invalidate (remove) all pages from the address space
3021                  * anyway, for semantic correctness.
3022                  *
3023                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
3024                  * may start out with a NULL object.
3025                  *
3026                  * XXX do we really want to stop at the first backing store
3027                  * here if there are more? XXX
3028                  */
3029                 if (ba) {
3030                         vm_object_t tobj;
3031
3032                         tobj = object;
3033                         while (ba->backing_ba != NULL) {
3034                                 ba = ba->backing_ba;
3035                                 offset += ba->offset;
3036                                 tobj = ba->object;
3037                                 if (tobj->size < OFF_TO_IDX(offset + size))
3038                                         size = IDX_TO_OFF(tobj->size) - offset;
3039                                 break; /* XXX this break is not correct */
3040                         }
3041                         if (object != tobj) {
3042                                 if (object)
3043                                         vm_object_drop(object);
3044                                 object = tobj;
3045                                 vm_object_hold(object);
3046                         }
3047                 }
3048
3049                 if (object && (object->type == OBJT_VNODE) && 
3050                     (current->protection & VM_PROT_WRITE) &&
3051                     (object->flags & OBJ_NOMSYNC) == 0) {
3052                         /*
3053                          * Flush pages if writing is allowed, invalidate them
3054                          * if invalidation is requested.  Pages undergoing I/O
3055                          * will be ignored by vm_object_page_remove().
3056                          *
3057                          * We cannot lock the vnode and then wait for paging
3058                          * to complete without deadlocking against vm_fault.
3059                          * Instead we simply call vm_object_page_remove() and
3060                          * allow it to block internally on a page-by-page 
3061                          * basis when it encounters pages undergoing async 
3062                          * I/O.
3063                          */
3064                         int flags;
3065
3066                         /* no chain wait needed for vnode objects */
3067                         vm_object_reference_locked(object);
3068                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
3069                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
3070                         flags |= invalidate ? OBJPC_INVAL : 0;
3071
3072                         /*
3073                          * When operating on a virtual page table just
3074                          * flush the whole object.  XXX we probably ought
3075                          * to 
3076                          */
3077                         switch(current->maptype) {
3078                         case VM_MAPTYPE_NORMAL:
3079                                 vm_object_page_clean(object,
3080                                     OFF_TO_IDX(offset),
3081                                     OFF_TO_IDX(offset + size + PAGE_MASK),
3082                                     flags);
3083                                 break;
3084                         case VM_MAPTYPE_VPAGETABLE:
3085                                 vm_object_page_clean(object, 0, 0, flags);
3086                                 break;
3087                         }
3088                         vn_unlock(((struct vnode *)object->handle));
3089                         vm_object_deallocate_locked(object);
3090                 }
3091                 if (object && invalidate &&
3092                    ((object->type == OBJT_VNODE) ||
3093                     (object->type == OBJT_DEVICE) ||
3094                     (object->type == OBJT_MGTDEVICE))) {
3095                         int clean_only = 
3096                                 ((object->type == OBJT_DEVICE) ||
3097                                 (object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
3098                         /* no chain wait needed for vnode/device objects */
3099                         vm_object_reference_locked(object);
3100                         switch(current->maptype) {
3101                         case VM_MAPTYPE_NORMAL:
3102                                 vm_object_page_remove(object,
3103                                     OFF_TO_IDX(offset),
3104                                     OFF_TO_IDX(offset + size + PAGE_MASK),
3105                                     clean_only);
3106                                 break;
3107                         case VM_MAPTYPE_VPAGETABLE:
3108                                 vm_object_page_remove(object, 0, 0, clean_only);
3109                                 break;
3110                         }
3111                         vm_object_deallocate_locked(object);
3112                 }
3113                 start += size;
3114                 if (object)
3115                         vm_object_drop(object);
3116                 current = vm_map_rb_tree_RB_NEXT(current);
3117         }
3118
3119         lwkt_reltoken(&map->token);
3120         vm_map_unlock_read(map);
3121
3122         return (KERN_SUCCESS);
3123 }
3124
3125 /*
3126  * Make the region specified by this entry pageable.
3127  *
3128  * The vm_map must be exclusively locked.
3129  */
3130 static void 
3131 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3132 {
3133         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3134         entry->wired_count = 0;
3135         vm_fault_unwire(map, entry);
3136 }
3137
3138 /*
3139  * Deallocate the given entry from the target map.
3140  *
3141  * The vm_map must be exclusively locked.
3142  */
3143 static void
3144 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
3145 {
3146         vm_map_entry_unlink(map, entry);
3147         map->size -= entry->end - entry->start;
3148         vm_map_entry_dispose(map, entry, countp);
3149 }
3150
3151 /*
3152  * Deallocates the given address range from the target map.
3153  *
3154  * The vm_map must be exclusively locked.
3155  */
3156 int
3157 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
3158 {
3159         vm_object_t object;
3160         vm_map_entry_t entry;
3161         vm_map_entry_t first_entry;
3162         vm_offset_t hole_start;
3163
3164         ASSERT_VM_MAP_LOCKED(map);
3165         lwkt_gettoken(&map->token);
3166 again:
3167         /*
3168          * Find the start of the region, and clip it.  Set entry to point
3169          * at the first record containing the requested address or, if no
3170          * such record exists, the next record with a greater address.  The
3171          * loop will run from this point until a record beyond the termination
3172          * address is encountered.
3173          *
3174          * Adjust freehint[] for either the clip case or the extension case.
3175          *
3176          * GGG see other GGG comment.
3177          */
3178         if (vm_map_lookup_entry(map, start, &first_entry)) {
3179                 entry = first_entry;
3180                 vm_map_clip_start(map, entry, start, countp);
3181                 hole_start = start;
3182         } else {
3183                 if (first_entry) {
3184                         entry = vm_map_rb_tree_RB_NEXT(first_entry);
3185                         if (entry == NULL)
3186                                 hole_start = first_entry->start;
3187                         else
3188                                 hole_start = first_entry->end;
3189                 } else {
3190                         entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
3191                         if (entry == NULL)
3192                                 hole_start = vm_map_min(map);
3193                         else
3194                                 hole_start = vm_map_max(map);
3195                 }
3196         }
3197
3198         /*
3199          * Step through all entries in this region
3200          */
3201         while (entry && entry->start < end) {
3202                 vm_map_entry_t next;
3203                 vm_offset_t s, e;
3204                 vm_pindex_t offidxstart, offidxend, count;
3205
3206                 /*
3207                  * If we hit an in-transition entry we have to sleep and
3208                  * retry.  It's easier (and not really slower) to just retry
3209                  * since this case occurs so rarely and the hint is already
3210                  * pointing at the right place.  We have to reset the
3211                  * start offset so as not to accidentally delete an entry
3212                  * another process just created in vacated space.
3213                  */
3214                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3215                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3216                         start = entry->start;
3217                         ++mycpu->gd_cnt.v_intrans_coll;
3218                         ++mycpu->gd_cnt.v_intrans_wait;
3219                         vm_map_transition_wait(map, 1);
3220                         goto again;
3221                 }
3222                 vm_map_clip_end(map, entry, end, countp);
3223
3224                 s = entry->start;
3225                 e = entry->end;
3226                 next = vm_map_rb_tree_RB_NEXT(entry);
3227
3228                 offidxstart = OFF_TO_IDX(entry->ba.offset);
3229                 count = OFF_TO_IDX(e - s);
3230
3231                 switch(entry->maptype) {
3232                 case VM_MAPTYPE_NORMAL:
3233                 case VM_MAPTYPE_VPAGETABLE:
3234                 case VM_MAPTYPE_SUBMAP:
3235                         object = entry->ba.object;
3236                         break;
3237                 default:
3238                         object = NULL;
3239                         break;
3240                 }
3241
3242                 /*
3243                  * Unwire before removing addresses from the pmap; otherwise,
3244                  * unwiring will put the entries back in the pmap.
3245                  *
3246                  * Generally speaking, doing a bulk pmap_remove() before
3247                  * removing the pages from the VM object is better at
3248                  * reducing unnecessary IPIs.  The pmap code is now optimized
3249                  * to not blindly iterate the range when pt and pd pages
3250                  * are missing.
3251                  */
3252                 if (entry->wired_count != 0)
3253                         vm_map_entry_unwire(map, entry);
3254
3255                 offidxend = offidxstart + count;
3256
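                /*
                 * The kernel_object is a special case: remove the pmap
                 * entries and then remove the pages in the cleared index
                 * range directly from the object.
                 */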
3257                 if (object == &kernel_object) {
3258                         pmap_remove(map->pmap, s, e);
3259                         vm_object_hold(object);
3260                         vm_object_page_remove(object, offidxstart,
3261                                               offidxend, FALSE);
3262                         vm_object_drop(object);
3263                 } else if (object && object->type != OBJT_DEFAULT &&
3264                            object->type != OBJT_SWAP) {
3265                         /*
3266                          * vnode object routines cannot be chain-locked,
3267                          * but since we aren't removing pages from the
3268                          * object here we can use a shared hold.
3269                          */
3270                         vm_object_hold_shared(object);
3271                         pmap_remove(map->pmap, s, e);
3272                         vm_object_drop(object);
3273                 } else if (object) {
3274                         vm_object_hold(object);
3275                         pmap_remove(map->pmap, s, e);
3276
3277                         if (object != NULL &&
3278                             object->ref_count != 1 &&
3279                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3280                              OBJ_ONEMAPPING &&
3281                             (object->type == OBJT_DEFAULT ||
3282                              object->type == OBJT_SWAP)) {
3283                                 /*
3284                                  * When ONEMAPPING is set we can destroy the
3285                                  * pages underlying the entry's range.
3286                                  */
3287                                 /*vm_object_collapse(object, NULL);*/
3288                                 vm_object_page_remove(object, offidxstart,
3289                                                       offidxend, FALSE);
3290                                 if (object->type == OBJT_SWAP) {
3291                                         swap_pager_freespace(object,
3292                                                              offidxstart,
3293                                                              count);
3294                                 }
3295                                 if (offidxend >= object->size &&
3296                                     offidxstart < object->size) {
3297                                         object->size = offidxstart;
3298                                 }
3299                         }
3300                         vm_object_drop(object);
3301                 } else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3302                         pmap_remove(map->pmap, s, e);
3303                 }
3304
3305                 /*
3306                  * Delete the entry (which may delete the object) only after
3307                  * removing all pmap entries pointing to its pages.
3308                  * (Otherwise, its page frames may be reallocated, and any
3309                  * modify bits will be set in the wrong object!)
3310                  */
3311                 vm_map_entry_delete(map, entry, countp);
3312                 entry = next;
3313         }
3314
3315         /*
3316          * If we scanned off the end of the map the hole extends to
3317          * vm_map_max(), otherwise it extends only as far as the start
3318          * of the next entry.
3319          */
3320         if (entry == NULL) {
3321                 vm_map_freehint_hole(map, hole_start,
3322                                      vm_map_max(map) - hole_start);
3323         } else {
3324                 vm_map_freehint_hole(map, hole_start,
3325                                      entry->start - hole_start);
3326         }
3327
3328         lwkt_reltoken(&map->token);
3329
3330         return (KERN_SUCCESS);
3331 }
3332
3333 /*
3334  * Remove the given address range from the target map.
3335  * This is the exported form of vm_map_delete.
3336  *
3337  * No requirements.
3338  */
3339 int
3340 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3341 {
3342         int result;
3343         int count;
3344
3345         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3346         vm_map_lock(map);
3347         VM_MAP_RANGE_CHECK(map, start, end);
3348         result = vm_map_delete(map, start, end, &count);
3349         vm_map_unlock(map);
3350         vm_map_entry_release(count);
3351
3352         return (result);
3353 }
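
/*
 * Hypothetical usage sketch for vm_map_remove(): releasing a previously
 * established mapping of 'size' bytes at 'addr' (names and page rounding
 * are shown only for illustration):
 *
 *	rv = vm_map_remove(map, trunc_page(addr),
 *			   round_page(addr + size));
 *	if (rv != KERN_SUCCESS)
 *		...handle the error...
 */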
3354
3355 /*
3356  * Assert that the target map allows the specified privilege on the
3357  * entire address region given.  The entire region must be allocated.
3358  *
3359  * The caller must specify whether the vm_map is already locked or not.
3360  */
3361 boolean_t
3362 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3363                         vm_prot_t protection, boolean_t have_lock)
3364 {
3365         vm_map_entry_t entry;
3366         vm_map_entry_t tmp_entry;
3367         boolean_t result;
3368
3369         if (have_lock == FALSE)
3370                 vm_map_lock_read(map);
3371
3372         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3373                 if (have_lock == FALSE)
3374                         vm_map_unlock_read(map);
3375                 return (FALSE);
3376         }
3377         entry = tmp_entry;
3378
3379         result = TRUE;
3380         while (start < end) {
3381                 if (entry == NULL) {
3382                         result = FALSE;
3383                         break;
3384                 }
3385
3386                 /*
3387                  * No holes allowed!
3388                  */
3389
3390                 if (start < entry->start) {
3391                         result = FALSE;
3392                         break;
3393                 }
3394                 /*
3395                  * Check protection associated with entry.
3396                  */
3397
3398                 if ((entry->protection & protection) != protection) {
3399                         result = FALSE;
3400                         break;
3401                 }
3402                 /* go to next entry */
3403                 start = entry->end;
3404                 entry = vm_map_rb_tree_RB_NEXT(entry);
3405         }
3406         if (have_lock == FALSE)
3407                 vm_map_unlock_read(map);
3408         return (result);
3409 }
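
/*
 * Hypothetical usage sketch for vm_map_check_protection(): verify that a
 * range is readable before acting on it, letting the routine take the
 * map's read lock itself (error handling is illustrative):
 *
 *	if (!vm_map_check_protection(map, start, end, VM_PROT_READ, FALSE))
 *		return (EFAULT);
 */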
3410
3411 /*
3412  * Handles the dirty work of making src_entry and dst_entry copy-on-write
3413  * after src_entry has been cloned to dst_entry.
3414  *
3415  * The vm_maps must be exclusively locked.
3416  * The vm_map's token must be held.
3417  *
3418  * Because the maps are locked no faults can be in progress during the
3419  * operation.
3420  */
3421 static void
3422 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3423                   vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3424 {
3425         vm_object_t src_object;
3426
3427         /*
3428          * Nothing to do for special map types
3429          */
3430         if (dst_entry->maptype == VM_MAPTYPE_SUBMAP ||
3431             dst_entry->maptype == VM_MAPTYPE_UKSMAP) {
3432                 return;
3433         }
3434         if (src_entry->maptype == VM_MAPTYPE_SUBMAP ||
3435             src_entry->maptype == VM_MAPTYPE_UKSMAP) {
3436                 return;
3437         }
3438
3439         if (src_entry->wired_count) {
3440                 /*
3441                  * Of course, wired down pages can't be set copy-on-write.
3442                  * Cause wired pages to be copied into the new map by
3443                  * simulating faults (the new pages are pageable)
3444                  *
3445                  * Scrap ba.object (its ref-count has not yet been adjusted
3446                  * so we can just NULL out the field).  Remove the backing
3447                  * store.
3448                  *
3449                  * Then call vm_fault_copy_entry() to create a new object
3450                  * in dst_entry and copy the wired pages from src to dst.
3451                  */
3452                 dst_entry->ba.object = NULL;
3453                 vm_map_entry_dispose_ba(dst_entry->ba.backing_ba);
3454                 dst_entry->ba.backing_ba = NULL;
3455                 dst_entry->ba.backing_count = 0;
3456                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3457         } else {
3458                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3459                         /*
3460                          * If the source entry is not already marked NEEDS_COPY
3461                          * we need to write-protect the PTEs.
3462                          */
3463                         pmap_protect(src_map->pmap,
3464                                      src_entry->start,
3465                                      src_entry->end,
3466                                      src_entry->protection & ~VM_PROT_WRITE);
3467                 }
3468
3469                 /*
3470                  * dst_entry->ba.object might be stale.  Update it (its
3471                  * ref-count has not yet been updated so just overwrite
3472                  * the field).
3473                  *
3474                  * If there is no object then we are golden.  Also, in
3475                  * this situation if there are no backing_ba linkages then
3476                  * we can set ba.offset to 0 for debugging convenience.
3477                  *
3478                  * ba.offset cannot otherwise be modified because it affects
3479                  * the offsets for the entire backing_ba chain.
3480                  */
3481                 src_object = src_entry->ba.object;
3482
3483                 if (src_object) {
3484                         vm_object_hold(src_object);     /* for ref & flag clr */
3485                         vm_object_reference_locked(src_object);
3486                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3487
3488                         src_entry->eflags |= (MAP_ENTRY_COW |
3489                                               MAP_ENTRY_NEEDS_COPY);
3490                         dst_entry->eflags |= (MAP_ENTRY_COW |
3491                                               MAP_ENTRY_NEEDS_COPY);
3492                         KKASSERT(dst_entry->ba.offset == src_entry->ba.offset);
3493                         vm_object_drop(src_object);
3494                 } else {
3495                         if (dst_entry->ba.backing_ba == NULL)
3496                                 dst_entry->ba.offset = 0;
3497                 }
3498
3499                 /*
3500                  * Normal case: allow the backing_ba link depth to
3501                  * increase.
3502                  */
3503                 pmap_copy(dst_map->pmap, src_map->pmap,
3504                           dst_entry->start,
3505                           dst_entry->end - dst_entry->start,
3506                           src_entry->start);
3507         }
3508 }
3509
3510 /*
3511  * vmspace_fork:
3512  * Create a new process vmspace structure and vm_map
3513  * based on those of an existing process.  The new map
3514  * is based on the old map, according to the inheritance
3515  * values on the regions in that map.
3516  *
3517  * The source map must not be locked.
3518  * No requirements.
3519  */
3520 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3521                           vm_map_entry_t old_entry, int *countp);
3522 static void vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3523                           vm_map_entry_t old_entry, int *countp);
3524
3525 struct vmspace *
3526 vmspace_fork(struct vmspace *vm1)
3527 {
3528         struct vmspace *vm2;
3529         vm_map_t old_map = &vm1->vm_map;
3530         vm_map_t new_map;
3531         vm_map_entry_t old_entry;
3532         int count;
3533
3534         lwkt_gettoken(&vm1->vm_map.token);
3535         vm_map_lock(old_map);
3536
3537         vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map));
3538         lwkt_gettoken(&vm2->vm_map.token);
3539
3540         /*
3541          * We must bump the timestamp to force any concurrent fault
3542          * to retry.
3543          */
3544         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3545               (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3546         new_map = &vm2->vm_map; /* XXX */
3547         new_map->timestamp = 1;
3548
3549         vm_map_lock(new_map);
3550
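        /*
         * Pre-reserve enough map entries to clone every entry in the old
         * map, plus MAP_RESERVE_COUNT of slop, so the vm_map_entry_create()
         * calls made while both maps are locked draw from this reserve.
         */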
3551         count = old_map->nentries;
3552         count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3553
3554         RB_FOREACH(old_entry, vm_map_rb_tree, &old_map->rb_root) {
3555                 switch(old_entry->maptype) {
3556                 case VM_MAPTYPE_SUBMAP:
3557                         panic("vm_map_fork: encountered a submap");
3558                         break;
3559                 case VM_MAPTYPE_UKSMAP:
3560                         vmspace_fork_uksmap_entry(old_map, new_map,
3561                                                   old_entry, &count);
3562                         break;
3563                 case VM_MAPTYPE_NORMAL:
3564                 case VM_MAPTYPE_VPAGETABLE:
3565                         vmspace_fork_normal_entry(old_map, new_map,
3566                                                   old_entry, &count);
3567                         break;
3568                 }
3569         }
3570
3571         new_map->size = old_map->size;
3572         vm_map_unlock(new_map);
3573         vm_map_unlock(old_map);
3574         vm_map_entry_release(count);
3575
3576         lwkt_reltoken(&vm2->vm_map.token);
3577         lwkt_reltoken(&vm1->vm_map.token);
3578
3579         return (vm2);
3580 }
3581
3582 static
3583 void
3584 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3585                           vm_map_entry_t old_entry, int *countp)
3586 {
3587         vm_map_entry_t new_entry;
3588         vm_object_t object;
3589
3590         switch (old_entry->inheritance) {
3591         case VM_INHERIT_NONE:
3592                 break;
3593         case VM_INHERIT_SHARE:
3594                 /*
3595                  * Clone the entry as a shared entry.  This will look like
3596                  * shared memory across the old and the new process.  We must
3597                  * ensure that the object is allocated.
3598                  */
3599                 if (old_entry->ba.object == NULL)
3600                         vm_map_entry_allocate_object(old_entry);
3601
3602                 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3603                         /*
3604                          * Create the fronting vm_map_backing for
3605                          * an entry which needs a copy, plus an extra
3606                          * ref because we are going to duplicate it
3607                          * in the fork.
3608                          *
3609                          * The call to vm_map_entry_shadow() will also clear
3610                          * OBJ_ONEMAPPING.
3611                          *
3612                          * XXX no more collapse.  Still need extra ref
3613                          * for the fork.
3614                          */
3615                         vm_map_entry_shadow(old_entry, 1);
3616                 } else if (old_entry->ba.object) {
3617                         /*
3618                          * We will make a shared copy of the object,
3619                          * and must clear OBJ_ONEMAPPING.
3620                          *
3621                          * Optimize vnode objects.  OBJ_ONEMAPPING
3622                          * is not applicable but clear it anyway;
3623                          * the object is terminal so we don't have
3624                          * to deal with chains.  Reduces SMP conflicts.
3625                          *
3626                          * XXX assert that old_entry->ba.object != NULL
3627                          *     since we allocate it above.
3628                          */
3629                         object = old_entry->ba.object;
3630                         if (object->type == OBJT_VNODE) {
3631                                 vm_object_reference_quick(object);
3632                                 vm_object_clear_flag(object,
3633                                                      OBJ_ONEMAPPING);
3634                         } else {
3635                                 vm_object_hold(object);
3636                                 vm_object_reference_locked(object);
3637                                 vm_object_clear_flag(object, OBJ_ONEMAPPING);
3638                                 vm_object_drop(object);
3639                         }
3640                 }
3641
3642                 /*
3643                  * Clone the entry.  We've already bumped the ref on
3644                  * the vm_object for our new entry.
3645                  */
3646                 new_entry = vm_map_entry_create(new_map, countp);
3647                 *new_entry = *old_entry;
3648
3649                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3650                 new_entry->wired_count = 0;
3651                 if (new_entry->ba.backing_ba)
3652                         atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
3653
3654                 /*
3655                  * Insert the entry into the new map -- we know we're
3656                  * inserting at the end of the new map.
3657                  */
3658                 vm_map_entry_link(new_map, new_entry);
3659
3660                 /*
3661                  * Update the physical map
3662                  */
3663                 pmap_copy(new_map->pmap, old_map->pmap,
3664                           new_entry->start,
3665                           (old_entry->end - old_entry->start),
3666                           old_entry->start);
3667                 break;
3668         case VM_INHERIT_COPY:
3669                 /*
3670                  * Clone the entry and link the copy into the new map.
3671                  *
3672                  * Note that ref-counting adjustment for old_entry->ba.object
3673                  * (if it isn't a special map that is) is handled by
3674                  * vm_map_copy_entry().
3675                  */
3676                 new_entry = vm_map_entry_create(new_map, countp);
3677                 *new_entry = *old_entry;
3678
3679                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3680                 new_entry->wired_count = 0;
3681                 if (new_entry->ba.backing_ba)
3682                         atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
3683
3684                 vm_map_entry_link(new_map, new_entry);
3685
3686                 /*
3687                  * This does the actual dirty work of making both entries
3688                  * copy-on-write, and will also handle the fronting object.
3689                  */
3690                 vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
3691                 break;
3692         }
3693 }
3694
3695 /*
3696  * When forking user-kernel shared maps, the map might change in the
3697  * child so do not try to copy the underlying pmap entries.
3698  */
3699 static
3700 void
3701 vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3702                           vm_map_entry_t old_entry, int *countp)
3703 {
3704         vm_map_entry_t new_entry;
3705
3706         new_entry = vm_map_entry_create(new_map, countp);
3707         *new_entry = *old_entry;
3708
3709         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3710         new_entry->wired_count = 0;
3711         if (new_entry->ba.backing_ba)
3712                 atomic_add_long(&new_entry->ba.backing_ba->refs, 1);
3713
3714         vm_map_entry_link(new_map, new_entry);
3715 }
3716
3717 /*
3718  * Create an auto-grow stack entry
3719  *
3720  * No requirements.
3721  */
3722 int
3723 vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize,
3724               int flags, vm_prot_t prot, vm_prot_t max, int cow)
3725 {
3726         vm_map_entry_t  prev_entry;
3727         vm_map_entry_t  next;
3728         vm_size_t       init_ssize;
3729         int             rv;
3730         int             count;
3731         vm_offset_t     tmpaddr;
3732
3733         cow |= MAP_IS_STACK;
3734
3735         if (max_ssize < sgrowsiz)
3736                 init_ssize = max_ssize;
3737         else
3738                 init_ssize = sgrowsiz;
3739
3740         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3741         vm_map_lock(map);
3742
3743         /*
3744          * Find space for the mapping
3745          */
3746         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3747                 if (vm_map_findspace(map, *addrbos, max_ssize, 1,
3748                                      flags, &tmpaddr)) {
3749                         vm_map_unlock(map);
3750                         vm_map_entry_release(count);
3751                         return (KERN_NO_SPACE);
3752                 }
3753                 *addrbos = tmpaddr;
3754         }
3755
3756         /* If addr is already mapped, no go */
3757         if (vm_map_lookup_entry(map, *addrbos, &prev_entry)) {
3758                 vm_map_unlock(map);
3759                 vm_map_entry_release(count);
3760                 return (KERN_NO_SPACE);
3761         }
3762
3763 #if 0
3764         /* XXX already handled by kern_mmap() */
3765         /* If we would blow our VMEM resource limit, no go */
3766         if (map->size + init_ssize >
3767             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3768                 vm_map_unlock(map);
3769                 vm_map_entry_release(count);
3770                 return (KERN_NO_SPACE);
3771         }
3772 #endif
3773
3774         /*
3775          * If we can't accommodate max_ssize in the current mapping,
3776          * no go.  However, we need to be aware that subsequent user
3777          * mappings might map into the space we have reserved for
3778          * stack, and currently this space is not protected.  
3779          * 
3780          * Hopefully we will at least detect this condition 
3781          * when we try to grow the stack.
3782          */
3783         if (prev_entry)
3784                 next = vm_map_rb_tree_RB_NEXT(prev_entry);
3785         else
3786                 next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3787
3788         if (next && next->start < *addrbos + max_ssize) {
3789                 vm_map_unlock(map);
3790                 vm_map_entry_release(count);
3791                 return (KERN_NO_SPACE);
3792         }
3793
3794         /*
3795          * We initially map a stack of only init_ssize.  We will
3796          * grow as needed later.  Since this is to be a grow-down
3797          * stack, we map at the top of the range.
3798          *
3799          * Note: we would normally expect prot and max to be
3800          * VM_PROT_ALL, and cow to be 0.  Possibly we should
3801          * eliminate these as input parameters, and just
3802          * pass these values here in the insert call.
3803          */
3804         rv = vm_map_insert(map, &count, NULL, NULL,
3805                            0, *addrbos + max_ssize - init_ssize,
3806                            *addrbos + max_ssize,
3807                            VM_MAPTYPE_NORMAL,
3808                            VM_SUBSYS_STACK, prot, max, cow);
3809
3810         /* Now set the avail_ssize amount */
3811         if (rv == KERN_SUCCESS) {
3812                 if (prev_entry)
3813                         next = vm_map_rb_tree_RB_NEXT(prev_entry);
3814                 else
3815                         next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3816                 if (prev_entry != NULL) {
3817                         vm_map_clip_end(map,
3818                                         prev_entry,
3819                                         *addrbos + max_ssize - init_ssize,
3820                                         &count);
3821                 }
3822                 if (next->end   != *addrbos + max_ssize ||
3823                     next->start != *addrbos + max_ssize - init_ssize){
3824                         panic ("Bad entry start/end for new stack entry");
3825                 } else {
3826                         next->aux.avail_ssize = max_ssize - init_ssize;
3827                 }
3828         }
3829
3830         vm_map_unlock(map);
3831         vm_map_entry_release(count);
3832         return (rv);
3833 }
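
/*
 * Hypothetical usage sketch for vm_map_stack(): create a grow-down user
 * stack of at most 'maxsize' bytes, letting the map choose the address
 * from the hint in 'addr' ('vm', 'addr' and 'maxsize' are illustrative
 * caller-side names):
 *
 *	rv = vm_map_stack(&vm->vm_map, &addr, maxsize, 0,
 *			  VM_PROT_ALL, VM_PROT_ALL, 0);
 *	if (rv != KERN_SUCCESS)
 *		...handle failure...
 */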
3834
3835 /*
3836  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3837  * desired address is already mapped, or if we successfully grow
3838  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3839  * stack range (this is strange, but preserves compatibility with
3840  * the grow function in vm_machdep.c).
3841  *
3842  * No requirements.
3843  */
3844 int
3845 vm_map_growstack (vm_map_t map, vm_offset_t addr)
3846 {
3847         vm_map_entry_t prev_entry;
3848         vm_map_entry_t stack_entry;
3849         vm_map_entry_t next;
3850         struct vmspace *vm;
3851         struct lwp *lp;
3852         struct proc *p;
3853         vm_offset_t    end;
3854         int grow_amount;
3855         int rv = KERN_SUCCESS;
3856         int is_procstack;
3857         int use_read_lock = 1;
3858         int count;
3859
3860         /*
3861          * Find the vm
3862          */
3863         lp = curthread->td_lwp;
3864         p = curthread->td_proc;
3865         KKASSERT(lp != NULL);
3866         vm = lp->lwp_vmspace;
3867
3868         /*
3869          * Growstack is only allowed on the current process.  We disallow
3870          * other use cases, e.g. trying to access memory via procfs that
3871          * the stack hasn't grown into.
3872          */
3873         if (map != &vm->vm_map) {
3874                 return KERN_FAILURE;
3875         }
3876
3877         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3878 Retry:
3879         if (use_read_lock)
3880                 vm_map_lock_read(map);
3881         else
3882                 vm_map_lock(map);
3883
3884         /*
3885          * If addr is already in the entry range, no need to grow.
3886          * prev_entry returns NULL if addr is at the head.
3887          */
3888         if (vm_map_lookup_entry(map, addr, &prev_entry))
3889                 goto done;
3890         if (prev_entry)
3891                 stack_entry = vm_map_rb_tree_RB_NEXT(prev_entry);
3892         else
3893                 stack_entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
3894
3895         if (stack_entry == NULL)
3896                 goto done;
3897         if (prev_entry == NULL)
3898                 end = stack_entry->start - stack_entry->aux.avail_ssize;
3899         else
3900                 end = prev_entry->end;
3901
3902         /*
3903          * This next test mimics the old grow function in vm_machdep.c.
3904          * It really doesn't quite make sense, but we do it anyway
3905          * for compatibility.
3906          *
3907          * If the stack is not growable, return success.  This signals
3908          * the caller to proceed as it normally would with normal vm.
3909          */
3910         if (stack_entry->aux.avail_ssize < 1 ||
3911             addr >= stack_entry->start ||
3912             addr <  stack_entry->start - stack_entry->aux.avail_ssize) {
3913                 goto done;
3914         } 
3915         
3916         /* Find the minimum grow amount */
3917         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
3918         if (grow_amount > stack_entry->aux.avail_ssize) {
3919                 rv = KERN_NO_SPACE;
3920                 goto done;
3921         }
3922
3923         /*
3924          * If there is no longer enough space between the entries,
3925          * no go; adjust the available space.  Note: this
3926          * should only happen if the user has mapped into the
3927          * stack area after the stack was created, and is
3928          * probably an error.
3929          *
3930          * This also effectively destroys any guard page the user
3931          * might have intended by limiting the stack size.
3932          */
3933         if (grow_amount > stack_entry->start - end) {
3934                 if (use_read_lock && vm_map_lock_upgrade(map)) {
3935                         /* lost lock */
3936                         use_read_lock = 0;
3937                         goto Retry;
3938                 }
3939                 use_read_lock = 0;
3940                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3941                 rv = KERN_NO_SPACE;
3942                 goto done;
3943         }
3944
3945         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
3946
3947         /* If this is the main process stack, see if we're over the 
3948          * stack limit.
3949          */
3950         if (is_procstack && (vm->vm_ssize + grow_amount >
3951                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3952                 rv = KERN_NO_SPACE;
3953                 goto done;
3954         }
3955
3956         /* Round up the grow amount to a multiple of sgrowsiz */
3957         grow_amount = roundup (grow_amount, sgrowsiz);
3958         if (grow_amount > stack_entry->aux.avail_ssize) {
3959                 grow_amount = stack_entry->aux.avail_ssize;
3960         }
3961         if (is_procstack && (vm->vm_ssize + grow_amount >
3962                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3963                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize;
3964         }
3965
3966         /* If we would blow our VMEM resource limit, no go */
3967         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3968                 rv = KERN_NO_SPACE;
3969                 goto done;
3970         }
3971
3972         if (use_read_lock && vm_map_lock_upgrade(map)) {
3973                 /* lost lock */
3974                 use_read_lock = 0;
3975                 goto Retry;
3976         }
3977         use_read_lock = 0;
3978
3979         /* Get the preliminary new entry start value */
3980         addr = stack_entry->start - grow_amount;
3981
3982         /* If this puts us into the previous entry, cut back our growth
3983          * to the available space.  Also, see the note above.
3984          */
3985         if (addr < end) {
3986                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3987                 addr = end;
3988         }
3989
3990         rv = vm_map_insert(map, &count, NULL, NULL,
3991                            0, addr, stack_entry->start,
3992                            VM_MAPTYPE_NORMAL,
3993                            VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
3994
3995         /* Adjust the available stack space by the amount we grew. */
3996         if (rv == KERN_SUCCESS) {
3997                 if (prev_entry) {
3998                         vm_map_clip_end(map, prev_entry, addr, &count);
3999                         next = vm_map_rb_tree_RB_NEXT(prev_entry);
4000                 } else {
4001                         next = RB_MIN(vm_map_rb_tree, &map->rb_root);
4002                 }
4003                 if (next->end != stack_entry->start  ||
4004                     next->start != addr) {
4005                         panic ("Bad stack grow start/end in new stack entry");
4006                 } else {
4007                         next->aux.avail_ssize =
4008                                 stack_entry->aux.avail_ssize -
4009                                 (next->end - next->start);
4010                         if (is_procstack) {
4011                                 vm->vm_ssize += next->end -
4012                                                 next->start;
4013                         }
4014                 }
4015
4016                 if (map->flags & MAP_WIREFUTURE)
4017                         vm_map_unwire(map, next->start, next->end, FALSE);
4018         }
4019
4020 done:
4021         if (use_read_lock)
4022                 vm_map_unlock_read(map);
4023         else
4024                 vm_map_unlock(map);
4025         vm_map_entry_release(count);
4026         return (rv);
4027 }
4028
4029 /*
4030  * Unshare the specified VM space for exec.  If other processes are
4031  * mapped to it, then create a new one.  The new vmspace is null.
4032  *
4033  * No requirements.
4034  */
4035 void
4036 vmspace_exec(struct proc *p, struct vmspace *vmcopy) 
4037 {
4038         struct vmspace *oldvmspace = p->p_vmspace;
4039         struct vmspace *newvmspace;
4040         vm_map_t map = &p->p_vmspace->vm_map;
4041
4042         /*
4043          * If we are execing a resident vmspace we fork it, otherwise
4044          * we create a new vmspace.  Note that exitingcnt is not
4045          * copied to the new vmspace.
4046          */
4047         lwkt_gettoken(&oldvmspace->vm_map.token);
4048         if (vmcopy)  {
4049                 newvmspace = vmspace_fork(vmcopy);
4050                 lwkt_gettoken(&newvmspace->vm_map.token);
4051         } else {
4052                 newvmspace = vmspace_alloc(vm_map_min(map), vm_map_max(map));
4053                 lwkt_gettoken(&newvmspace->vm_map.token);
4054                 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4055                       (caddr_t)&oldvmspace->vm_endcopy -
4056                        (caddr_t)&oldvmspace->vm_startcopy);
4057         }
4058
4059         /*
4060          * Finish initializing the vmspace before assigning it
4061          * to the process.  The vmspace will become the current vmspace
4062          * if p == curproc.
4063          */
4064         pmap_pinit2(vmspace_pmap(newvmspace));
4065         pmap_replacevm(p, newvmspace, 0);
4066         lwkt_reltoken(&newvmspace->vm_map.token);
4067         lwkt_reltoken(&oldvmspace->vm_map.token);
4068         vmspace_rel(oldvmspace);
4069 }
4070
4071 /*
4072  * Unshare the specified VM space for forcing COW.  This
4073  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4074  */
4075 void
4076 vmspace_unshare(struct proc *p) 
4077 {
4078         struct vmspace *oldvmspace = p->p_vmspace;
4079         struct vmspace *newvmspace;
4080
4081         lwkt_gettoken(&oldvmspace->vm_map.token);
4082         if (vmspace_getrefs(oldvmspace) == 1) {
4083                 lwkt_reltoken(&oldvmspace->vm_map.token);
4084                 return;
4085         }
4086         newvmspace = vmspace_fork(oldvmspace);
4087         lwkt_gettoken(&newvmspace->vm_map.token);
4088         pmap_pinit2(vmspace_pmap(newvmspace));
4089         pmap_replacevm(p, newvmspace, 0);
4090         lwkt_reltoken(&newvmspace->vm_map.token);
4091         lwkt_reltoken(&oldvmspace->vm_map.token);
4092         vmspace_rel(oldvmspace);
4093 }
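
#if 0
/*
 * Illustrative sketch only (not compiled): how the two unshare paths
 * above are typically driven.  The function names here are hypothetical;
 * the real callers are the exec and rfork(2) paths.
 */
static void
example_exec_unshare(struct proc *p, struct vmspace *resident)
{
        /*
         * exec: discard the old mappings.  A non-NULL 'resident' vmspace
         * is forked (resident exec); NULL yields a fresh, empty vmspace.
         */
        vmspace_exec(p, resident);
}

static void
example_rfork_unshare(struct proc *p)
{
        /*
         * rfork with (RFMEM|RFPROC) == 0: keep the mappings, but if the
         * vmspace is still shared give the process its own copy so that
         * further changes are private copy-on-write.
         */
        vmspace_unshare(p);
}
#endif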
4094
4095 /*
4096  * vm_map_hint: return the beginning of the best area suitable for
4097  * creating a new mapping with "prot" protection.
4098  *
4099  * No requirements.
4100  */
4101 vm_offset_t
4102 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4103 {
4104         struct vmspace *vms = p->p_vmspace;
4105         struct rlimit limit;
4106         rlim_t dsiz;
4107
4108         /*
4109          * Acquire the datasize limit for the mmap() operation; dsiz
4110          * bounds the heap region that unhinted mappings must avoid.
4111          */
4112         if (kern_getrlimit(RLIMIT_DATA, &limit))
4113                 limit.rlim_cur = maxdsiz;
4114         dsiz = limit.rlim_cur;
4115
4116         if (!randomize_mmap || addr != 0) {
4117                 /*
4118                  * Set a reasonable start point for the hint if it was
4119                  * not specified or if it falls within the heap space.
4120                  * Hinted mmap()s do not allocate out of the heap space.
4121                  */
4122                 if (addr == 0 ||
4123                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4124                      addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) {
4125                         addr = round_page((vm_offset_t)vms->vm_daddr + dsiz);
4126                 }
4127
4128                 return addr;
4129         }
4130
4131         /*
4132          * randomize_mmap && addr == 0.  For now randomize the
4133          * address within a dsiz range beyond the data limit.
4134          */
4135         addr = (vm_offset_t)vms->vm_daddr + dsiz;
4136         if (dsiz)
4137                 addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz;
4138         return (round_page(addr));
4139 }
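
/*
 * Worked example (illustrative, assumed values): with vm_daddr at
 * 0x0000000000c00000 and RLIMIT_DATA at 128MB (0x8000000), an unhinted,
 * non-randomized mmap() is steered to round_page(0xc00000 + 0x8000000) =
 * 0x8c00000, i.e. just past the largest possible heap, so anonymous
 * mappings cannot collide with future brk() growth.  With randomize_mmap
 * enabled and no hint, a random offset in [0, dsiz) is added on top of
 * vm_daddr + dsiz before the final round_page().
 */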
4140
4141 /*
4142  * Finds the VM object, offset, and protection for a given virtual address
4143  * in the specified map, assuming a page fault of the type specified.
4144  *
4145  * Leaves the map in question locked for read; the return values remain
4146  * valid until vm_map_lookup_done() is called.  Note that the map argument
4147  * is in/out; the (possibly updated) returned map must be used in the call
4148  * to vm_map_lookup_done().
4149  *
4150  * A handle (out_entry) is returned so that vm_map_lookup_done() is cheap.
4151  *
4152  * If a lookup is requested with "write protection" specified, the map may
4153  * be changed to perform virtual copying operations, although the data
4154  * referenced will remain the same.
4155  *
4156  * No requirements.
4157  */
4158 int
4159 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
4160               vm_offset_t vaddr,
4161               vm_prot_t fault_typea,
4162               vm_map_entry_t *out_entry,        /* OUT */
4163               struct vm_map_backing **bap,      /* OUT */
4164               vm_pindex_t *pindex,              /* OUT */
4165               vm_prot_t *out_prot,              /* OUT */
4166               int *wflags)                      /* OUT */
4167 {
4168         vm_map_entry_t entry;
4169         vm_map_t map = *var_map;
4170         vm_prot_t prot;
4171         vm_prot_t fault_type = fault_typea;
4172         int use_read_lock = 1;
4173         int rv = KERN_SUCCESS;
4174         int count;
4175         thread_t td = curthread;
4176
4177         /*
4178          * vm_map_entry_reserve() implements an important mitigation
4179          * against heavy mmap() activity running the kernel out of
4180          * vm_map_entry structures, but it can also recurse infinitely.
4181          * td_nest_count breaks that recursion by letting nested calls
4182          * dig directly into the pcpu vm_map_entry reserve.
4183          */
4184         count = 0;
4185         if (td->td_nest_count == 0) {
4186                 ++td->td_nest_count;
4187                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4188                 --td->td_nest_count;
4189         }
4190 RetryLookup:
4191         if (use_read_lock)
4192                 vm_map_lock_read(map);
4193         else
4194                 vm_map_lock(map);
4195
4196         /*
4197          * Always do a full lookup.  The hint doesn't get us much anymore
4198          * now that the map is RB'd.
4199          */
4200         cpu_ccfence();
4201         *out_entry = NULL;
4202         *bap = NULL;
4203
4204         {
4205                 vm_map_entry_t tmp_entry;
4206
4207                 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4208                         rv = KERN_INVALID_ADDRESS;
4209                         goto done;
4210                 }
4211                 entry = tmp_entry;
4212                 *out_entry = entry;
4213         }
4214
4215         /*
4216          * Handle submaps.
4217          */
4218         if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4219                 vm_map_t old_map = map;
4220
4221                 *var_map = map = entry->ba.sub_map;
4222                 if (use_read_lock)
4223                         vm_map_unlock_read(old_map);
4224                 else
4225                         vm_map_unlock(old_map);
4226                 use_read_lock = 1;
4227                 goto RetryLookup;
4228         }
4229
4230         /*
4231          * Check whether this task is allowed to have this page.
4232          * Note the special case for MAP_ENTRY_COW pages with an override.
4233          * This is to implement a forced COW for debuggers.
4234          */
4235         if (fault_type & VM_PROT_OVERRIDE_WRITE)
4236                 prot = entry->max_protection;
4237         else
4238                 prot = entry->protection;
4239
4240         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4241         if ((fault_type & prot) != fault_type) {
4242                 rv = KERN_PROTECTION_FAILURE;
4243                 goto done;
4244         }
4245
4246         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4247             (entry->eflags & MAP_ENTRY_COW) &&
4248             (fault_type & VM_PROT_WRITE) &&
4249             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4250                 rv = KERN_PROTECTION_FAILURE;
4251                 goto done;
4252         }
4253
4254         /*
4255          * If this page is not pageable, we have to get it for all possible
4256          * accesses.
4257          */
4258         *wflags = 0;
4259         if (entry->wired_count) {
4260                 *wflags |= FW_WIRED;
4261                 prot = fault_type = entry->protection;
4262         }
4263
4264         /*
4265          * Virtual page tables may need to update the accessed (A) bit
4266          * in a page table entry.  Upgrade the fault to a write fault for
4267          * that case if the map will support it.  If the map does not support
4268          * it the page table entry simply will not be updated.
4269          */
4270         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
4271                 if (prot & VM_PROT_WRITE)
4272                         fault_type |= VM_PROT_WRITE;
4273         }
4274
4275         if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4276             pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4277                 if ((prot & VM_PROT_WRITE) == 0)
4278                         fault_type |= VM_PROT_WRITE;
4279         }
4280
4281         /*
4282          * Only NORMAL and VPAGETABLE maps are object-based.  UKSMAPs are not.
4283          */
4284         if (entry->maptype != VM_MAPTYPE_NORMAL &&
4285             entry->maptype != VM_MAPTYPE_VPAGETABLE) {
4286                 *bap = NULL;
4287                 goto skip;
4288         }
4289
4290         /*
4291          * If the entry was copy-on-write, handle the COW now or demote access.
4292          */
4293         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4294                 /*
4295                  * If we want to write the page, we may as well handle that
4296                  * now since we've got the map locked.
4297                  *
4298                  * If we don't need to write the page, we just demote the
4299                  * permissions allowed.
4300                  */
4301                 if (fault_type & VM_PROT_WRITE) {
4302                         /*
4303                          * Not allowed if TDF_NOFAULT is set as the shadowing
4304                          * operation can deadlock against the faulting
4305                          * function due to the copy-on-write.
4306                          */
4307                         if (curthread->td_flags & TDF_NOFAULT) {
4308                                 rv = KERN_FAILURE_NOFAULT;
4309                                 goto done;
4310                         }
4311
4312                         /*
4313                          * Make a new vm_map_backing + object, and place it
4314                          * in the object chain.  Note that no new references
4315                          * have appeared -- one just moved from the map to
4316                          * the new object.
4317                          */
4318                         if (use_read_lock && vm_map_lock_upgrade(map)) {
4319                                 /* lost lock */
4320                                 use_read_lock = 0;
4321                                 goto RetryLookup;
4322                         }
4323                         use_read_lock = 0;
4324                         vm_map_entry_shadow(entry, 0);
4325                         *wflags |= FW_DIDCOW;
4326                 } else {
4327                         /*
4328                          * We're attempting to read a copy-on-write page --
4329                          * don't allow writes.
4330                          */
4331                         prot &= ~VM_PROT_WRITE;
4332                 }
4333         }
4334
4335         /*
4336          * Create an object if necessary.  This code also handles
4337          * partitioning large entries to improve vm_fault performance.
4338          */
4339         if (entry->ba.object == NULL && !map->system_map) {
4340                 if (use_read_lock && vm_map_lock_upgrade(map))  {
4341                         /* lost lock */
4342                         use_read_lock = 0;
4343                         goto RetryLookup;
4344                 }
4345                 use_read_lock = 0;
4346
4347                 /*
4348                  * Partition large entries, giving each its own VM object,
4349                  * to improve concurrent fault performance.  This is only
4350                  * applicable to userspace.
4351                  */
4352                 if (map != &kernel_map &&
4353                     entry->maptype == VM_MAPTYPE_NORMAL &&
4354                     ((entry->start ^ entry->end) & ~MAP_ENTRY_PARTITION_MASK) &&
4355                     vm_map_partition_enable) {
4356                         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4357                                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4358                                 ++mycpu->gd_cnt.v_intrans_coll;
4359                                 ++mycpu->gd_cnt.v_intrans_wait;
4360                                 vm_map_transition_wait(map, 0);
4361                                 goto RetryLookup;
4362                         }
4363                         vm_map_entry_partition(map, entry, vaddr, &count);
4364                 }
4365                 vm_map_entry_allocate_object(entry);
4366         }
4367
4368         /*
4369          * Return the object/offset from this entry.  If the entry was
4370          * copy-on-write or empty, it has been fixed up.
4371          */
4372         *bap = &entry->ba;
4373
4374 skip:
4375         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->ba.offset);
4376
4377         /*
4378          * Return the protection to use for the access.  On success we
4379          * return with a read lock held on the map; on failure we return
4380          * with the map unlocked.
4381          */
4382         *out_prot = prot;
4383 done:
4384         if (rv == KERN_SUCCESS) {
4385                 if (use_read_lock == 0)
4386                         vm_map_lock_downgrade(map);
4387         } else if (use_read_lock) {
4388                 vm_map_unlock_read(map);
4389         } else {
4390                 vm_map_unlock(map);
4391         }
4392         if (count > 0)
4393                 vm_map_entry_release(count);
4394
4395         return (rv);
4396 }
4397
4398 /*
4399  * Releases locks acquired by a vm_map_lookup()
4400  * (according to the handle returned by that lookup).
4401  *
4402  * No other requirements.
4403  */
4404 void
4405 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4406 {
4407         /*
4408          * Unlock the main-level map
4409          */
4410         vm_map_unlock_read(map);
4411         if (count)
4412                 vm_map_entry_release(count);
4413 }
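
#if 0
/*
 * Illustrative sketch only (not compiled): the typical pairing of
 * vm_map_lookup() with vm_map_lookup_done(), as a fault-style caller
 * would use it.  example_fault_lookup() is a hypothetical name; the
 * real consumer of this interface is the vm_fault path.
 */
static int
example_fault_lookup(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type)
{
        vm_map_entry_t entry;
        struct vm_map_backing *ba;
        vm_pindex_t pindex;
        vm_prot_t prot;
        int wflags;
        int rv;

        /*
         * On success the map (possibly a submap, hence the pointer) is
         * returned read-locked and entry/ba/pindex/prot/wflags are valid.
         */
        rv = vm_map_lookup(&map, vaddr, fault_type,
                           &entry, &ba, &pindex, &prot, &wflags);
        if (rv != KERN_SUCCESS)
                return (rv);

        /* ... resolve the page via ba + pindex, honoring prot/wflags ... */

        /*
         * Drop the read lock (and any reserved entries, none here).  The
         * lookup results must not be used after this point.
         */
        vm_map_lookup_done(map, entry, 0);
        return (KERN_SUCCESS);
}
#endif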
4414
4415 static void
4416 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4417                        vm_offset_t vaddr, int *countp)
4418 {
4419         vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4420         vm_map_clip_start(map, entry, vaddr, countp);
4421         vaddr += MAP_ENTRY_PARTITION_SIZE;
4422         vm_map_clip_end(map, entry, vaddr, countp);
4423 }
4424
4425 /*
4426  * Quick hack, needs some help to make it more SMP friendly.
4427  */
4428 void
4429 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4430                  vm_offset_t ran_beg, vm_offset_t ran_end)
4431 {
4432         struct vm_map_ilock *scan;
4433
4434         ilock->ran_beg = ran_beg;
4435         ilock->ran_end = ran_end;
4436         ilock->flags = 0;
4437
4438         spin_lock(&map->ilock_spin);
4439 restart:
4440         for (scan = map->ilock_base; scan; scan = scan->next) {
4441                 if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4442                         scan->flags |= ILOCK_WAITING;
4443                         ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4444                         goto restart;
4445                 }
4446         }
4447         ilock->next = map->ilock_base;
4448         map->ilock_base = ilock;
4449         spin_unlock(&map->ilock_spin);
4450 }
4451
4452 void
4453 vm_map_deinterlock(vm_map_t map, struct vm_map_ilock *ilock)
4454 {
4455         struct vm_map_ilock *scan;
4456         struct vm_map_ilock **scanp;
4457
4458         spin_lock(&map->ilock_spin);
4459         scanp = &map->ilock_base;
4460         while ((scan = *scanp) != NULL) {
4461                 if (scan == ilock) {
4462                         *scanp = ilock->next;
4463                         spin_unlock(&map->ilock_spin);
4464                         if (ilock->flags & ILOCK_WAITING)
4465                                 wakeup(ilock);
4466                         return;
4467                 }
4468                 scanp = &scan->next;
4469         }
4470         spin_unlock(&map->ilock_spin);
4471         panic("vm_map_deinterlock: missing ilock!");
4472 }
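
#if 0
/*
 * Illustrative sketch only (not compiled): an ilock is held across an
 * operation on a byte range to exclude other interlockers touching an
 * overlapping range; conflicting callers sleep in vm_map_interlock()
 * until the holder calls vm_map_deinterlock().  example_range_op() is
 * a hypothetical name.
 */
static void
example_range_op(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        struct vm_map_ilock ilock;

        vm_map_interlock(map, &ilock, start, end);
        /* ... operate on [start, end) free of overlapping interlockers ... */
        vm_map_deinterlock(map, &ilock);
}
#endif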
4473
4474 #include "opt_ddb.h"
4475 #ifdef DDB
4476 #include <ddb/ddb.h>
4477
4478 /*
4479  * Debugging only
4480  */
4481 DB_SHOW_COMMAND(map, vm_map_print)
4482 {
4483         static int nlines;
4484         /* XXX convert args. */
4485         vm_map_t map = (vm_map_t)addr;
4486         boolean_t full = have_addr;
4487
4488         vm_map_entry_t entry;
4489
4490         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4491             (void *)map,
4492             (void *)map->pmap, map->nentries, map->timestamp);
4493         nlines++;
4494
4495         if (!full && db_indent)
4496                 return;
4497
4498         db_indent += 2;
4499         RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
4500                 db_iprintf("map entry %p: start=%p, end=%p\n",
4501                     (void *)entry, (void *)entry->start, (void *)entry->end);
4502                 nlines++;
4503                 {
4504                         static const char *inheritance_name[4] =
4505                         {"share", "copy", "none", "donate_copy"};
4506
4507                         db_iprintf(" prot=%x/%x/%s",
4508                             entry->protection,
4509                             entry->max_protection,
4510                             inheritance_name[(int)(unsigned char)
4511                                                 entry->inheritance]);
4512                         if (entry->wired_count != 0)
4513                                 db_printf(", wired");
4514                 }
4515                 switch(entry->maptype) {
4516                 case VM_MAPTYPE_SUBMAP:
4517                         /* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4518                         db_printf(", share=%p, offset=0x%lx\n",
4519                             (void *)entry->ba.sub_map,
4520                             (long)entry->ba.offset);
4521                         nlines++;
4522
4523                         db_indent += 2;
4524                         vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map,
4525                                      full, 0, NULL);
4526                         db_indent -= 2;
4527                         break;
4528                 case VM_MAPTYPE_NORMAL:
4529                 case VM_MAPTYPE_VPAGETABLE:
4530                         /* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4531                         db_printf(", object=%p, offset=0x%lx",
4532                             (void *)entry->ba.object,
4533                             (long)entry->ba.offset);
4534                         if (entry->eflags & MAP_ENTRY_COW)
4535                                 db_printf(", copy (%s)",
4536                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4537                         db_printf("\n");
4538                         nlines++;
4539
4540                         if (entry->ba.object) {
4541                                 db_indent += 2;
4542                                 vm_object_print((db_expr_t)(intptr_t)
4543                                                 entry->ba.object,
4544                                                 full, 0, NULL);
4545                                 nlines += 4;
4546                                 db_indent -= 2;
4547                         }
4548                         break;
4549                 case VM_MAPTYPE_UKSMAP:
4550                         db_printf(", uksmap=%p, offset=0x%lx",
4551                             (void *)entry->ba.uksmap,
4552                             (long)entry->ba.offset);
4553                         if (entry->eflags & MAP_ENTRY_COW)
4554                                 db_printf(", copy (%s)",
4555                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4556                         db_printf("\n");
4557                         nlines++;
4558                         break;
4559                 default:
4560                         break;
4561                 }
4562         }
4563         db_indent -= 2;
4564         if (db_indent == 0)
4565                 nlines = 0;
4566 }
4567
4568 /*
4569  * Debugging only
4570  */
4571 DB_SHOW_COMMAND(procvm, procvm)
4572 {
4573         struct proc *p;
4574
4575         if (have_addr) {
4576                 p = (struct proc *) addr;
4577         } else {
4578                 p = curproc;
4579         }
4580
4581         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4582             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4583             (void *)vmspace_pmap(p->p_vmspace));
4584
4585         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4586 }
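
/*
 * Usage note: from the ddb> prompt the commands defined above are invoked
 * as "show map <vm_map address>" and "show procvm [<proc address>]"; with
 * no address, "show procvm" reports on curproc.
 */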
4587
4588 #endif /* DDB */