kernel - Fix long-standing vm_map token panic
[dragonfly.git] / sys / vm / vm_map.c
984263bc 1/*
46754a20
MD
2 * (MPSAFE)
3 *
984263bc
MD
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * The Mach Operating System project at Carnegie-Mellon University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
39 *
40 *
41 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42 * All rights reserved.
43 *
44 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45 *
46 * Permission to use, copy, modify and distribute this software and
47 * its documentation is hereby granted, provided that both the copyright
48 * notice and this permission notice appear in all copies of the
49 * software, derivative works or modified versions, and any portions
50 * thereof, and that both notices appear in supporting documentation.
51 *
52 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55 *
56 * Carnegie Mellon requests users of this software to return to
57 *
58 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
59 * School of Computer Science
60 * Carnegie Mellon University
61 * Pittsburgh PA 15213-3890
62 *
63 * any improvements or extensions that they make and grant Carnegie the
64 * rights to redistribute these changes.
65 *
66 * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
e3161323 67 * $DragonFly: src/sys/vm/vm_map.c,v 1.56 2007/04/29 18:25:41 dillon Exp $
984263bc
MD
68 */
69
70/*
71 * Virtual memory mapping module.
72 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
e3161323 76#include <sys/kernel.h>
984263bc 77#include <sys/proc.h>
ff13bc52 78#include <sys/serialize.h>
fef0fdf2 79#include <sys/lock.h>
984263bc
MD
80#include <sys/vmmeter.h>
81#include <sys/mman.h>
82#include <sys/vnode.h>
83#include <sys/resourcevar.h>
fef0fdf2 84#include <sys/shm.h>
686dbf64 85#include <sys/tree.h>
e3161323 86#include <sys/malloc.h>
984263bc
MD
87
88#include <vm/vm.h>
89#include <vm/vm_param.h>
984263bc
MD
90#include <vm/pmap.h>
91#include <vm/vm_map.h>
92#include <vm/vm_page.h>
93#include <vm/vm_object.h>
94#include <vm/vm_pager.h>
95#include <vm/vm_kern.h>
96#include <vm/vm_extern.h>
97#include <vm/swap_pager.h>
98#include <vm/vm_zone.h>
99
a108bf71 100#include <sys/thread2.h>
e3161323 101#include <sys/sysref2.h>
911e30e2
AH
102#include <sys/random.h>
103#include <sys/sysctl.h>
a108bf71 104
984263bc 105/*
46754a20
MD
106 * Virtual memory maps provide for the mapping, protection, and sharing
107 * of virtual memory objects. In addition, this module provides for an
108 * efficient virtual copy of memory from one map to another.
984263bc 109 *
46754a20 110 * Synchronization is required prior to most operations.
984263bc 111 *
46754a20
MD
112 * Maps consist of an ordered doubly-linked list of simple entries.
 113 * A hint and an RB tree are used to speed up lookups.
984263bc 114 *
46754a20
MD
115 * Callers looking to modify maps specify start/end addresses which cause
116 * the related map entry to be clipped if necessary, and then later
 117 * recombined if the pieces remain compatible.
984263bc 118 *
46754a20
MD
119 * Virtual copy operations are performed by copying VM object references
120 * from one map to another, and then marking both regions as copy-on-write.
984263bc 121 */
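
/*
 * Illustrative sketch only (not part of the original file): a typical
 * read-only walk of the entries covering [start, end), using the
 * structures described above.  The map must be locked by the caller;
 * the lock helpers come from vm_map.h and the variable names here are
 * hypothetical.
 *
 *	vm_map_entry_t entry;
 *
 *	vm_map_lock_read(map);
 *	if (vm_map_lookup_entry(map, start, &entry) == FALSE)
 *		entry = entry->next;
 *	while (entry != &map->header && entry->start < end) {
 *		(examine entry->start, entry->end, entry->eflags, ...)
 *		entry = entry->next;
 *	}
 *	vm_map_unlock_read(map);
 */
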
e3161323 122static void vmspace_terminate(struct vmspace *vm);
e654922c
MD
123static void vmspace_lock(struct vmspace *vm);
124static void vmspace_unlock(struct vmspace *vm);
e3161323
MD
125static void vmspace_dtor(void *obj, void *private);
126
127MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
128
129struct sysref_class vmspace_sysref_class = {
130 .name = "vmspace",
131 .mtype = M_VMSPACE,
132 .proto = SYSREF_PROTO_VMSPACE,
133 .offset = offsetof(struct vmspace, vm_sysref),
134 .objsize = sizeof(struct vmspace),
521f81c7 135 .nom_cache = 32,
e3161323
MD
136 .flags = SRC_MANAGEDINIT,
137 .dtor = vmspace_dtor,
138 .ops = {
e654922c
MD
139 .terminate = (sysref_terminate_func_t)vmspace_terminate,
140 .lock = (sysref_lock_func_t)vmspace_lock,
141 .unlock = (sysref_lock_func_t)vmspace_unlock
e3161323
MD
142 }
143};
984263bc 144
8e5ea5f7
MD
145/*
146 * per-cpu page table cross mappings are initialized in early boot
147 * and might require a considerable number of vm_map_entry structures.
148 */
149#define VMEPERCPU (MAXCPU+1)
c4ae567f 150
a108bf71 151static struct vm_zone mapentzone_store, mapzone_store;
e3161323 152static vm_zone_t mapentzone, mapzone;
a108bf71 153static struct vm_object mapentobj, mapobj;
984263bc
MD
154
155static struct vm_map_entry map_entry_init[MAX_MAPENT];
c4ae567f 156static struct vm_map_entry cpu_map_entry_init[MAXCPU][VMEPERCPU];
984263bc
MD
157static struct vm_map map_init[MAX_KMAP];
158
911e30e2
AH
159static int randomize_mmap;
160SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
161 "Randomize mmap offsets");
162
b12defdc 163static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
a108bf71
MD
164static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
165static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
166static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
167static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
168static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
1388df65
RG
169static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
170static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
171 vm_map_entry_t);
a108bf71 172static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags);
984263bc 173
e3161323 174/*
46754a20
MD
175 * Initialize the vm_map module. Must be called before any other vm_map
176 * routines.
e3161323 177 *
46754a20
MD
178 * Map and entry structures are allocated from the general purpose
179 * memory pool with some exceptions:
e3161323 180 *
46754a20
MD
181 * - The kernel map is allocated statically.
182 * - Initial kernel map entries are allocated out of a static pool.
e3161323
MD
183 *
184 * These restrictions are necessary since malloc() uses the
185 * maps and requires map entries.
46754a20
MD
186 *
187 * Called from the low level boot code only.
e3161323 188 */
984263bc 189void
57e43348 190vm_map_startup(void)
984263bc
MD
191{
192 mapzone = &mapzone_store;
193 zbootinit(mapzone, "MAP", sizeof (struct vm_map),
194 map_init, MAX_KMAP);
984263bc
MD
195 mapentzone = &mapentzone_store;
196 zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
197 map_entry_init, MAX_MAPENT);
198}
199
200/*
46754a20
MD
201 * Called prior to any vmspace allocations.
202 *
203 * Called from the low level boot code only.
e3161323
MD
204 */
205void
206vm_init2(void)
207{
208 zinitna(mapentzone, &mapentobj, NULL, 0, 0,
209 ZONE_USE_RESERVE | ZONE_SPECIAL, 1);
210 zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1);
211 pmap_init2();
212 vm_object_init2();
213}
214
215
216/*
686dbf64 217 * Red black tree functions
46754a20
MD
218 *
219 * The caller must hold the related map lock.
686dbf64
MD
220 */
221static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
222RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
223
224/* a->start is address, and the only field has to be initialized */
225static int
226rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
227{
228 if (a->start < b->start)
229 return(-1);
230 else if (a->start > b->start)
231 return(1);
232 return(0);
233}
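
/*
 * Side note (sketch, not used by this file): RB_GENERATE() above also
 * emits vm_map_rb_tree_RB_FIND(), so an exact-start lookup could in
 * principle be done with a key entry on the stack, as below.  The real
 * lookup path (vm_map_lookup_entry) hand-rolls the tree descent instead
 * so that it can also track the closest preceding entry.
 *
 *	struct vm_map_entry key;
 *	vm_map_entry_t match;
 *
 *	key.start = address;
 *	match = vm_map_rb_tree_RB_FIND(&map->rb_root, &key);
 */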
234
235/*
e3161323
MD
236 * Allocate a vmspace structure, including a vm_map and pmap.
 237 * Initialize numerous fields. While the initial allocation is zeroed,
 238 * subsequent reuse from the objcache leaves elements of the structure
 239 * intact (particularly the pmap), so portions must be zeroed.
240 *
241 * The structure is not considered activated until we call sysref_activate().
46754a20
MD
242 *
243 * No requirements.
984263bc
MD
244 */
245struct vmspace *
57e43348 246vmspace_alloc(vm_offset_t min, vm_offset_t max)
984263bc
MD
247{
248 struct vmspace *vm;
249
e3161323 250 vm = sysref_alloc(&vmspace_sysref_class);
54a764e8 251 bzero(&vm->vm_startcopy,
e3161323 252 (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
a2ee730d
MD
253 vm_map_init(&vm->vm_map, min, max, NULL); /* initializes token */
254
255 /*
256 * Use a hold to prevent any additional racing hold from terminating
257 * the vmspace before we manage to activate it. This also acquires
258 * the token for safety.
259 */
260 KKASSERT(vm->vm_holdcount == 0);
261 KKASSERT(vm->vm_exitingcnt == 0);
262 vmspace_hold(vm);
e3161323 263 pmap_pinit(vmspace_pmap(vm)); /* (some fields reused) */
984263bc 264 vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */
984263bc 265 vm->vm_shm = NULL;
a2ee730d 266 vm->vm_flags = 0;
135d7199 267 cpu_vmspace_alloc(vm);
e3161323 268 sysref_activate(&vm->vm_sysref);
a2ee730d 269 vmspace_drop(vm);
46754a20 270
984263bc
MD
271 return (vm);
272}
273
e3161323 274/*
a2ee730d
MD
275 * Free a primary reference to a vmspace. This can trigger a
276 * stage-1 termination.
277 */
278void
279vmspace_free(struct vmspace *vm)
280{
281 /*
282 * We want all finalization to occur via vmspace_drop() so we
283 * need to hold the vm around the put.
284 */
285 vmspace_hold(vm);
286 sysref_put(&vm->vm_sysref);
287 vmspace_drop(vm);
288}
289
290void
291vmspace_ref(struct vmspace *vm)
292{
293 sysref_get(&vm->vm_sysref);
294}
295
296void
297vmspace_hold(struct vmspace *vm)
298{
299 refcount_acquire(&vm->vm_holdcount);
300 lwkt_gettoken(&vm->vm_map.token);
301}
302
303void
304vmspace_drop(struct vmspace *vm)
305{
306 lwkt_reltoken(&vm->vm_map.token);
307 if (refcount_release(&vm->vm_holdcount)) {
308 if (vm->vm_exitingcnt == 0 &&
309 sysref_isinactive(&vm->vm_sysref)) {
310 vmspace_terminate(vm);
311 }
312 }
313}
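
/*
 * Usage sketch (illustrative; vmspace_swap_count() below follows this
 * shape): a hold/drop pair brackets transient access to a vmspace so a
 * racing termination cannot tear it down mid-operation.  The drop side
 * also runs the deferred termination when the last hold goes away.
 *
 *	vmspace_hold(vm);
 *	(inspect or manipulate vm->vm_map here; the map token is held)
 *	vmspace_drop(vm);
 */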
314
315/*
e3161323
MD
316 * dtor function - Some elements of the pmap are retained in the
317 * free-cached vmspaces to improve performance. We have to clean them up
318 * here before returning the vmspace to the memory pool.
46754a20
MD
319 *
320 * No requirements.
e3161323
MD
321 */
322static void
323vmspace_dtor(void *obj, void *private)
a108bf71 324{
e3161323
MD
325 struct vmspace *vm = obj;
326
327 pmap_puninit(vmspace_pmap(vm));
984263bc
MD
328}
329
e3161323 330/*
a2ee730d
MD
331 * Called in three cases:
332 *
333 * (1) When the last sysref is dropped and the vmspace becomes inactive.
334 * (holdcount will not be 0 because the vmspace is held through the op)
e3161323 335 *
a2ee730d
MD
336 * (2) When exitingcount becomes 0 on the last reap
337 * (holdcount will not be 0 because the vmspace is held through the op)
e3161323 338 *
a2ee730d 339 * (3) When the holdcount becomes 0 in addition to the above two
e3161323
MD
340 *
341 * sysref will not scrap the object until we call sysref_put() once more
342 * after the last ref has been dropped.
46754a20 343 *
a2ee730d
MD
344 * VMSPACE_EXIT1 flags the primary deactivation
345 * VMSPACE_EXIT2 flags the last reap
e3161323
MD
346 */
347static void
348vmspace_terminate(struct vmspace *vm)
984263bc 349{
a108bf71
MD
350 int count;
351
e3161323 352 /*
a2ee730d 353 *
e3161323 354 */
b12defdc 355 lwkt_gettoken(&vm->vm_map.token);
a2ee730d
MD
356 if ((vm->vm_flags & VMSPACE_EXIT1) == 0) {
357 vm->vm_flags |= VMSPACE_EXIT1;
e3161323
MD
358 shmexit(vm);
359 pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
360 VM_MAX_USER_ADDRESS);
361 vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
362 VM_MAX_USER_ADDRESS);
e3161323 363 }
a2ee730d
MD
364 if ((vm->vm_flags & VMSPACE_EXIT2) == 0 && vm->vm_exitingcnt == 0) {
365 vm->vm_flags |= VMSPACE_EXIT2;
366 cpu_vmspace_free(vm);
367 shmexit(vm);
368 KKASSERT(vm->vm_upcalls == NULL);
fef0fdf2 369
a2ee730d
MD
370 /*
371 * Lock the map, to wait out all other references to it.
372 * Delete all of the mappings and pages they hold, then call
373 * the pmap module to reclaim anything left.
374 */
375 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
376 vm_map_lock(&vm->vm_map);
377 vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
378 vm->vm_map.max_offset, &count);
379 vm_map_unlock(&vm->vm_map);
380 vm_map_entry_release(count);
a722be49 381
a2ee730d
MD
382 lwkt_gettoken(&vmspace_pmap(vm)->pm_token);
383 pmap_release(vmspace_pmap(vm));
384 lwkt_reltoken(&vmspace_pmap(vm)->pm_token);
385 }
984263bc 386
b12defdc 387 lwkt_reltoken(&vm->vm_map.token);
a2ee730d
MD
388 if (vm->vm_exitingcnt == 0 && vm->vm_holdcount == 0) {
389 KKASSERT(vm->vm_flags & VMSPACE_EXIT1);
390 KKASSERT(vm->vm_flags & VMSPACE_EXIT2);
391 sysref_put(&vm->vm_sysref);
392 }
984263bc
MD
393}
394
46754a20
MD
395/*
396 * vmspaces are not currently locked.
397 */
e654922c
MD
398static void
399vmspace_lock(struct vmspace *vm __unused)
400{
401}
402
403static void
404vmspace_unlock(struct vmspace *vm __unused)
405{
406}
407
e3161323 408/*
46754a20
MD
409 * This is called during exit indicating that the vmspace is no
 410 * longer in use by an exiting process, but the process has not yet
a2ee730d 411 * been reaped.
46754a20
MD
412 *
413 * No requirements.
414 */
415void
416vmspace_exitbump(struct vmspace *vm)
417{
a2ee730d 418 vmspace_hold(vm);
46754a20 419 ++vm->vm_exitingcnt;
a2ee730d 420 vmspace_drop(vm); /* handles termination sequencing */
46754a20
MD
421}
422
423/*
a2ee730d
MD
424 * Decrement the exitingcnt and issue the stage-2 termination if it becomes
 425 * zero and the stage-1 termination has already occurred.
46754a20
MD
426 *
427 * No requirements.
e3161323 428 */
984263bc
MD
429void
430vmspace_exitfree(struct proc *p)
431{
432 struct vmspace *vm;
433
434 vm = p->p_vmspace;
435 p->p_vmspace = NULL;
a2ee730d
MD
436 vmspace_hold(vm);
437 KKASSERT(vm->vm_exitingcnt > 0);
438 if (--vm->vm_exitingcnt == 0 && sysref_isinactive(&vm->vm_sysref))
e3161323 439 vmspace_terminate(vm);
a2ee730d 440 vmspace_drop(vm); /* handles termination sequencing */
984263bc
MD
441}
442
443/*
46754a20
MD
 444 * Swap usage is determined by taking the proportional swap used by
445 * VM objects backing the VM map. To make up for fractional losses,
446 * if the VM object has any swap use at all the associated map entries
447 * count for at least 1 swap page.
984263bc 448 *
46754a20 449 * No requirements.
984263bc
MD
450 */
451int
b12defdc 452vmspace_swap_count(struct vmspace *vm)
984263bc 453{
b12defdc 454 vm_map_t map = &vm->vm_map;
984263bc 455 vm_map_entry_t cur;
1b874851 456 vm_object_t object;
984263bc 457 int count = 0;
1b874851 458 int n;
984263bc 459
a2ee730d 460 vmspace_hold(vm);
984263bc 461 for (cur = map->header.next; cur != &map->header; cur = cur->next) {
1b874851
MD
462 switch(cur->maptype) {
463 case VM_MAPTYPE_NORMAL:
464 case VM_MAPTYPE_VPAGETABLE:
465 if ((object = cur->object.vm_object) == NULL)
466 break;
96adc753
MD
467 if (object->swblock_count) {
468 n = (cur->end - cur->start) / PAGE_SIZE;
469 count += object->swblock_count *
984263bc
MD
470 SWAP_META_PAGES * n / object->size + 1;
471 }
1b874851
MD
472 break;
473 default:
474 break;
984263bc
MD
475 }
476 }
a2ee730d
MD
477 vmspace_drop(vm);
478
984263bc
MD
479 return(count);
480}
481
20479584 482/*
46754a20
MD
483 * Calculate the approximate number of anonymous pages in use by
484 * this vmspace. To make up for fractional losses, we count each
485 * VM object as having at least 1 anonymous page.
20479584 486 *
46754a20 487 * No requirements.
20479584
MD
488 */
489int
b12defdc 490vmspace_anonymous_count(struct vmspace *vm)
20479584 491{
b12defdc 492 vm_map_t map = &vm->vm_map;
20479584
MD
493 vm_map_entry_t cur;
494 vm_object_t object;
495 int count = 0;
496
a2ee730d 497 vmspace_hold(vm);
20479584
MD
498 for (cur = map->header.next; cur != &map->header; cur = cur->next) {
499 switch(cur->maptype) {
500 case VM_MAPTYPE_NORMAL:
501 case VM_MAPTYPE_VPAGETABLE:
502 if ((object = cur->object.vm_object) == NULL)
503 break;
504 if (object->type != OBJT_DEFAULT &&
505 object->type != OBJT_SWAP) {
506 break;
507 }
508 count += object->resident_page_count;
509 break;
510 default:
511 break;
512 }
513 }
a2ee730d
MD
514 vmspace_drop(vm);
515
20479584
MD
516 return(count);
517}
518
984263bc 519/*
46754a20
MD
520 * Creates and returns a new empty VM map with the given physical map
521 * structure, and having the given lower and upper address bounds.
984263bc 522 *
46754a20 523 * No requirements.
984263bc
MD
524 */
525vm_map_t
e4846942 526vm_map_create(vm_map_t result, pmap_t pmap, vm_offset_t min, vm_offset_t max)
984263bc 527{
e4846942
MD
528 if (result == NULL)
529 result = zalloc(mapzone);
530 vm_map_init(result, min, max, pmap);
984263bc
MD
531 return (result);
532}
533
534/*
46754a20
MD
535 * Initialize an existing vm_map structure such as that in the vmspace
536 * structure. The pmap is initialized elsewhere.
537 *
538 * No requirements.
984263bc
MD
539 */
540void
e4846942 541vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
984263bc
MD
542{
543 map->header.next = map->header.prev = &map->header;
686dbf64 544 RB_INIT(&map->rb_root);
984263bc
MD
545 map->nentries = 0;
546 map->size = 0;
547 map->system_map = 0;
984263bc
MD
548 map->min_offset = min;
549 map->max_offset = max;
e4846942 550 map->pmap = pmap;
984263bc
MD
551 map->first_free = &map->header;
552 map->hint = &map->header;
553 map->timestamp = 0;
69e16e2a 554 map->flags = 0;
b12defdc 555 lwkt_token_init(&map->token, "vm_map");
625a2937 556 lockinit(&map->lock, "thrd_sleep", (hz + 9) / 10, 0);
521f81c7 557 TUNABLE_INT("vm.cache_vmspaces", &vmspace_sysref_class.nom_cache);
984263bc
MD
558}
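
/*
 * Sketch (illustrative; the pmap and address bounds are assumptions):
 * a standalone map can either be materialized from the map zone by
 * passing a NULL result pointer to vm_map_create(), or built in
 * caller-provided storage with vm_map_init() as above.
 *
 *	vm_map_t newmap;
 *
 *	newmap = vm_map_create(NULL, &kernel_pmap,
 *			       VM_MIN_KERNEL_ADDRESS,
 *			       VM_MAX_KERNEL_ADDRESS);
 */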
559
560/*
53025830
MD
561 * Shadow the vm_map_entry's object. This typically needs to be done when
562 * a write fault is taken on an entry which had previously been cloned by
563 * fork(). The shared object (which might be NULL) must become private so
564 * we add a shadow layer above it.
565 *
 566 * Object allocation for anonymous mappings is deferred as long as possible.
567 * When creating a shadow, however, the underlying object must be instantiated
568 * so it can be shared.
569 *
570 * If the map segment is governed by a virtual page table then it is
571 * possible to address offsets beyond the mapped area. Just allocate
572 * a maximally sized object for this case.
46754a20
MD
573 *
574 * The vm_map must be exclusively locked.
575 * No other requirements.
53025830
MD
576 */
577static
578void
b12defdc 579vm_map_entry_shadow(vm_map_entry_t entry, int addref)
53025830
MD
580{
581 if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
582 vm_object_shadow(&entry->object.vm_object, &entry->offset,
b12defdc 583 0x7FFFFFFF, addref); /* XXX */
53025830
MD
584 } else {
585 vm_object_shadow(&entry->object.vm_object, &entry->offset,
b12defdc 586 atop(entry->end - entry->start), addref);
53025830
MD
587 }
588 entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
589}
590
591/*
592 * Allocate an object for a vm_map_entry.
593 *
 594 * Object allocation for anonymous mappings is deferred as long as possible.
595 * This function is called when we can defer no longer, generally when a map
596 * entry might be split or forked or takes a page fault.
597 *
598 * If the map segment is governed by a virtual page table then it is
599 * possible to address offsets beyond the mapped area. Just allocate
600 * a maximally sized object for this case.
46754a20
MD
601 *
602 * The vm_map must be exclusively locked.
603 * No other requirements.
53025830
MD
604 */
605void
606vm_map_entry_allocate_object(vm_map_entry_t entry)
607{
608 vm_object_t obj;
609
610 if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
611 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
612 } else {
613 obj = vm_object_allocate(OBJT_DEFAULT,
614 atop(entry->end - entry->start));
615 }
616 entry->object.vm_object = obj;
617 entry->offset = 0;
618}
619
620/*
46754a20
MD
621 * Set an initial negative count so the first attempt to reserve
622 * space preloads a bunch of vm_map_entry's for this cpu. Also
623 * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
624 * map a new page for vm_map_entry structures. SMP systems are
625 * particularly sensitive.
c4ae567f 626 *
46754a20
MD
627 * This routine is called in early boot so we cannot just call
628 * vm_map_entry_reserve().
41a01a4d 629 *
46754a20 630 * Called from the low level boot code only (for each cpu)
41a01a4d
MD
631 */
632void
633vm_map_entry_reserve_cpu_init(globaldata_t gd)
634{
c4ae567f
MD
635 vm_map_entry_t entry;
636 int i;
637
41a01a4d 638 gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2;
c4ae567f
MD
639 entry = &cpu_map_entry_init[gd->gd_cpuid][0];
640 for (i = 0; i < VMEPERCPU; ++i, ++entry) {
641 entry->next = gd->gd_vme_base;
642 gd->gd_vme_base = entry;
643 }
41a01a4d
MD
644}
645
646/*
46754a20
MD
647 * Reserves vm_map_entry structures so code later on can manipulate
 648 * map_entry structures within a locked map without blocking while
 649 * trying to allocate a new vm_map_entry.
a108bf71 650 *
46754a20 651 * No requirements.
a108bf71
MD
652 */
653int
654vm_map_entry_reserve(int count)
655{
656 struct globaldata *gd = mycpu;
657 vm_map_entry_t entry;
658
a108bf71
MD
659 /*
660 * Make sure we have enough structures in gd_vme_base to handle
661 * the reservation request.
a5fc46c9
MD
662 *
663 * The critical section protects access to the per-cpu gd.
a108bf71 664 */
46754a20 665 crit_enter();
ac13eccd 666 while (gd->gd_vme_avail < count) {
a108bf71
MD
667 entry = zalloc(mapentzone);
668 entry->next = gd->gd_vme_base;
669 gd->gd_vme_base = entry;
670 ++gd->gd_vme_avail;
671 }
ac13eccd 672 gd->gd_vme_avail -= count;
a108bf71 673 crit_exit();
46754a20 674
a108bf71
MD
675 return(count);
676}
677
678/*
46754a20
MD
679 * Releases previously reserved vm_map_entry structures that were not
680 * used. If we have too much junk in our per-cpu cache clean some of
681 * it out.
a108bf71 682 *
46754a20 683 * No requirements.
a108bf71
MD
684 */
685void
686vm_map_entry_release(int count)
687{
688 struct globaldata *gd = mycpu;
689 vm_map_entry_t entry;
690
691 crit_enter();
692 gd->gd_vme_avail += count;
693 while (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
694 entry = gd->gd_vme_base;
695 KKASSERT(entry != NULL);
696 gd->gd_vme_base = entry->next;
697 --gd->gd_vme_avail;
698 crit_exit();
699 zfree(mapentzone, entry);
700 crit_enter();
701 }
702 crit_exit();
703}
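
/*
 * Usage sketch (the same bracket appears in vmspace_terminate() above
 * and in vm_map_find()/vm_map_submap() below): reserve entries before
 * taking the map lock so that clips and inserts done under the lock
 * never block in zalloc().
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	(clip / insert / delete entries, passing &count through)
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */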
704
705/*
46754a20
MD
706 * Reserve map entry structures for use in kernel_map itself. These
707 * entries have *ALREADY* been reserved on a per-cpu basis when the map
708 * was inited. This function is used by zalloc() to avoid a recursion
709 * when zalloc() itself needs to allocate additional kernel memory.
a108bf71 710 *
46754a20
MD
711 * This function works like the normal reserve but does not load the
712 * vm_map_entry cache (because that would result in an infinite
713 * recursion). Note that gd_vme_avail may go negative. This is expected.
c4ae567f 714 *
46754a20
MD
715 * Any caller of this function must be sure to renormalize after
716 * potentially eating entries to ensure that the reserve supply
717 * remains intact.
a108bf71 718 *
46754a20 719 * No requirements.
a108bf71
MD
720 */
721int
722vm_map_entry_kreserve(int count)
723{
724 struct globaldata *gd = mycpu;
725
726 crit_enter();
c4ae567f 727 gd->gd_vme_avail -= count;
a108bf71 728 crit_exit();
46754a20
MD
729 KASSERT(gd->gd_vme_base != NULL,
730 ("no reserved entries left, gd_vme_avail = %d\n",
731 gd->gd_vme_avail));
a108bf71
MD
732 return(count);
733}
734
735/*
46754a20
MD
736 * Release previously reserved map entries for kernel_map. We do not
737 * attempt to clean up like the normal release function as this would
738 * cause an unnecessary (but probably not fatal) deep procedure call.
a108bf71 739 *
46754a20 740 * No requirements.
a108bf71
MD
741 */
742void
743vm_map_entry_krelease(int count)
744{
745 struct globaldata *gd = mycpu;
746
747 crit_enter();
c4ae567f 748 gd->gd_vme_avail += count;
a108bf71
MD
749 crit_exit();
750}
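
/*
 * Usage sketch (illustrative): allocator paths that manipulate the
 * kernel_map use the "k" variants so the per-cpu reserve is consumed
 * rather than refilled recursively, and renormalize later through the
 * normal reserve/release path.
 *
 *	count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(&kernel_map);
 *	(manipulate kernel_map entries)
 *	vm_map_unlock(&kernel_map);
 *	vm_map_entry_krelease(count);
 */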
751
752/*
46754a20 753 * Allocates a VM map entry for insertion. No entry fields are filled in.
984263bc 754 *
46754a20
MD
755 * The entries should have previously been reserved. The reservation count
756 * is tracked in (*countp).
a108bf71 757 *
46754a20 758 * No requirements.
984263bc 759 */
8a8d5d85 760static vm_map_entry_t
a108bf71 761vm_map_entry_create(vm_map_t map, int *countp)
984263bc 762{
a108bf71
MD
763 struct globaldata *gd = mycpu;
764 vm_map_entry_t entry;
8a8d5d85 765
a108bf71
MD
766 KKASSERT(*countp > 0);
767 --*countp;
768 crit_enter();
769 entry = gd->gd_vme_base;
770 KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
771 gd->gd_vme_base = entry->next;
772 crit_exit();
46754a20 773
a108bf71 774 return(entry);
984263bc
MD
775}
776
777/*
46754a20 778 * Dispose of a vm_map_entry that is no longer being referenced.
984263bc 779 *
46754a20 780 * No requirements.
984263bc 781 */
8a8d5d85 782static void
a108bf71 783vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
984263bc 784{
a108bf71
MD
785 struct globaldata *gd = mycpu;
786
686dbf64
MD
787 KKASSERT(map->hint != entry);
788 KKASSERT(map->first_free != entry);
789
a108bf71
MD
790 ++*countp;
791 crit_enter();
792 entry->next = gd->gd_vme_base;
793 gd->gd_vme_base = entry;
794 crit_exit();
984263bc
MD
795}
796
8a8d5d85 797
984263bc 798/*
46754a20 799 * Insert/remove entries from maps.
984263bc 800 *
46754a20 801 * The related map must be exclusively locked.
b12defdc 802 * The caller must hold map->token
46754a20 803 * No other requirements.
984263bc
MD
804 */
805static __inline void
806vm_map_entry_link(vm_map_t map,
807 vm_map_entry_t after_where,
808 vm_map_entry_t entry)
809{
46754a20
MD
810 ASSERT_VM_MAP_LOCKED(map);
811
984263bc
MD
812 map->nentries++;
813 entry->prev = after_where;
814 entry->next = after_where->next;
815 entry->next->prev = entry;
816 after_where->next = entry;
0cd275af
MD
817 if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
818 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
984263bc
MD
819}
820
821static __inline void
822vm_map_entry_unlink(vm_map_t map,
823 vm_map_entry_t entry)
824{
825 vm_map_entry_t prev;
826 vm_map_entry_t next;
827
46754a20
MD
828 ASSERT_VM_MAP_LOCKED(map);
829
830 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
831 panic("vm_map_entry_unlink: attempt to mess with "
832 "locked entry! %p", entry);
833 }
984263bc
MD
834 prev = entry->prev;
835 next = entry->next;
836 next->prev = prev;
837 prev->next = next;
686dbf64 838 vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
984263bc
MD
839 map->nentries--;
840}
841
842/*
46754a20
MD
843 * Finds the map entry containing (or immediately preceding) the specified
844 * address in the given map. The entry is returned in (*entry).
845 *
846 * The boolean result indicates whether the address is actually contained
847 * in the map.
984263bc 848 *
46754a20
MD
849 * The related map must be locked.
850 * No other requirements.
984263bc
MD
851 */
852boolean_t
46754a20 853vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
984263bc 854{
686dbf64 855 vm_map_entry_t tmp;
984263bc
MD
856 vm_map_entry_t last;
857
46754a20 858 ASSERT_VM_MAP_LOCKED(map);
686dbf64 859#if 0
984263bc 860 /*
686dbf64
MD
861 * XXX TEMPORARILY DISABLED. For some reason our attempt to revive
862 * the hint code with the red-black lookup meets with system crashes
863 * and lockups. We do not yet know why.
864 *
865 * It is possible that the problem is related to the setting
866 * of the hint during map_entry deletion, in the code specified
867 * at the GGG comment later on in this file.
aacb506b
MD
868 *
869 * YYY More likely it's because this function can be called with
870 * a shared lock on the map, resulting in map->hint updates possibly
871 * racing. Fixed now but untested.
984263bc 872 */
686dbf64
MD
873 /*
874 * Quickly check the cached hint, there's a good chance of a match.
875 */
aacb506b
MD
876 tmp = map->hint;
877 cpu_ccfence();
878 if (tmp != &map->header) {
686dbf64
MD
879 if (address >= tmp->start && address < tmp->end) {
880 *entry = tmp;
881 return(TRUE);
984263bc 882 }
984263bc 883 }
686dbf64 884#endif
984263bc
MD
885
886 /*
686dbf64
MD
887 * Locate the record from the top of the tree. 'last' tracks the
888 * closest prior record and is returned if no match is found, which
889 * in binary tree terms means tracking the most recent right-branch
890 * taken. If there is no prior record, &map->header is returned.
984263bc 891 */
686dbf64
MD
892 last = &map->header;
893 tmp = RB_ROOT(&map->rb_root);
894
895 while (tmp) {
896 if (address >= tmp->start) {
897 if (address < tmp->end) {
898 *entry = tmp;
899 map->hint = tmp;
900 return(TRUE);
984263bc 901 }
686dbf64
MD
902 last = tmp;
903 tmp = RB_RIGHT(tmp, rb_entry);
904 } else {
905 tmp = RB_LEFT(tmp, rb_entry);
984263bc 906 }
984263bc 907 }
686dbf64 908 *entry = last;
984263bc
MD
909 return (FALSE);
910}
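
/*
 * Example of the return convention (sketch; variable names are
 * hypothetical): a TRUE return means *entry contains the address,
 * while FALSE means *entry is the closest preceding entry (possibly
 * &map->header), which is exactly what the insertion code below
 * relies on.
 *
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		(addr lies inside [entry->start, entry->end))
 *	} else {
 *		(addr is unmapped; entry is its predecessor)
 *	}
 */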
911
912/*
46754a20
MD
913 * Inserts the given whole VM object into the target map at the specified
914 * address range. The object's size should match that of the address range.
984263bc 915 *
46754a20 916 * The map must be exclusively locked.
b12defdc 917 * The object must be held.
46754a20 918 * The caller must have reserved sufficient vm_map_entry structures.
984263bc 919 *
b12defdc
MD
920 * If object is non-NULL, ref count must be bumped by caller prior to
921 * making call to account for the new entry.
984263bc
MD
922 */
923int
a108bf71
MD
924vm_map_insert(vm_map_t map, int *countp,
925 vm_object_t object, vm_ooffset_t offset,
1b874851
MD
926 vm_offset_t start, vm_offset_t end,
927 vm_maptype_t maptype,
928 vm_prot_t prot, vm_prot_t max,
984263bc
MD
929 int cow)
930{
931 vm_map_entry_t new_entry;
932 vm_map_entry_t prev_entry;
933 vm_map_entry_t temp_entry;
934 vm_eflags_t protoeflags;
b12defdc 935 int must_drop = 0;
984263bc 936
46754a20 937 ASSERT_VM_MAP_LOCKED(map);
b12defdc
MD
938 if (object)
939 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
46754a20 940
984263bc
MD
941 /*
942 * Check that the start and end points are not bogus.
943 */
984263bc
MD
944 if ((start < map->min_offset) || (end > map->max_offset) ||
945 (start >= end))
946 return (KERN_INVALID_ADDRESS);
947
948 /*
949 * Find the entry prior to the proposed starting address; if it's part
950 * of an existing entry, this range is bogus.
951 */
984263bc
MD
952 if (vm_map_lookup_entry(map, start, &temp_entry))
953 return (KERN_NO_SPACE);
954
955 prev_entry = temp_entry;
956
957 /*
958 * Assert that the next entry doesn't overlap the end point.
959 */
960
961 if ((prev_entry->next != &map->header) &&
962 (prev_entry->next->start < end))
963 return (KERN_NO_SPACE);
964
965 protoeflags = 0;
966
967 if (cow & MAP_COPY_ON_WRITE)
968 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
969
970 if (cow & MAP_NOFAULT) {
971 protoeflags |= MAP_ENTRY_NOFAULT;
972
973 KASSERT(object == NULL,
974 ("vm_map_insert: paradoxical MAP_NOFAULT request"));
975 }
976 if (cow & MAP_DISABLE_SYNCER)
977 protoeflags |= MAP_ENTRY_NOSYNC;
978 if (cow & MAP_DISABLE_COREDUMP)
979 protoeflags |= MAP_ENTRY_NOCOREDUMP;
c809941b
MD
980 if (cow & MAP_IS_STACK)
981 protoeflags |= MAP_ENTRY_STACK;
e40cfbd7
MD
982 if (cow & MAP_IS_KSTACK)
983 protoeflags |= MAP_ENTRY_KSTACK;
984263bc 984
b12defdc 985 lwkt_gettoken(&map->token);
2de4f77e 986
984263bc
MD
987 if (object) {
988 /*
989 * When object is non-NULL, it could be shared with another
990 * process. We have to set or clear OBJ_ONEMAPPING
991 * appropriately.
992 */
993 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
994 vm_object_clear_flag(object, OBJ_ONEMAPPING);
995 }
996 }
997 else if ((prev_entry != &map->header) &&
998 (prev_entry->eflags == protoeflags) &&
999 (prev_entry->end == start) &&
1000 (prev_entry->wired_count == 0) &&
1b874851 1001 prev_entry->maptype == maptype &&
984263bc
MD
1002 ((prev_entry->object.vm_object == NULL) ||
1003 vm_object_coalesce(prev_entry->object.vm_object,
1004 OFF_TO_IDX(prev_entry->offset),
1005 (vm_size_t)(prev_entry->end - prev_entry->start),
1006 (vm_size_t)(end - prev_entry->end)))) {
1007 /*
1008 * We were able to extend the object. Determine if we
1009 * can extend the previous map entry to include the
1010 * new range as well.
1011 */
1012 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1013 (prev_entry->protection == prot) &&
1014 (prev_entry->max_protection == max)) {
1015 map->size += (end - prev_entry->end);
1016 prev_entry->end = end;
a108bf71 1017 vm_map_simplify_entry(map, prev_entry, countp);
b12defdc 1018 lwkt_reltoken(&map->token);
984263bc
MD
1019 return (KERN_SUCCESS);
1020 }
1021
1022 /*
1023 * If we can extend the object but cannot extend the
1024 * map entry, we have to create a new map entry. We
1025 * must bump the ref count on the extended object to
1026 * account for it. object may be NULL.
1027 */
1028 object = prev_entry->object.vm_object;
1029 offset = prev_entry->offset +
1030 (prev_entry->end - prev_entry->start);
b12defdc
MD
1031 if (object) {
1032 vm_object_hold(object);
1033 vm_object_chain_wait(object);
1034 vm_object_reference_locked(object);
1035 must_drop = 1;
1036 }
984263bc
MD
1037 }
1038
1039 /*
1040 * NOTE: if conditionals fail, object can be NULL here. This occurs
1041 * in things like the buffer map where we manage kva but do not manage
1042 * backing objects.
1043 */
1044
1045 /*
1046 * Create a new entry
1047 */
1048
a108bf71 1049 new_entry = vm_map_entry_create(map, countp);
984263bc
MD
1050 new_entry->start = start;
1051 new_entry->end = end;
1052
1b874851 1053 new_entry->maptype = maptype;
984263bc
MD
1054 new_entry->eflags = protoeflags;
1055 new_entry->object.vm_object = object;
1056 new_entry->offset = offset;
afeabdca 1057 new_entry->aux.master_pde = 0;
984263bc
MD
1058
1059 new_entry->inheritance = VM_INHERIT_DEFAULT;
1060 new_entry->protection = prot;
1061 new_entry->max_protection = max;
1062 new_entry->wired_count = 0;
1063
1064 /*
1065 * Insert the new entry into the list
1066 */
1067
1068 vm_map_entry_link(map, prev_entry, new_entry);
1069 map->size += new_entry->end - new_entry->start;
1070
1071 /*
791c6551
MD
1072 * Update the free space hint. Entries cannot overlap.
1073 * An exact comparison is needed to avoid matching
1074 * against the map->header.
984263bc
MD
1075 */
1076 if ((map->first_free == prev_entry) &&
791c6551 1077 (prev_entry->end == new_entry->start)) {
984263bc
MD
1078 map->first_free = new_entry;
1079 }
1080
1081#if 0
1082 /*
1083 * Temporarily removed to avoid MAP_STACK panic, due to
1084 * MAP_STACK being a huge hack. Will be added back in
1085 * when MAP_STACK (and the user stack mapping) is fixed.
1086 */
1087 /*
1088 * It may be possible to simplify the entry
1089 */
a108bf71 1090 vm_map_simplify_entry(map, new_entry, countp);
984263bc
MD
1091#endif
1092
afeabdca
MD
1093 /*
1094 * Try to pre-populate the page table. Mappings governed by virtual
1095 * page tables cannot be prepopulated without a lot of work, so
1096 * don't try.
1097 */
1098 if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1099 maptype != VM_MAPTYPE_VPAGETABLE) {
083a7402 1100 pmap_object_init_pt(map->pmap, start, prot,
984263bc
MD
1101 object, OFF_TO_IDX(offset), end - start,
1102 cow & MAP_PREFAULT_PARTIAL);
1103 }
b12defdc
MD
1104 if (must_drop)
1105 vm_object_drop(object);
984263bc 1106
b12defdc 1107 lwkt_reltoken(&map->token);
984263bc
MD
1108 return (KERN_SUCCESS);
1109}
1110
1111/*
1112 * Find sufficient space for `length' bytes in the given map, starting at
46754a20 1113 * `start'. Returns 0 on success, 1 on no space.
e9bb90e8
MD
1114 *
 1115 * This function will return an arbitrarily aligned pointer. If no
1116 * particular alignment is required you should pass align as 1. Note that
1117 * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1118 * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1119 * argument.
1120 *
1121 * 'align' should be a power of 2 but is not required to be.
46754a20
MD
1122 *
1123 * The map must be exclusively locked.
1124 * No other requirements.
984263bc
MD
1125 */
1126int
c809941b 1127vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
9388fcaa 1128 vm_size_t align, int flags, vm_offset_t *addr)
984263bc
MD
1129{
1130 vm_map_entry_t entry, next;
1131 vm_offset_t end;
e9bb90e8 1132 vm_offset_t align_mask;
984263bc
MD
1133
1134 if (start < map->min_offset)
1135 start = map->min_offset;
1136 if (start > map->max_offset)
1137 return (1);
1138
1139 /*
e9bb90e8
MD
1140 * If the alignment is not a power of 2 we will have to use
1141 * a mod/division, set align_mask to a special value.
1142 */
1143 if ((align | (align - 1)) + 1 != (align << 1))
1144 align_mask = (vm_offset_t)-1;
1145 else
1146 align_mask = align - 1;
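
	/*
	 * Worked example (illustrative): for align = 16 the comparison
	 * above is false, align_mask becomes 0xf and the masked rounding
	 * in the loop below reduces to (start + 15) & ~15.  For
	 * align = 24 the comparison is true, align_mask is set to -1 and
	 * the slower form ((start + 23) / 24) * 24 is used instead.
	 */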
1147
1148 /*
984263bc
MD
1149 * Look for the first possible address; if there's already something
1150 * at this address, we have to start after it.
1151 */
1152 if (start == map->min_offset) {
1153 if ((entry = map->first_free) != &map->header)
1154 start = entry->end;
1155 } else {
1156 vm_map_entry_t tmp;
1157
1158 if (vm_map_lookup_entry(map, start, &tmp))
1159 start = tmp->end;
1160 entry = tmp;
1161 }
1162
1163 /*
1164 * Look through the rest of the map, trying to fit a new region in the
1165 * gap between existing regions, or after the very last region.
1166 */
1167 for (;; start = (entry = next)->end) {
1168 /*
e9bb90e8
MD
1169 * Adjust the proposed start by the requested alignment,
1170 * be sure that we didn't wrap the address.
1171 */
1172 if (align_mask == (vm_offset_t)-1)
1173 end = ((start + align - 1) / align) * align;
1174 else
1175 end = (start + align_mask) & ~align_mask;
1176 if (end < start)
1177 return (1);
1178 start = end;
1179 /*
984263bc 1180 * Find the end of the proposed new region. Be sure we didn't
e9bb90e8
MD
1181 * go beyond the end of the map, or wrap around the address.
1182 * Then check to see if this is the last entry or if the
1183 * proposed end fits in the gap between this and the next
1184 * entry.
984263bc
MD
1185 */
1186 end = start + length;
1187 if (end > map->max_offset || end < start)
1188 return (1);
1189 next = entry->next;
c809941b
MD
1190
1191 /*
1192 * If the next entry's start address is beyond the desired
1193 * end address we may have found a good entry.
1194 *
1195 * If the next entry is a stack mapping we do not map into
1196 * the stack's reserved space.
1197 *
1198 * XXX continue to allow mapping into the stack's reserved
1199 * space if doing a MAP_STACK mapping inside a MAP_STACK
1200 * mapping, for backwards compatibility. But the caller
1201 * really should use MAP_STACK | MAP_TRYFIXED if they
1202 * want to do that.
1203 */
1204 if (next == &map->header)
984263bc 1205 break;
c809941b
MD
1206 if (next->start >= end) {
1207 if ((next->eflags & MAP_ENTRY_STACK) == 0)
1208 break;
1209 if (flags & MAP_STACK)
1210 break;
1211 if (next->start - next->aux.avail_ssize >= end)
1212 break;
1213 }
984263bc 1214 }
686dbf64 1215 map->hint = entry;
a8cf2878
MD
1216
1217 /*
1218 * Grow the kernel_map if necessary. pmap_growkernel() will panic
1219 * if it fails. The kernel_map is locked and nothing can steal
1220 * our address space if pmap_growkernel() blocks.
1221 *
1222 * NOTE: This may be unconditionally called for kldload areas on
1223 * x86_64 because these do not bump kernel_vm_end (which would
1224 * fill 128G worth of page tables!). Therefore we must not
1225 * retry.
1226 */
e4846942 1227 if (map == &kernel_map) {
a8cf2878
MD
1228 vm_offset_t kstop;
1229
1230 kstop = round_page(start + length);
1231 if (kstop > kernel_vm_end)
1232 pmap_growkernel(start, kstop);
984263bc 1233 }
a108bf71 1234 *addr = start;
984263bc
MD
1235 return (0);
1236}
1237
1238/*
46754a20 1239 * vm_map_find finds an unallocated region in the target address map with
b12defdc
MD
1240 * the given length and allocates it. The search is defined to be first-fit
1241 * from the specified address; the region found is returned in the same
1242 * parameter.
984263bc 1243 *
46754a20
MD
1244 * If object is non-NULL, ref count must be bumped by caller
1245 * prior to making call to account for the new entry.
1246 *
1247 * No requirements. This function will lock the map temporarily.
984263bc
MD
1248 */
1249int
1250vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
9388fcaa 1251 vm_offset_t *addr, vm_size_t length, vm_size_t align,
c809941b 1252 boolean_t fitit,
1b874851
MD
1253 vm_maptype_t maptype,
1254 vm_prot_t prot, vm_prot_t max,
1255 int cow)
984263bc
MD
1256{
1257 vm_offset_t start;
03aa8d99 1258 int result;
a108bf71 1259 int count;
984263bc
MD
1260
1261 start = *addr;
1262
a108bf71 1263 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
984263bc 1264 vm_map_lock(map);
b12defdc
MD
1265 if (object)
1266 vm_object_hold(object);
c809941b 1267 if (fitit) {
9388fcaa 1268 if (vm_map_findspace(map, start, length, align, 0, addr)) {
c4ad22e2
MD
1269 if (object)
1270 vm_object_drop(object);
984263bc 1271 vm_map_unlock(map);
a108bf71 1272 vm_map_entry_release(count);
984263bc
MD
1273 return (KERN_NO_SPACE);
1274 }
1275 start = *addr;
1276 }
a108bf71 1277 result = vm_map_insert(map, &count, object, offset,
1b874851
MD
1278 start, start + length,
1279 maptype,
1280 prot, max,
1281 cow);
b12defdc
MD
1282 if (object)
1283 vm_object_drop(object);
984263bc 1284 vm_map_unlock(map);
a108bf71 1285 vm_map_entry_release(count);
984263bc 1286
984263bc
MD
1287 return (result);
1288}
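
/*
 * Usage sketch (hedged; the flags chosen here are assumptions): a
 * first-fit anonymous mapping might be requested roughly like this,
 * subject to the object reference rule described above.
 *
 *	vm_offset_t addr = 0;
 *	int rv;
 *
 *	rv = vm_map_find(map, NULL, 0, &addr, size, PAGE_SIZE, TRUE,
 *			 VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, 0);
 *	if (rv != KERN_SUCCESS)
 *		(handle the failure)
 */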
1289
1290/*
46754a20
MD
1291 * Simplify the given map entry by merging with either neighbor. This
1292 * routine also has the ability to merge with both neighbors.
984263bc 1293 *
46754a20
MD
 1294 * This routine guarantees that the passed entry remains valid (though
1295 * possibly extended). When merging, this routine may delete one or
1296 * both neighbors. No action is taken on entries which have their
1297 * in-transition flag set.
984263bc 1298 *
46754a20 1299 * The map must be exclusively locked.
984263bc
MD
1300 */
1301void
a108bf71 1302vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
984263bc
MD
1303{
1304 vm_map_entry_t next, prev;
1305 vm_size_t prevsize, esize;
1306
1b874851 1307 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
12e4aaff 1308 ++mycpu->gd_cnt.v_intrans_coll;
984263bc
MD
1309 return;
1310 }
1311
1b874851
MD
1312 if (entry->maptype == VM_MAPTYPE_SUBMAP)
1313 return;
1314
984263bc
MD
1315 prev = entry->prev;
1316 if (prev != &map->header) {
1317 prevsize = prev->end - prev->start;
1318 if ( (prev->end == entry->start) &&
1b874851 1319 (prev->maptype == entry->maptype) &&
984263bc
MD
1320 (prev->object.vm_object == entry->object.vm_object) &&
1321 (!prev->object.vm_object ||
1322 (prev->offset + prevsize == entry->offset)) &&
1323 (prev->eflags == entry->eflags) &&
1324 (prev->protection == entry->protection) &&
1325 (prev->max_protection == entry->max_protection) &&
1326 (prev->inheritance == entry->inheritance) &&
1327 (prev->wired_count == entry->wired_count)) {
1328 if (map->first_free == prev)
1329 map->first_free = entry;
1330 if (map->hint == prev)
1331 map->hint = entry;
1332 vm_map_entry_unlink(map, prev);
1333 entry->start = prev->start;
1334 entry->offset = prev->offset;
1335 if (prev->object.vm_object)
1336 vm_object_deallocate(prev->object.vm_object);
a108bf71 1337 vm_map_entry_dispose(map, prev, countp);
984263bc
MD
1338 }
1339 }
1340
1341 next = entry->next;
1342 if (next != &map->header) {
1343 esize = entry->end - entry->start;
1344 if ((entry->end == next->start) &&
1b874851 1345 (next->maptype == entry->maptype) &&
984263bc
MD
1346 (next->object.vm_object == entry->object.vm_object) &&
1347 (!entry->object.vm_object ||
1348 (entry->offset + esize == next->offset)) &&
1349 (next->eflags == entry->eflags) &&
1350 (next->protection == entry->protection) &&
1351 (next->max_protection == entry->max_protection) &&
1352 (next->inheritance == entry->inheritance) &&
1353 (next->wired_count == entry->wired_count)) {
1354 if (map->first_free == next)
1355 map->first_free = entry;
1356 if (map->hint == next)
1357 map->hint = entry;
1358 vm_map_entry_unlink(map, next);
1359 entry->end = next->end;
1360 if (next->object.vm_object)
1361 vm_object_deallocate(next->object.vm_object);
a108bf71 1362 vm_map_entry_dispose(map, next, countp);
984263bc
MD
1363 }
1364 }
1365}
46754a20 1366
984263bc 1367/*
46754a20
MD
1368 * Asserts that the given entry begins at or after the specified address.
1369 * If necessary, it splits the entry into two.
984263bc 1370 */
46754a20
MD
1371#define vm_map_clip_start(map, entry, startaddr, countp) \
1372{ \
1373 if (startaddr > entry->start) \
1374 _vm_map_clip_start(map, entry, startaddr, countp); \
984263bc
MD
1375}
1376
1377/*
46754a20
MD
1378 * This routine is called only when it is known that the entry must be split.
1379 *
1380 * The map must be exclusively locked.
984263bc
MD
1381 */
1382static void
46754a20
MD
1383_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1384 int *countp)
984263bc
MD
1385{
1386 vm_map_entry_t new_entry;
1387
1388 /*
1389 * Split off the front portion -- note that we must insert the new
1390 * entry BEFORE this one, so that this entry has the specified
1391 * starting address.
1392 */
1393
a108bf71 1394 vm_map_simplify_entry(map, entry, countp);
984263bc
MD
1395
1396 /*
1397 * If there is no object backing this entry, we might as well create
1398 * one now. If we defer it, an object can get created after the map
1399 * is clipped, and individual objects will be created for the split-up
1400 * map. This is a bit of a hack, but is also about the best place to
1401 * put this improvement.
1402 */
984263bc 1403 if (entry->object.vm_object == NULL && !map->system_map) {
53025830 1404 vm_map_entry_allocate_object(entry);
984263bc
MD
1405 }
1406
a108bf71 1407 new_entry = vm_map_entry_create(map, countp);
984263bc
MD
1408 *new_entry = *entry;
1409
1410 new_entry->end = start;
1411 entry->offset += (start - entry->start);
1412 entry->start = start;
1413
1414 vm_map_entry_link(map, entry->prev, new_entry);
1415
1b874851
MD
1416 switch(entry->maptype) {
1417 case VM_MAPTYPE_NORMAL:
1418 case VM_MAPTYPE_VPAGETABLE:
b12defdc
MD
1419 if (new_entry->object.vm_object) {
1420 vm_object_hold(new_entry->object.vm_object);
1421 vm_object_chain_wait(new_entry->object.vm_object);
1422 vm_object_reference_locked(new_entry->object.vm_object);
1423 vm_object_drop(new_entry->object.vm_object);
1424 }
1b874851
MD
1425 break;
1426 default:
1427 break;
984263bc
MD
1428 }
1429}
1430
1431/*
46754a20
MD
1432 * Asserts that the given entry ends at or before the specified address.
1433 * If necessary, it splits the entry into two.
984263bc 1434 *
46754a20 1435 * The map must be exclusively locked.
984263bc 1436 */
46754a20
MD
1437#define vm_map_clip_end(map, entry, endaddr, countp) \
1438{ \
1439 if (endaddr < entry->end) \
1440 _vm_map_clip_end(map, entry, endaddr, countp); \
984263bc
MD
1441}
1442
1443/*
46754a20
MD
1444 * This routine is called only when it is known that the entry must be split.
1445 *
1446 * The map must be exclusively locked.
984263bc
MD
1447 */
1448static void
46754a20
MD
1449_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1450 int *countp)
984263bc
MD
1451{
1452 vm_map_entry_t new_entry;
1453
1454 /*
1455 * If there is no object backing this entry, we might as well create
1456 * one now. If we defer it, an object can get created after the map
1457 * is clipped, and individual objects will be created for the split-up
1458 * map. This is a bit of a hack, but is also about the best place to
1459 * put this improvement.
1460 */
1461
1462 if (entry->object.vm_object == NULL && !map->system_map) {
53025830 1463 vm_map_entry_allocate_object(entry);
984263bc
MD
1464 }
1465
1466 /*
1467 * Create a new entry and insert it AFTER the specified entry
1468 */
1469
a108bf71 1470 new_entry = vm_map_entry_create(map, countp);
984263bc
MD
1471 *new_entry = *entry;
1472
1473 new_entry->start = entry->end = end;
1474 new_entry->offset += (end - entry->start);
1475
1476 vm_map_entry_link(map, entry, new_entry);
1477
1b874851
MD
1478 switch(entry->maptype) {
1479 case VM_MAPTYPE_NORMAL:
1480 case VM_MAPTYPE_VPAGETABLE:
b12defdc
MD
1481 if (new_entry->object.vm_object) {
1482 vm_object_hold(new_entry->object.vm_object);
1483 vm_object_chain_wait(new_entry->object.vm_object);
1484 vm_object_reference_locked(new_entry->object.vm_object);
1485 vm_object_drop(new_entry->object.vm_object);
1486 }
1b874851
MD
1487 break;
1488 default:
1489 break;
984263bc
MD
1490 }
1491}
1492
1493/*
46754a20
MD
1494 * Asserts that the starting and ending region addresses fall within the
1495 * valid range for the map.
984263bc 1496 */
46754a20
MD
1497#define VM_MAP_RANGE_CHECK(map, start, end) \
1498{ \
1499 if (start < vm_map_min(map)) \
1500 start = vm_map_min(map); \
1501 if (end > vm_map_max(map)) \
1502 end = vm_map_max(map); \
1503 if (start > end) \
1504 start = end; \
1505}
984263bc
MD
1506
1507/*
46754a20
MD
 1508 * Used to block when an in-transition collision occurs. The map
1509 * is unlocked for the sleep and relocked before the return.
984263bc 1510 */
984263bc
MD
1511void
1512vm_map_transition_wait(vm_map_t map)
1513{
ff13bc52 1514 tsleep_interlock(map, 0);
984263bc 1515 vm_map_unlock(map);
ff13bc52 1516 tsleep(map, PINTERLOCKED, "vment", 0);
984263bc
MD
1517 vm_map_lock(map);
1518}
1519
1520/*
46754a20
MD
1521 * When we do blocking operations with the map lock held it is
 1522 * possible that a clip might have occurred on our in-transit entry,
1523 * requiring an adjustment to the entry in our loop. These macros
1524 * help the pageable and clip_range code deal with the case. The
 1525 * conditional costs virtually nothing if no clipping has occurred.
984263bc
MD
1526 */
1527
1528#define CLIP_CHECK_BACK(entry, save_start) \
1529 do { \
1530 while (entry->start != save_start) { \
1531 entry = entry->prev; \
1532 KASSERT(entry != &map->header, ("bad entry clip")); \
1533 } \
1534 } while(0)
1535
1536#define CLIP_CHECK_FWD(entry, save_end) \
1537 do { \
1538 while (entry->end != save_end) { \
1539 entry = entry->next; \
1540 KASSERT(entry != &map->header, ("bad entry clip")); \
1541 } \
1542 } while(0)
1543
1544
1545/*
46754a20
MD
1546 * Clip the specified range and return the base entry. The
1547 * range may cover several entries starting at the returned base
1548 * and the first and last entry in the covering sequence will be
1549 * properly clipped to the requested start and end address.
1550 *
1551 * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1552 * flag.
1553 *
1554 * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1555 * covered by the requested range.
1556 *
1557 * The map must be exclusively locked on entry and will remain locked
1558 * on return. If no range exists or the range contains holes and you
1559 * specified that no holes were allowed, NULL will be returned. This
 1560 * routine may temporarily unlock the map in order to avoid a deadlock when
1561 * sleeping.
984263bc
MD
1562 */
1563static
1564vm_map_entry_t
a108bf71 1565vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
46754a20 1566 int *countp, int flags)
984263bc
MD
1567{
1568 vm_map_entry_t start_entry;
1569 vm_map_entry_t entry;
1570
1571 /*
1572 * Locate the entry and effect initial clipping. The in-transition
1573 * case does not occur very often so do not try to optimize it.
1574 */
1575again:
1576 if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1577 return (NULL);
1578 entry = start_entry;
1579 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1580 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
12e4aaff
MD
1581 ++mycpu->gd_cnt.v_intrans_coll;
1582 ++mycpu->gd_cnt.v_intrans_wait;
984263bc
MD
1583 vm_map_transition_wait(map);
1584 /*
1585 * entry and/or start_entry may have been clipped while
1586 * we slept, or may have gone away entirely. We have
1587 * to restart from the lookup.
1588 */
1589 goto again;
1590 }
46754a20 1591
984263bc
MD
1592 /*
1593 * Since we hold an exclusive map lock we do not have to restart
1594 * after clipping, even though clipping may block in zalloc.
1595 */
a108bf71
MD
1596 vm_map_clip_start(map, entry, start, countp);
1597 vm_map_clip_end(map, entry, end, countp);
984263bc
MD
1598 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1599
1600 /*
1601 * Scan entries covered by the range. When working on the next
1602 * entry a restart need only re-loop on the current entry which
1603 * we have already locked, since 'next' may have changed. Also,
1604 * even though entry is safe, it may have been clipped so we
1605 * have to iterate forwards through the clip after sleeping.
1606 */
1607 while (entry->next != &map->header && entry->next->start < end) {
1608 vm_map_entry_t next = entry->next;
1609
1610 if (flags & MAP_CLIP_NO_HOLES) {
1611 if (next->start > entry->end) {
1612 vm_map_unclip_range(map, start_entry,
a108bf71 1613 start, entry->end, countp, flags);
984263bc
MD
1614 return(NULL);
1615 }
1616 }
1617
1618 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1619 vm_offset_t save_end = entry->end;
1620 next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
12e4aaff
MD
1621 ++mycpu->gd_cnt.v_intrans_coll;
1622 ++mycpu->gd_cnt.v_intrans_wait;
984263bc
MD
1623 vm_map_transition_wait(map);
1624
1625 /*
 1626 * clips might have occurred while we blocked.
1627 */
1628 CLIP_CHECK_FWD(entry, save_end);
1629 CLIP_CHECK_BACK(start_entry, start);
1630 continue;
1631 }
1632 /*
1633 * No restart necessary even though clip_end may block, we
1634 * are holding the map lock.
1635 */
a108bf71 1636 vm_map_clip_end(map, next, end, countp);
984263bc
MD
1637 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1638 entry = next;
1639 }
1640 if (flags & MAP_CLIP_NO_HOLES) {
1641 if (entry->end != end) {
1642 vm_map_unclip_range(map, start_entry,
a108bf71 1643 start, entry->end, countp, flags);
984263bc
MD
1644 return(NULL);
1645 }
1646 }
1647 return(start_entry);
1648}
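
/*
 * Usage sketch (illustrative; mirrors how the wiring code later in
 * this file drives the pair): clip the range, operate on each
 * in-transition entry, then undo the clip markers.  The map stays
 * exclusively locked across the whole sequence.
 *
 *	start_entry = vm_map_clip_range(map, start, end, &count,
 *					MAP_CLIP_NO_HOLES);
 *	if (start_entry == NULL)
 *		(range missing or contains holes; fail the operation)
 *	for (entry = start_entry;
 *	     entry != &map->header && entry->start < end;
 *	     entry = entry->next) {
 *		(operate on the clipped, in-transition entry)
 *	}
 *	vm_map_unclip_range(map, start_entry, start, end, &count,
 *			    MAP_CLIP_NO_HOLES);
 */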
1649
1650/*
46754a20
MD
1651 * Undo the effect of vm_map_clip_range(). You should pass the same
1652 * flags and the same range that you passed to vm_map_clip_range().
1653 * This code will clear the in-transition flag on the entries and
1654 * wake up anyone waiting. This code will also simplify the sequence
1655 * and attempt to merge it with entries before and after the sequence.
1656 *
1657 * The map must be locked on entry and will remain locked on return.
1658 *
1659 * Note that you should also pass the start_entry returned by
1660 * vm_map_clip_range(). However, if you block between the two calls
1661 * with the map unlocked please be aware that the start_entry may
1662 * have been clipped and you may need to scan it backwards to find
1663 * the entry corresponding with the original start address. You are
1664 * responsible for this, vm_map_unclip_range() expects the correct
1665 * start_entry to be passed to it and will KASSERT otherwise.
984263bc
MD
1666 */
1667static
1668void
46754a20
MD
1669vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
1670 vm_offset_t start, vm_offset_t end,
1671 int *countp, int flags)
984263bc
MD
1672{
1673 vm_map_entry_t entry;
1674
1675 entry = start_entry;
1676
1677 KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1678 while (entry != &map->header && entry->start < end) {
46754a20
MD
1679 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1680 ("in-transition flag not set during unclip on: %p",
1681 entry));
1682 KASSERT(entry->end <= end,
1683 ("unclip_range: tail wasn't clipped"));
984263bc
MD
1684 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1685 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1686 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1687 wakeup(map);
1688 }
1689 entry = entry->next;
1690 }
1691
1692 /*
1693 * Simplification does not block so there is no restart case.
1694 */
1695 entry = start_entry;
1696 while (entry != &map->header && entry->start < end) {
a108bf71 1697 vm_map_simplify_entry(map, entry, countp);
984263bc
MD
1698 entry = entry->next;
1699 }
1700}
1701
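/*
 * Example (illustrative sketch only, kept under #if 0): the typical
 * caller pattern for vm_map_clip_range()/vm_map_unclip_range(), as
 * used by the wiring code later in this file.  The function name and
 * the per-entry work are placeholders; the map must be exclusively
 * locked and a map-entry reservation held across the pair.
 */
#if 0
static int
example_operate_on_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	vm_map_entry_t start_entry;
	vm_map_entry_t entry;
	int count;

	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	start_entry = vm_map_clip_range(map, start, end, &count,
					MAP_CLIP_NO_HOLES);
	if (start_entry == NULL) {
		vm_map_unlock(map);
		vm_map_entry_release(count);
		return (KERN_INVALID_ADDRESS);
	}

	/*
	 * All entries covering [start, end) are now clipped to the range
	 * and marked MAP_ENTRY_IN_TRANSITION; operate on them here.
	 */
	for (entry = start_entry;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		/* per-entry work */
	}

	vm_map_unclip_range(map, start_entry, start, end, &count,
			    MAP_CLIP_NO_HOLES);
	vm_map_unlock(map);
	vm_map_entry_release(count);
	return (KERN_SUCCESS);
}
#endif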
1702/*
46754a20 1703 * Mark the given range as handled by a subordinate map.
984263bc 1704 *
46754a20
MD
1705 * This range must have been created with vm_map_find(), and no other
1706 * operations may have been performed on this range prior to calling
1707 * vm_map_submap().
984263bc 1708 *
46754a20 1709 * Submappings cannot be removed.
984263bc 1710 *
46754a20 1711 * No requirements.
984263bc
MD
1712 */
1713int
a108bf71 1714vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
984263bc
MD
1715{
1716 vm_map_entry_t entry;
1717 int result = KERN_INVALID_ARGUMENT;
a108bf71 1718 int count;
984263bc 1719
a108bf71 1720 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
984263bc
MD
1721 vm_map_lock(map);
1722
1723 VM_MAP_RANGE_CHECK(map, start, end);
1724
1725 if (vm_map_lookup_entry(map, start, &entry)) {
a108bf71 1726 vm_map_clip_start(map, entry, start, &count);
984263bc
MD
1727 } else {
1728 entry = entry->next;
1729 }
1730
a108bf71 1731 vm_map_clip_end(map, entry, end, &count);
984263bc
MD
1732
1733 if ((entry->start == start) && (entry->end == end) &&
1734 ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1735 (entry->object.vm_object == NULL)) {
1736 entry->object.sub_map = submap;
1b874851 1737 entry->maptype = VM_MAPTYPE_SUBMAP;
984263bc
MD
1738 result = KERN_SUCCESS;
1739 }
1740 vm_map_unlock(map);
a108bf71 1741 vm_map_entry_release(count);
984263bc
MD
1742
1743 return (result);
1744}
1745
1746/*
1b874851
MD
1747 * Sets the protection of the specified address region in the target map.
1748 * If "set_max" is specified, the maximum protection is to be set;
1749 * otherwise, only the current protection is affected.
1750 *
1751 * The protection is not applicable to submaps, but is applicable to normal
1752 * maps and maps governed by virtual page tables. For example, when operating
1753 * on a virtual page table our protection basically controls how COW occurs
1754 * on a virtual page table our protection basically controls how COW occurs
1755 * on the backing object, whereas the virtual page table itself is an
1756 * abstraction provided to userland.
46754a20
MD
1756 *
1757 * No requirements.
984263bc
MD
1758 */
1759int
1760vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1761 vm_prot_t new_prot, boolean_t set_max)
1762{
1763 vm_map_entry_t current;
1764 vm_map_entry_t entry;
a108bf71 1765 int count;
984263bc 1766
a108bf71 1767 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
984263bc
MD
1768 vm_map_lock(map);
1769
1770 VM_MAP_RANGE_CHECK(map, start, end);
1771
1772 if (vm_map_lookup_entry(map, start, &entry)) {
a108bf71 1773 vm_map_clip_start(map, entry, start, &count);
984263bc
MD
1774 } else {
1775 entry = entry->next;
1776 }
1777
1778 /*
1779 * Make a first pass to check for protection violations.
1780 */
984263bc
MD
1781 current = entry;
1782 while ((current != &map->header) && (current->start < end)) {
1b874851 1783 if (current->maptype == VM_MAPTYPE_SUBMAP) {
984263bc 1784 vm_map_unlock(map);
a108bf71 1785 vm_map_entry_release(count);
984263bc
MD
1786 return (KERN_INVALID_ARGUMENT);
1787 }
1788 if ((new_prot & current->max_protection) != new_prot) {
1789 vm_map_unlock(map);
a108bf71 1790 vm_map_entry_release(count);
984263bc
MD
1791 return (KERN_PROTECTION_FAILURE);
1792 }
1793 current = current->next;
1794 }
1795
1796 /*
1797 * Go back and fix up protections. [Note that clipping is not
1798 * necessary the second time.]
1799 */
984263bc
MD
1800 current = entry;
1801
1802 while ((current != &map->header) && (current->start < end)) {
1803 vm_prot_t old_prot;
1804
a108bf71 1805 vm_map_clip_end(map, current, end, &count);
984263bc
MD
1806
1807 old_prot = current->protection;
1b874851 1808 if (set_max) {
984263bc
MD
1809 current->protection =
1810 (current->max_protection = new_prot) &
1811 old_prot;
1b874851 1812 } else {
984263bc 1813 current->protection = new_prot;
1b874851 1814 }
984263bc
MD
1815
1816 /*
1817 * Update physical map if necessary. Worry about copy-on-write
1818 * here -- CHECK THIS XXX
1819 */
1820
1821 if (current->protection != old_prot) {
1822#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1823 VM_PROT_ALL)
1824
1825 pmap_protect(map->pmap, current->start,
1826 current->end,
1827 current->protection & MASK(current));
1828#undef MASK
1829 }
1830
a108bf71 1831 vm_map_simplify_entry(map, current, &count);
984263bc
MD
1832
1833 current = current->next;
1834 }
1835
1836 vm_map_unlock(map);
a108bf71 1837 vm_map_entry_release(count);
984263bc
MD
1838 return (KERN_SUCCESS);
1839}
1840
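/*
 * Example (illustrative sketch only, kept under #if 0): how a caller
 * might use vm_map_protect().  With set_max FALSE only the current
 * protection changes; with set_max TRUE the maximum protection is
 * replaced and the current protection is ANDed against it.  The map
 * and range arguments are assumed to come from the caller.
 */
#if 0
static void
example_make_readonly(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	/* Restrict the current protection of [start, end) to read-only. */
	if (vm_map_protect(map, start, end, VM_PROT_READ, FALSE) !=
	    KERN_SUCCESS) {
		kprintf("vm_map_protect: could not make range read-only\n");
	}

	/* Lower the maximum protection as well. */
	vm_map_protect(map, start, end, VM_PROT_READ, TRUE);
}
#endif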
1841/*
46754a20
MD
1842 * This routine traverses a process's map handling the madvise
1843 * system call. Advisories are classified as either those affecting
1844 * the vm_map_entry structure, or those affecting the underlying
1845 * objects.
984263bc 1846 *
46754a20 1847 * The <value> argument is used for extended madvise calls.
afeabdca 1848 *
46754a20 1849 * No requirements.
984263bc 1850 */
984263bc 1851int
afeabdca
MD
1852vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
1853 int behav, off_t value)
984263bc
MD
1854{
1855 vm_map_entry_t current, entry;
1856 int modify_map = 0;
afeabdca 1857 int error = 0;
a108bf71 1858 int count;
984263bc
MD
1859
1860 /*
1861 * Some madvise calls directly modify the vm_map_entry, in which case
1862 * we need to use an exclusive lock on the map and we need to perform
1863 * various clipping operations. Otherwise we only need a read-lock
1864 * on the map.
1865 */
1866
a108bf71
MD
1867 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1868
984263bc
MD
1869 switch(behav) {
1870 case MADV_NORMAL:
1871 case MADV_SEQUENTIAL:
1872 case MADV_RANDOM:
1873 case MADV_NOSYNC:
1874 case MADV_AUTOSYNC:
1875 case MADV_NOCORE:
1876 case MADV_CORE:
afeabdca
MD
1877 case MADV_SETMAP:
1878 case MADV_INVAL:
984263bc
MD
1879 modify_map = 1;
1880 vm_map_lock(map);
1881 break;
1882 case MADV_WILLNEED:
1883 case MADV_DONTNEED:
1884 case MADV_FREE:
1885 vm_map_lock_read(map);
1886 break;
1887 default:
a108bf71 1888 vm_map_entry_release(count);
afeabdca 1889 return (EINVAL);
984263bc
MD
1890 }
1891
1892 /*
1893 * Locate starting entry and clip if necessary.
1894 */
1895
1896 VM_MAP_RANGE_CHECK(map, start, end);
1897
1898 if (vm_map_lookup_entry(map, start, &entry)) {
1899 if (modify_map)
a108bf71 1900 vm_map_clip_start(map, entry, start, &count);
984263bc
MD
1901 } else {
1902 entry = entry->next;
1903 }
1904
1905 if (modify_map) {
1906 /*
1907 * madvise behaviors that are implemented in the vm_map_entry.
1908 *
1909 * We clip the vm_map_entry so that behavioral changes are
1910 * limited to the specified address range.
1911 */
1912 for (current = entry;
1913 (current != &map->header) && (current->start < end);
1914 current = current->next
1915 ) {
1b874851 1916 if (current->maptype == VM_MAPTYPE_SUBMAP)
984263bc
MD
1917 continue;
1918
a108bf71 1919 vm_map_clip_end(map, current, end, &count);
984263bc
MD
1920
1921 switch (behav) {
1922 case MADV_NORMAL:
1923 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
1924 break;
1925 case MADV_SEQUENTIAL:
1926 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
1927 break;
1928 case MADV_RANDOM:
1929 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
1930 break;
1931 case MADV_NOSYNC:
1932 current->eflags |= MAP_ENTRY_NOSYNC;
1933 break;
1934 case MADV_AUTOSYNC:
1935 current->eflags &= ~MAP_ENTRY_NOSYNC;
1936 break;
1937 case MADV_NOCORE:
1938 current->eflags |= MAP_ENTRY_NOCOREDUMP;
1939 break;
1940 case MADV_CORE:
1941 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
1942 break;
afeabdca
MD
1943 case MADV_INVAL:
1944 /*
1945 * Invalidate the related pmap entries, used
1946 * to flush portions of the real kernel's
1947 * pmap when the caller has removed or
1948 * modified existing mappings in a virtual
1949 * page table.
1950 */
1951 pmap_remove(map->pmap,
1952 current->start, current->end);
1953 break;
1954 case MADV_SETMAP:
1955 /*
1956 * Set the page directory page for a map
1957 * governed by a virtual page table. Mark
1958 * the entry as being governed by a virtual
1959 * page table if it is not.
1960 *
1961 * XXX the page directory page is stored
1962 * in the avail_ssize field of the map_entry.
1963 *
1964 * XXX the map simplification code does not
1965 * compare this field so weird things may
1966 * happen if you do not apply this function
1967 * to the entire mapping governed by the
1968 * virtual page table.
1969 */
1970 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
1971 error = EINVAL;
1972 break;
1973 }
1974 current->aux.master_pde = value;
1975 pmap_remove(map->pmap,
1976 current->start, current->end);
1977 break;
984263bc 1978 default:
afeabdca 1979 error = EINVAL;
984263bc
MD
1980 break;
1981 }
a108bf71 1982 vm_map_simplify_entry(map, current, &count);
984263bc
MD
1983 }
1984 vm_map_unlock(map);
1985 } else {
1986 vm_pindex_t pindex;
1987 int count;
1988
1989 /*
1990 * madvise behaviors that are implemented in the underlying
1991 * vm_object.
1992 *
1993 * Since we don't clip the vm_map_entry, we have to clip
1994 * the vm_object pindex and count.
1b874851
MD
1995 *
1996 * NOTE! We currently do not support these functions on
1997 * virtual page tables.
984263bc
MD
1998 */
1999 for (current = entry;
2000 (current != &map->header) && (current->start < end);
2001 current = current->next
2002 ) {
2003 vm_offset_t useStart;
2004
1b874851 2005 if (current->maptype != VM_MAPTYPE_NORMAL)
984263bc
MD
2006 continue;
2007
2008 pindex = OFF_TO_IDX(current->offset);
2009 count = atop(current->end - current->start);
2010 useStart = current->start;
2011
2012 if (current->start < start) {
2013 pindex += atop(start - current->start);
2014 count -= atop(start - current->start);
2015 useStart = start;
2016 }
2017 if (current->end > end)
2018 count -= atop(current->end - end);
2019
2020 if (count <= 0)
2021 continue;
2022
2023 vm_object_madvise(current->object.vm_object,
2024 pindex, count, behav);
afeabdca
MD
2025
2026 /*
2027 * Try to populate the page table. Mappings governed
2028 * by virtual page tables cannot be pre-populated
2029 * without a lot of work so don't try.
2030 */
2031 if (behav == MADV_WILLNEED &&
2032 current->maptype != VM_MAPTYPE_VPAGETABLE) {
984263bc
MD
2033 pmap_object_init_pt(
2034 map->pmap,
2035 useStart,
083a7402 2036 current->protection,
984263bc
MD
2037 current->object.vm_object,
2038 pindex,
2039 (count << PAGE_SHIFT),
2040 MAP_PREFAULT_MADVISE
2041 );
2042 }
2043 }
2044 vm_map_unlock_read(map);
2045 }
a108bf71 2046 vm_map_entry_release(count);
afeabdca 2047 return(error);
984263bc
MD
2048}
2049
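/*
 * Example (illustrative sketch only, kept under #if 0): the two classes
 * of madvise behaviors described above.  MADV_SEQUENTIAL only modifies
 * the vm_map_entry (exclusive-lock path) while MADV_WILLNEED operates
 * on the underlying object (read-lock path).  The value argument is
 * only meaningful for extended behaviors such as MADV_SETMAP.
 */
#if 0
static void
example_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	/* Hint sequential access; modifies the map entries themselves. */
	vm_map_madvise(map, start, end, MADV_SEQUENTIAL, 0);

	/* Ask for the backing pages to be pre-faulted where possible. */
	vm_map_madvise(map, start, end, MADV_WILLNEED, 0);
}
#endif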
2050
2051/*
46754a20
MD
2052 * Sets the inheritance of the specified address range in the target map.
2053 * Inheritance affects how the map will be shared with child maps at the
2054 * time of vm_map_fork.
984263bc
MD
2055 */
2056int
2057vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2058 vm_inherit_t new_inheritance)
2059{
2060 vm_map_entry_t entry;
2061 vm_map_entry_t temp_entry;
a108bf71 2062 int count;
984263bc
MD
2063
2064 switch (new_inheritance) {
2065 case VM_INHERIT_NONE:
2066 case VM_INHERIT_COPY:
2067 case VM_INHERIT_SHARE:
2068 break;
2069 default:
2070 return (KERN_INVALID_ARGUMENT);
2071 }
2072
a108bf71 2073 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
984263bc
MD
2074 vm_map_lock(map);
2075
2076 VM_MAP_RANGE_CHECK(map, start, end);
2077
2078 if (vm_map_lookup_entry(map, start, &temp_entry)) {
2079 entry = temp_entry;
a108bf71 2080 vm_map_clip_start(map, entry, start, &count);
984263bc
MD
2081 } else
2082 entry = temp_entry->next;
2083
2084 while ((entry != &map->header) && (entry->start < end)) {
a108bf71 2085 vm_map_clip_end(map, entry, end, &count);
984263bc
MD
2086
2087 entry->inheritance = new_inheritance;
2088
a108bf71 2089 vm_map_simplify_entry(map, entry, &count);
984263bc
MD
2090
2091 entry = entry->next;
2092 }
984263bc 2093 vm_map_unlock(map);
a108bf71 2094 vm_map_entry_release(count);
984263bc
MD
2095 return (KERN_SUCCESS);
2096}
2097
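/*
 * Example (illustrative sketch only, kept under #if 0): marking a
 * region shared across fork.  After this call vmspace_fork() clones
 * the entries in the range and shares their backing object instead of
 * marking them copy-on-write.
 */
#if 0
static void
example_share_across_fork(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	if (vm_map_inherit(map, start, end, VM_INHERIT_SHARE) != KERN_SUCCESS)
		kprintf("vm_map_inherit failed\n");
}
#endif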
2098/*
2099 * Implement the semantics of mlock
2100 */
2101int
57e43348 2102vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
46754a20 2103 boolean_t new_pageable)
984263bc
MD
2104{
2105 vm_map_entry_t entry;
2106 vm_map_entry_t start_entry;
2107 vm_offset_t end;
2108 int rv = KERN_SUCCESS;
a108bf71 2109 int count;
984263bc 2110
a108bf71 2111 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
984263bc
MD
2112 vm_map_lock(map);
2113 VM_MAP_RANGE_CHECK(map, start, real_end);
2114 end = real_end;
2115
46754a20
MD
2116 start_entry = vm_map_clip_range(map, start, end, &count,
2117 MAP_CLIP_NO_HOLES);
984263bc
MD
2118 if (start_entry == NULL) {
2119 vm_map_unlock(map);
a108bf71 2120 vm_map_entry_release(count);
984263bc
MD
2121 return (KERN_INVALID_ADDRESS);
2122 }
2123
2124 if (new_pageable == 0) {
2125 entry = start_entry;
2126 while ((entry != &map->header) && (entry->start < end)) {
2127 vm_offset_t save_start;
2128 vm_offset_t save_end;
2129
2130 /*
2131 * Already user wired or hard wired (trivial cases)
2132 */
2133 if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2134 entry = entry->next;
2135 continue;
2136 }
2137 if (entry->wired_count != 0) {
2138 entry->wired_count++;
2139 entry->eflags |= MAP_ENTRY_USER_WIRED;
2140 entry = entry->next;
2141 continue;
2142 }
2143
2144 /*
2145 * A new wiring requires instantiation of appropriate
2146 * management structures and the faulting in of the
2147 * page.
2148 */
1b874851 2149 if (entry->maptype != VM_MAPTYPE_SUBMAP) {
46754a20
MD
2150 int copyflag = entry->eflags &
2151 MAP_ENTRY_NEEDS_COPY;
2152 if (copyflag && ((entry->protection &
2153 VM_PROT_WRITE) != 0)) {
b12defdc 2154 vm_map_entry_shadow(entry, 0);
984263bc
MD
2155 } else if (entry->object.vm_object == NULL &&
2156 !map->system_map) {
53025830 2157 vm_map_entry_allocate_object(entry);
984263bc
MD
2158 }
2159 }
2160 entry->wired_count++;
2161 entry->eflags |= MAP_ENTRY_USER_WIRED;
2162
2163 /*
f2d22ebf
MD
2164 * Now fault in the area. Note that vm_fault_wire()
2165 * may release the map lock temporarily; it will be
2166 * relocked on return. The in-transition
984263bc
MD
2167 * flag protects the entries.
2168 */
2169 save_start = entry->start;
2170 save_end = entry->end;
f2d22ebf 2171 rv = vm_fault_wire(map, entry, TRUE);
984263bc
MD
2172 if (rv) {
2173 CLIP_CHECK_BACK(entry, save_start);
2174 for (;;) {
2175 KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2176 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2177 entry->wired_count = 0;
2178 if (entry->end == save_end)
2179 break;
2180 entry = entry->next;
2181 KASSERT(entry != &map->header, ("bad entry clip during backout"));
2182 }
2183 end = save_start; /* unwire the rest */
2184 break;
2185 }
2186 /*
2187 * note that even though the entry might have been
2188 * clipped, the USER_WIRED flag we set prevents
2189 * duplication so we do not have to do a
2190 * clip check.
2191 */
2192 entry = entry->next;
2193 }
2194
2195 /*
2196 * If we failed fall through to the unwiring section to
2197 * unwire what we had wired so far. 'end' has already
2198 * been adjusted.
2199 */
2200 if (rv)
2201 new_pageable = 1;
2202
2203 /*
2204 * start_entry might have been clipped if we unlocked the
2205 * map and blocked. No matter how clipped it has gotten
2206 * there should be a fragment that is on our start boundary.
2207 */
2208 CLIP_CHECK_BACK(start_entry, start);
2209 }
2210
2211 /*
2212 * Deal with the unwiring case.
2213 */
2214 if (new_pageable) {
2215 /*
2216 * This is the unwiring case. We must first ensure that the
2217 * range to be unwired is really wired down. We know there
2218 * are no holes.
2219 */
2220 entry = start_entry;
2221 while ((entry != &map->header) && (entry->start < end)) {
2222 if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2223 rv = KERN_INVALID_ARGUMENT;
2224 goto done;
2225 }
2226 KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
2227 entry = entry->next;
2228 }
2229
2230 /*
2231 * Now decrement the wiring count for each region. If a region
2232 * becomes completely unwired, unwire its physical pages and
2233 * mappings.
2234 */
b4eddbac
DR
2235 /*
2236 * The map entries are processed in a loop, checking to
2237 * make sure the entry is wired and asserting it has a wired
2238 * count. However, another loop was inserted more-or-less in
2239 * the middle of the unwiring path. This loop picks up the
2240 * "entry" loop variable from the first loop without first
2241 * setting it to start_entry. Naturally, the second loop
2242 * is never entered and the pages backing the entries are
2243 * never unwired. This can lead to a leak of wired pages.
2244 */
2245 entry = start_entry;
984263bc 2246 while ((entry != &map->header) && (entry->start < end)) {
f2d22ebf
MD
2247 KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2248 ("expected USER_WIRED on entry %p", entry));
984263bc
MD
2249 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2250 entry->wired_count--;
2251 if (entry->wired_count == 0)
f2d22ebf 2252 vm_fault_unwire(map, entry);
984263bc
MD
2253 entry = entry->next;
2254 }
2255 }
2256done:
a108bf71 2257 vm_map_unclip_range(map, start_entry, start, real_end, &count,
984263bc
MD
2258 MAP_CLIP_NO_HOLES);
2259 map->timestamp++;
2260 vm_map_unlock(map);
a108bf71 2261 vm_map_entry_release(count);
984263bc
MD
2262 return (rv);
2263}
2264
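/*
 * Example (illustrative sketch only, kept under #if 0): user-style
 * wiring and unwiring of a range, mirroring what mlock(2)/munlock(2)
 * ultimately request.  new_pageable == FALSE wires the range (faulting
 * the pages in), TRUE releases the user wiring again.
 */
#if 0
static int
example_user_wire(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int rv;

	/* Wire (mlock-style): fault pages in and mark them user-wired. */
	rv = vm_map_unwire(map, start, end, FALSE);
	if (rv != KERN_SUCCESS)
		return (rv);

	/* Later, release the user wiring (munlock-style). */
	return (vm_map_unwire(map, start, end, TRUE));
}
#endif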
2265/*
46754a20
MD
2266 * Sets the pageability of the specified address range in the target map.
2267 * Regions specified as not pageable require locked-down physical
2268 * memory and physical page maps.
984263bc 2269 *
46754a20
MD
2270 * The map must not be locked, but a reference must remain to the map
2271 * throughout the call.
984263bc 2272 *
46754a20
MD
2273 * This function may be called via the zalloc path and must properly
2274 * reserve map entries for kernel_map.
a108bf71 2275 *
46754a20 2276 * No requirements.
984263bc
MD
2277 */
2278int
e1359933 2279vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
984263bc
MD
2280{
2281 vm_map_entry_t entry;
2282 vm_map_entry_t start_entry;
2283 vm_offset_t end;
2284 int rv = KERN_SUCCESS;
a108bf71 2285 int count;
984263bc 2286
e1359933 2287 if (kmflags & KM_KRESERVE)
a108bf71 2288 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
a108bf71
MD
2289 else
2290 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
984263bc
MD
2291 vm_map_lock(map);
2292 VM_MAP_RANGE_CHECK(map, start, real_end);
2293 end = real_end;
2294
46754a20
MD
2295 start_entry = vm_map_clip_range(map, start, end, &count,
2296 MAP_CLIP_NO_HOLES);
984263bc
MD
2297 if (start_entry == NULL) {
2298 vm_map_unlock(map);
a108bf71
MD
2299 rv = KERN_INVALID_ADDRESS;
2300 goto failure;
984263bc 2301 }
e1359933 2302 if ((kmflags & KM_PAGEABLE) == 0) {
984263bc
MD
2303 /*
2304 * Wiring.
2305 *
2306 * 1. Holding the write lock, we create any shadow or zero-fill
2307 * objects that need to be created. Then we clip each map
2308 * entry to the region to be wired and increment its wiring
2309 * count. We create objects before clipping the map entries
2310 * to avoid object proliferation.
2311 *
2312 * 2. We downgrade to a read lock, and call vm_fault_wire to
2313 * fault in the pages for any newly wired area (wired_count is
2314 * 1).
2315 *
2316 * Downgrading to a read lock for vm_fault_wire avoids a
2317 * possible deadlock with another process that may have faulted
2318 * on one of the pages to be wired (it would mark the page busy,
2319 * blocking us, then in turn block on the map lock that we
2320 * hold). Because of problems in the recursive lock package,
2321 * we cannot upgrade to a write lock in vm_map_lookup. Thus,
2322 * any actions that require the write lock must be done
2323 * beforehand. Because we keep the read lock on the map, the
2324 * copy-on-write status of the entries we modify here cannot
2325 * change.
2326 */
984263bc
MD
2327 entry = start_entry;
2328 while ((entry != &map->header) && (entry->start < end)) {
2329 /*
2330 * Trivial case if the entry is already wired
2331 */
2332 if (entry->wired_count) {
2333 entry->wired_count++;
2334 entry = entry->next;
2335 continue;
2336 }
2337
2338 /*
2339 * The entry is being newly wired, we have to setup
2340 * appropriate management structures. A shadow
2341 * object is required for a copy-on-write region,
2342 * or a normal object for a zero-fill region. We
2343 * do not have to do this for entries that point to sub
2344 * maps because we won't hold the lock on the sub map.
2345 */
1b874851 2346 if (entry->maptype != VM_MAPTYPE_SUBMAP) {
46754a20
MD
2347 int copyflag = entry->eflags &
2348 MAP_ENTRY_NEEDS_COPY;
2349 if (copyflag && ((entry->protection &
2350 VM_PROT_WRITE) != 0)) {
b12defdc 2351 vm_map_entry_shadow(entry, 0);
984263bc
MD
2352 } else if (entry->object.vm_object == NULL &&
2353 !map->system_map) {
53025830 2354 vm_map_entry_allocate_object(entry);
984263bc
MD
2355 }
2356 }
2357
2358 entry->wired_count++;
2359 entry = entry->next;
2360 }
2361
2362 /*
2363 * Pass 2.
2364 */
2365
2366 /*
2367 * HACK HACK HACK HACK
2368 *
46754a20
MD
2369 * vm_fault_wire() temporarily unlocks the map to avoid
2370 * deadlocks. The in-transition flag from the vm_map_clip_range()
2371 * call should protect us from changes while the map is
2372 * unlocked.
2373 *
2374 * NOTE: Previously this comment stated that clipping might
2375 * still occur while the entry is unlocked, but from
2376 * what I can tell it actually cannot.
2377 *
2378 * It is unclear whether the CLIP_CHECK_*() calls
2379 * are still needed but we keep them in anyway.
984263bc
MD
2380 *
2381 * HACK HACK HACK HACK
2382 */
2383
984263bc
MD
2384 entry = start_entry;
2385 while (entry != &map->header && entry->start < end) {
2386 /*
2387 * If vm_fault_wire fails for any page we need to undo
2388 * what has been done. We decrement the wiring count
2389 * for those pages which have not yet been wired (now)
2390 * and unwire those that have (later).
2391 */
2392 vm_offset_t save_start = entry->start;
2393 vm_offset_t save_end = entry->end;
2394
2395 if (entry->wired_count == 1)
f2d22ebf 2396 rv = vm_fault_wire(map, entry, FALSE);
984263bc
MD
2397 if (rv) {
2398 CLIP_CHECK_BACK(entry, save_start);
2399 for (;;) {
2400 KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
2401 entry->wired_count = 0;
2402 if (entry->end == save_end)
2403 break;
2404 entry = entry->next;
2405 KASSERT(entry != &map->header, ("bad entry clip during backout"));
2406 }
2407 end = save_start;
2408 break;
2409 }
2410 CLIP_CHECK_FWD(entry, save_end);
2411 entry = entry->next;
2412 }
984263bc
MD
2413
2414 /*
984263bc
MD
2415 * If a failure occurred, undo everything by falling through
2416 * to the unwiring code. 'end' has already been adjusted
2417 * appropriately.
2418 */
2419 if (rv)
e1359933 2420 kmflags |= KM_PAGEABLE;
984263bc
MD
2421
2422 /*
f2d22ebf
MD
2423 * start_entry is still IN_TRANSITION but may have been
2424 * clipped since vm_fault_wire() unlocks and relocks the
2425 * map. No matter how clipped it has gotten there should
2426 * be a fragment that is on our start boundary.
984263bc
MD
2427 */
2428 CLIP_CHECK_BACK(start_entry, start);
2429 }
2430
e1359933 2431 if (kmflags & KM_PAGEABLE) {
984263bc
MD
2432 /*
2433 * This is the unwiring case. We must first ensure that the
2434 * range to be unwired is really wired down. We know there
2435 * are no holes.
2436 */
2437 entry = start_entry;
2438 while ((entry != &map->header) && (entry->start < end)) {
2439 if (entry->wired_count == 0) {
2440 rv = KERN_INVALID_ARGUMENT;
2441 goto done;
2442 }
2443 entry = entry->next;
2444 }
2445
2446 /*
2447 * Now decrement the wiring count for each region. If a region
2448 * becomes completely unwired, unwire its physical pages and
2449 * mappings.
2450 */
2451 entry = start_entry;
2452 while ((entry != &map->header) && (entry->start < end)) {
2453 entry->wired_count--;
2454 if (entry->wired_count == 0)
f2d22ebf 2455 vm_fault_unwire(map, entry);
984263bc
MD
2456 entry = entry->next;
2457 }
2458 }
2459done:
46754a20
MD
2460 vm_map_unclip_range(map, start_entry, start, real_end,
2461 &count, MAP_CLIP_NO_HOLES);
984263bc
MD
2462 map->timestamp++;
2463 vm_map_unlock(map);
a108bf71 2464failure:
e1359933 2465 if (kmflags & KM_KRESERVE)
a108bf71 2466 vm_map_entry_krelease(count);
a108bf71
MD
2467 else
2468 vm_map_entry_release(count);
984263bc
MD
2469 return (rv);
2470}
2471
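/*
 * Example (illustrative sketch only, kept under #if 0): kernel-style
 * wiring with vm_map_wire().  KM_PAGEABLE selects the unwire path and
 * KM_KRESERVE makes the routine draw on the kernel's reserved map
 * entries (vm_map_entry_kreserve()), as needed when called via the
 * zalloc path noted above.
 */
#if 0
static int
example_kernel_wire(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int rv;

	/* Wire the range down using the kernel entry reserve. */
	rv = vm_map_wire(map, start, end, KM_KRESERVE);
	if (rv != KERN_SUCCESS)
		return (rv);

	/* Unwire it again later. */
	return (vm_map_wire(map, start, end, KM_PAGEABLE));
}
#endif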
2472/*
46754a20
MD
2473 * Mark a newly allocated address range as wired but do not fault in
2474 * the pages. The caller is expected to load the pages into the object.
a108bf71 2475 *
46754a20
MD
2476 * The map must be locked on entry and will remain locked on return.
2477 * No other requirements.
a108bf71
MD
2478 */
2479void
46754a20
MD
2480vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2481 int *countp)
a108bf71
MD
2482{
2483 vm_map_entry_t scan;
2484 vm_map_entry_t entry;
2485
46754a20
MD
2486 entry = vm_map_clip_range(map, addr, addr + size,
2487 countp, MAP_CLIP_NO_HOLES);
2488 for (scan = entry;
2489 scan != &map->header && scan->start < addr + size;
2490 scan = scan->next) {
a108bf71
MD
2491 KKASSERT(scan->wired_count == 0);
2492 scan->wired_count = 1;
2493 }
46754a20
MD
2494 vm_map_unclip_range(map, entry, addr, addr + size,
2495 countp, MAP_CLIP_NO_HOLES);
a108bf71
MD
2496}
2497
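/*
 * Example (illustrative sketch only, kept under #if 0):
 * vm_map_set_wired_quick() only marks the entries wired; the caller is
 * expected to load the pages itself.  The map must be exclusively
 * locked and a map-entry reservation held, as shown.
 */
#if 0
static void
example_wire_quick(vm_map_t map, vm_offset_t addr, vm_size_t size)
{
	int count;

	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	vm_map_set_wired_quick(map, addr, size, &count);
	/* ... caller now populates the object backing [addr, addr+size) ... */
	vm_map_unlock(map);
	vm_map_entry_release(count);
}
#endif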
2498/*
984263bc
MD
2499 * Push any dirty cached pages in the address range to their pager.
2500 * If syncio is TRUE, dirty pages are written synchronously.
2501 * If invalidate is TRUE, any cached pages are freed as well.
2502 *
2bc7505b
MD
2503 * This routine is called by sys_msync()
2504 *
984263bc 2505 * Returns an error if any part of the specified range is not mapped.
46754a20
MD
2506 *
2507 * No requirements.
984263bc
MD
2508 */
2509int
2bc7505b
MD
2510vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2511 boolean_t syncio, boolean_t invalidate)
984263bc
MD
2512{
2513 vm_map_entry_t current;
2514 vm_map_entry_t entry;
2515 vm_size_t size;
2516 vm_object_t object;
b12defdc 2517 vm_object_t tobj;
984263bc
MD
2518 vm_ooffset_t offset;
2519
2520 vm_map_lock_read(map);
2521 VM_MAP_RANGE_CHECK(map, start, end);
2522 if (!vm_map_lookup_entry(map, start, &entry)) {
2523 vm_map_unlock_read(map);
2524 return (KERN_INVALID_ADDRESS);
2525 }
b12defdc
MD
2526 lwkt_gettoken(&map->token);
2527
984263bc
MD
2528 /*
2529 * Make a first pass to check for holes.
2530 */
2531 for (current = entry; current->start < end; current = current->next) {
1b874851 2532 if (current->maptype == VM_MAPTYPE_SUBMAP) {
6730ca37 2533 lwkt_reltoken(&map->token);
984263bc
MD
2534 vm_map_unlock_read(map);
2535 return (KERN_INVALID_ARGUMENT);
2536 }
2537 if (end > current->end &&
2538 (current->next == &map->header ||
2539 current->end != current->next->start)) {
6730ca37 2540 lwkt_reltoken(&map->token);
984263bc
MD
2541 vm_map_unlock_read(map);
2542 return (KERN_INVALID_ADDRESS);
2543 }
2544 }
2545
2546 if (invalidate)
2547 pmap_remove(vm_map_pmap(map), start, end);
46754a20 2548
984263bc
MD
2549 /*
2550 * Make a second pass, cleaning/uncaching pages from the indicated
2551 * objects as we go.
2552 */
2553 for (current = entry; current->start < end; current = current->next) {
2554 offset = current->offset + (start - current->start);
2555 size = (end <= current->end ? end : current->end) - start;
1b874851 2556 if (current->maptype == VM_MAPTYPE_SUBMAP) {
984263bc
MD
2557 vm_map_t smap;
2558 vm_map_entry_t tentry;
2559 vm_size_t tsize;
2560
2561 smap = current->object.sub_map;
2562 vm_map_lock_read(smap);
418ff780 2563 vm_map_lookup_entry(smap, offset, &tentry);
984263bc
MD
2564 tsize = tentry->end - offset;
2565 if (tsize < size)
2566 size = tsize;
2567 object = tentry->object.vm_object;
2568 offset = tentry->offset + (offset - tentry->start);
2569 vm_map_unlock_read(smap);
2570 } else {
2571 object = current->object.vm_object;
2572 }
b12defdc
MD
2573
2574 if (object)
2575 vm_object_hold(object);
2576
984263bc
MD
2577 /*
2578 * Note that there is absolutely no sense in writing out
2579 * anonymous objects, so we track down the vnode object
2580 * to write out.
2581 * We invalidate (remove) all pages from the address space
2582 * anyway, for semantic correctness.
2583 *
2584 * note: certain anonymous maps, such as MAP_NOSYNC maps,
2585 * may start out with a NULL object.
2586 */
b12defdc
MD
2587 while (object && (tobj = object->backing_object) != NULL) {
2588 vm_object_hold(tobj);
2589 if (tobj == object->backing_object) {
2590 vm_object_lock_swap();
2591 offset += object->backing_object_offset;
2592 vm_object_drop(object);
2593 object = tobj;
2594 if (object->size < OFF_TO_IDX(offset + size))
2595 size = IDX_TO_OFF(object->size) -
2596 offset;
2597 break;
2598 }
2599 vm_object_drop(tobj);
984263bc
MD
2600 }
2601 if (object && (object->type == OBJT_VNODE) &&
2bc7505b
MD
2602 (current->protection & VM_PROT_WRITE) &&
2603 (object->flags & OBJ_NOMSYNC) == 0) {
984263bc
MD
2604 /*
2605 * Flush pages if writing is allowed, invalidate them
2606 * if invalidation requested. Pages undergoing I/O
2607 * will be ignored by vm_object_page_remove().
2608 *
2609 * We cannot lock the vnode and then wait for paging
2610 * to complete without deadlocking against vm_fault.
2611 * Instead we simply call vm_object_page_remove() and
2612 * allow it to block internally on a page-by-page
2613 * basis when it encounters pages undergoing async
2614 * I/O.
2615 */
2616 int flags;
2617
b12defdc 2618 /* no chain wait needed for vnode objects */
2de4f77e 2619 vm_object_reference_locked(object);
ca466bae 2620 vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
984263bc
MD
2621 flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
2622 flags |= invalidate ? OBJPC_INVAL : 0;
1b874851
MD
2623
2624 /*
2625 * When operating on a virtual page table just
2626 * flush the whole object. XXX we probably ought
2627 * to
2628 */
2629 switch(current->maptype) {
2630 case VM_MAPTYPE_NORMAL:
2631 vm_object_page_clean(object,
2632 OFF_TO_IDX(offset),
2633 OFF_TO_IDX(offset + size + PAGE_MASK),
2634 flags);
2635 break;
2636 case VM_MAPTYPE_VPAGETABLE:
2637 vm_object_page_clean(object, 0, 0, flags);
2638 break;
2639 }
a11aaa81 2640 vn_unlock(((struct vnode *)object->handle));
2de4f77e 2641 vm_object_deallocate_locked(object);
984263bc
MD
2642 }
2643 if (object && invalidate &&
2644 ((object->type == OBJT_VNODE) ||
2645 (object->type == OBJT_DEVICE))) {
2f1821ca
MD
2646 int clean_only =
2647 (object->type == OBJT_DEVICE) ? FALSE : TRUE;
b12defdc 2648 /* no chain wait needed for vnode/device objects */
2de4f77e 2649 vm_object_reference_locked(object);
1b874851
MD
2650 switch(current->maptype) {
2651 case VM_MAPTYPE_NORMAL:
2652 vm_object_page_remove(object,
2653 OFF_TO_IDX(offset),
2654 OFF_TO_IDX(offset + size + PAGE_MASK),
2655 clean_only);
2656 break;
2657 case VM_MAPTYPE_VPAGETABLE:
2658 vm_object_page_remove(object, 0, 0, clean_only);
2659 break;
2660 }
2de4f77e 2661 vm_object_deallocate_locked(object);
984263bc
MD
2662 }
2663 start += size;
b12defdc
MD
2664 if (object)
2665 vm_object_drop(object);
984263bc 2666 }
2de4f77e 2667
b12defdc 2668 lwkt_reltoken(&map->token);
2de4f77e 2669 vm_map_unlock_read(map);
46754a20 2670
984263bc
MD
2671 return (KERN_SUCCESS);
2672}
2673
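/*
 * Example (illustrative sketch only, kept under #if 0): the msync(2)
 * style modes of vm_map_clean().  syncio selects synchronous writes and
 * invalidate additionally frees any cached pages after flushing.
 */
#if 0
static int
example_msync(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int rv;

	/* Flush dirty pages synchronously, keep them cached. */
	rv = vm_map_clean(map, start, end, TRUE, FALSE);
	if (rv != KERN_SUCCESS)
		return (rv);

	/* Flush and also invalidate (free) the cached pages. */
	return (vm_map_clean(map, start, end, TRUE, TRUE));
}
#endif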
2674/*
46754a20 2675 * Make the region specified by this entry pageable.
984263bc 2676 *
46754a20 2677 * The vm_map must be exclusively locked.
984263bc
MD
2678 */
2679static void
a108bf71 2680vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
984263bc 2681{
f2d22ebf 2682 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
984263bc 2683 entry->wired_count = 0;
f2d22ebf 2684 vm_fault_unwire(map, entry);
984263bc
MD
2685}
2686
2687/*
46754a20 2688 * Deallocate the given entry from the target map.
984263bc 2689 *
46754a20 2690 * The vm_map must be exclusively locked.
984263bc
MD
2691 */
2692static void
a108bf71 2693vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
984263bc
MD
2694{
2695 vm_map_entry_unlink(map, entry);
2696 map->size -= entry->end - entry->start;
2697
1b874851
MD
2698 switch(entry->maptype) {
2699 case VM_MAPTYPE_NORMAL:
2700 case VM_MAPTYPE_VPAGETABLE:
984263bc 2701 vm_object_deallocate(entry->object.vm_object);
1b874851
MD
2702 break;
2703 default:
2704 break;
984263bc
MD
2705 }
2706
a108bf71 2707 vm_map_entry_dispose(map, entry, countp);
984263bc
MD
2708}
2709
2710/*
46754a20 2711 * Deallocates the given address range from the target map.
984263bc 2712 *
46754a20 2713 * The vm_map must be exclusively locked.
984263bc
MD
2714 */
2715int
a108bf71 2716vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
984263bc
MD
2717{
2718 vm_object_t object;
2719 vm_map_entry_t entry;
2720 vm_map_entry_t first_entry;
2721
46754a20 2722 ASSERT_VM_MAP_LOCKED(map);
b12defdc 2723 lwkt_gettoken(&map->token);
686dbf64 2724again:
984263bc 2725 /*
686dbf64
MD
2726 * Find the start of the region, and clip it. Set entry to point
2727 * at the first record containing the requested address or, if no
2728 * such record exists, the next record with a greater address. The
2729 * loop will run from this point until a record beyond the termination
2730 * address is encountered.
2731 *
2732 * map->hint must be adjusted to not point to anything we delete,
2733 * so set it to the entry prior to the one being deleted.
2734 *
2735 * GGG see other GGG comment.
984263bc 2736 */
686dbf64 2737 if (vm_map_lookup_entry(map, start, &first_entry)) {
984263bc 2738 entry = first_entry;
a108bf71 2739 vm_map_clip_start(map, entry, start, countp);
686dbf64
MD
2740 map->hint = entry->prev; /* possible problem XXX */
2741 } else {
2742 map->hint = first_entry; /* possible problem XXX */
2743 entry = first_entry->next;
984263bc
MD
2744 }
2745
2746 /*
686dbf64
MD
2747 * If a hole opens up prior to the current first_free then
2748 * adjust first_free. As with map->hint, map->first_free
2749 * cannot be left set to anything we might delete.
984263bc 2750 */
984263bc
MD
2751 if (entry == &map->header) {
2752 map->first_free = &map->header;
2753 } else if (map->first_free->start >= start) {
2754 map->first_free = entry->prev;
2755 }
2756
2757 /*
2758 * Step through all entries in this region
2759 */
984263bc
MD
2760 while ((entry != &map->header) && (entry->start < end)) {
2761 vm_map_entry_t next;
2762 vm_offset_t s, e;
2763 vm_pindex_t offidxstart, offidxend, count;
2764
2765 /*
2766 * If we hit an in-transition entry we have to sleep and
2767 * retry. It's easier (and not really slower) to just retry
2768 * since this case occurs so rarely and the hint is already
2769 * pointing at the right place. We have to reset the
2770 * start offset so as not to accidentally delete an entry
2771 * another process just created in vacated space.
2772 */
2773 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2774 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2775 start = entry->start;
12e4aaff
MD
2776 ++mycpu->gd_cnt.v_intrans_coll;
2777 ++mycpu->gd_cnt.v_intrans_wait;
984263bc
MD
2778 vm_map_transition_wait(map);
2779 goto again;
2780 }
a108bf71 2781 vm_map_clip_end(map, entry, end, countp);
984263bc
MD
2782
2783 s = entry->start;
2784 e = entry->end;
2785 next = entry->next;
2786
2787 offidxstart = OFF_TO_IDX(entry->offset);
2788 count = OFF_TO_IDX(e - s);
2789 object = entry->object.vm_object;
2790
2791 /*
2792 * Unwire before removing addresses from the pmap; otherwise,
2793 * unwiring will put the entries back in the pmap.
2794 */
f2d22ebf 2795 if (entry->wired_count != 0)
984263bc 2796 vm_map_entry_unwire(map, entry);
984263bc
MD
2797
2798 offidxend = offidxstart + count;
2799
c439ad8f 2800 if (object == &kernel_object) {
b12defdc 2801 vm_object_hold(object);
46754a20
MD
2802 vm_object_page_remove(object, offidxstart,
2803 offidxend, FALSE);
b12defdc
MD
2804 vm_object_drop(object);
2805 } else if (object && object->type != OBJT_DEFAULT &&
2806 object->type != OBJT_SWAP) {
2807 /*
2808 * vnode object routines cannot be chain-locked
2809 */
2810 vm_object_hold(object);
2811 pmap_remove(map->pmap, s, e);
2812 vm_object_drop(object);
2813 } else if (object) {
2814 vm_object_hold(object);
2815 vm_object_chain_acquire(object);
984263bc 2816 pmap_remove(map->pmap, s, e);
2de4f77e 2817
984263bc
MD
2818 if (object != NULL &&
2819 object->ref_count != 1 &&
46754a20
MD
2820 (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
2821 OBJ_ONEMAPPING &&
2822 (object->type == OBJT_DEFAULT ||
2823 object->type == OBJT_SWAP)) {
e806bedd 2824 vm_object_collapse(object, NULL);
46754a20
MD
2825 vm_object_page_remove(object, offidxstart,
2826 offidxend, FALSE);
984263bc 2827 if (object->type == OBJT_SWAP) {
46754a20
MD
2828 swap_pager_freespace(object,
2829 offidxstart,
2830 count);
984263bc
MD
2831 }
2832 if (offidxend >= object->size &&
2833 offidxstart < object->size) {
2834 object->size = offidxstart;
2835 }
2836 }
b12defdc
MD
2837 vm_object_chain_release(object);
2838 vm_object_drop(object);
984263bc 2839 }
b4460ab3 2840
984263bc
MD
2841 /*
2842 * Delete the entry (which may delete the object) only after
2843 * removing all pmap entries pointing to its pages.
2844 * (Otherwise, its page frames may be reallocated, and any
2845 * modify bits will be set in the wrong object!)
2846 */
a108bf71 2847 vm_map_entry_delete(map, entry, countp);
984263bc
MD
2848 entry = next;
2849 }
b12defdc 2850 lwkt_reltoken(&map->token);
984263bc
MD
2851 return (KERN_SUCCESS);
2852}
2853
2854/*
46754a20
MD
2855 * Remove the given address range from the target map.
2856 * This is the exported form of vm_map_delete.
984263bc 2857 *
46754a20 2858 * No requirements.
984263bc
MD
2859 */
2860int
a108bf71 2861vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
984263bc 2862{
03aa8d99 2863 int result;
a108bf71 2864 int count;
984263bc 2865
a108bf71 2866 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
984263bc
MD
2867 vm_map_lock(map);
2868 VM_MAP_RANGE_CHECK(map, start, end);
a108bf71 2869 result = vm_map_delete(map, start, end, &count);
984263bc 2870 vm_map_unlock(map);
a108bf71 2871 vm_map_entry_release(count);
984263bc 2872
984263bc
MD
2873 return (result);
2874}
2875
2876/*
46754a20
MD
2877 * Assert that the target map allows the specified privilege on the
2878 * entire address region given. The entire region must be allocated.
984263bc 2879 *
46754a20 2880 * The caller must specify whether the vm_map is already locked or not.
984263bc
MD
2881 */
2882boolean_t
2883vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
46754a20 2884 vm_prot_t protection, boolean_t have_lock)
984263bc
MD
2885{
2886 vm_map_entry_t entry;
2887 vm_map_entry_t tmp_entry;
46754a20
MD
2888 boolean_t result;
2889
2890 if (have_lock == FALSE)
2891 vm_map_lock_read(map);
984263bc
MD
2892
2893 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
46754a20
MD
2894 if (have_lock == FALSE)
2895 vm_map_unlock_read(map);
984263bc
MD
2896 return (FALSE);
2897 }
2898 entry = tmp_entry;
2899
46754a20 2900 result = TRUE;
984263bc
MD
2901 while (start < end) {
2902 if (entry == &map->header) {
46754a20
MD
2903 result = FALSE;
2904 break;
984263bc
MD
2905 }
2906 /*
2907 * No holes allowed!
2908 */
2909
2910 if (start < entry->start) {
46754a20
MD
2911 result = FALSE;
2912 break;
984263bc
MD
2913 }
2914 /*
2915 * Check protection associated with entry.
2916 */
2917
2918 if ((entry->protection & protection) != protection) {
46754a20
MD
2919 result = FALSE;
2920 break;
984263bc
MD
2921 }
2922 /* go to next entry */
2923
2924 start = entry->end;
2925 entry = entry->next;
2926 }
46754a20
MD
2927 if (have_lock == FALSE)
2928 vm_map_unlock_read(map);
2929 return (result);
984263bc
MD
2930}
2931
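/*
 * Example (illustrative sketch only, kept under #if 0): asserting that
 * an entire range is readable before operating on it.  Pass have_lock
 * == TRUE only when the caller already holds the map lock.
 */
#if 0
static void
example_require_readable(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	if (!vm_map_check_protection(map, start, end, VM_PROT_READ, FALSE)) {
		kprintf("range %p-%p has a hole or lacks read permission\n",
			(void *)start, (void *)end);
	}
}
#endif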
2932/*
b12defdc
MD
2933 * If appropriate this function shadows the original object with a new object
2934 * and moves the VM pages from the original object to the new object.
2935 * The original object will also be collapsed, if possible.
46754a20 2936 *
b12defdc
MD
2937 * We can only do this for normal memory objects with a single mapping, and
2938 * it only makes sense to do it if there are 2 or more refs on the original
2939 * object. i.e. typically a memory object that has been extended into
2940 * multiple vm_map_entry's with non-overlapping ranges.
2941 *
2942 * This makes it easier to remove unused pages and keeps object inheritance
2943 * from being a negative impact on memory usage.
2944 *
2945 * On return the (possibly new) entry->object.vm_object will have an
2946 * additional ref on it for the caller to dispose of (usually by cloning
2947 * the vm_map_entry). The additional ref had to be done in this routine
2948 * to avoid racing a collapse. The object's ONEMAPPING flag will also be
2949 * cleared.
2950 *
2951 * The vm_map must be locked and its token held.
984263bc
MD
2952 */
2953static void
a108bf71 2954vm_map_split(vm_map_entry_t entry)
984263bc 2955{
b12defdc
MD
2956#if 0
2957 /* UNOPTIMIZED */
2958 vm_object_t oobject;
2959
2960 oobject = entry->object.vm_object;
2961 vm_object_hold(oobject);
2962 vm_object_chain_wait(oobject);
2963 vm_object_reference_locked(oobject);
2964 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
2965 vm_object_drop(oobject);
2966#else
2967 /* OPTIMIZED */
2968 vm_object_t oobject, nobject, bobject;
984263bc 2969 vm_offset_t s, e;
b12defdc 2970 vm_page_t m;
984263bc
MD
2971 vm_pindex_t offidxstart, offidxend, idx;
2972 vm_size_t size;
2973 vm_ooffset_t offset;
2974
b12defdc
MD
2975 /*
2976 * Setup. Chain lock the original object throughout the entire
2977 * routine to prevent new page faults from occurring.
2978 *
2979 * XXX can madvise WILLNEED interfere with us too?
2980 */
2981 oobject = entry->object.vm_object;
2982 vm_object_hold(oobject);
2983 vm_object_chain_acquire(oobject);
2984
2985 /*
2986 * Original object cannot be split?
2987 */
2988 if (oobject->handle == NULL || (oobject->type != OBJT_DEFAULT &&
2989 oobject->type != OBJT_SWAP)) {
2990 vm_object_chain_release(oobject);
2991 vm_object_reference_locked(oobject);
2992 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
2993 vm_object_drop(oobject);
984263bc 2994 return;
b12defdc
MD
2995 }
2996
2997 /*
2998 * Collapse original object with its backing store as an
2999 * optimization to reduce chain lengths when possible.
3000 *
3001 * If ref_count <= 1 there aren't other non-overlapping vm_map_entry's
3002 * for oobject, so there's no point collapsing it.
3003 *
3004 * Then re-check whether the object can be split.
3005 */
e806bedd 3006 vm_object_collapse(oobject, NULL);
b12defdc
MD
3007
3008 if (oobject->ref_count <= 1 ||
3009 (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) ||
3010 (oobject->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) != OBJ_ONEMAPPING) {
3011 vm_object_chain_release(oobject);
3012 vm_object_reference_locked(oobject);
3013 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3014 vm_object_drop(oobject);
984263bc 3015 return;
b12defdc
MD
3016 }
3017
3018 /*
3019 * Acquire the chain lock on the backing object.
3020 *
3021 * Give bobject an additional ref count for when it will be shadowed
3022 * by nobject.
3023 */
3024 if ((bobject = oobject->backing_object) != NULL) {
3025 vm_object_hold(bobject);
3026 vm_object_chain_wait(bobject);
3027 vm_object_reference_locked(bobject);
3028 vm_object_chain_acquire(bobject);
3029 KKASSERT(bobject->backing_object == bobject);
3030 KKASSERT((bobject->flags & OBJ_DEAD) == 0);
3031 }
984263bc 3032
b12defdc
MD
3033 /*
3034 * Calculate the object page range and allocate the new object.
3035 */
984263bc
MD
3036 offset = entry->offset;
3037 s = entry->start;
3038 e = entry->end;
3039
3040 offidxstart = OFF_TO_IDX(offset);
3041 offidxend = offidxstart + OFF_TO_IDX(e - s);
3042 size = offidxend - offidxstart;
3043
b12defdc 3044 switch(oobject->type) {
5a648714 3045 case OBJT_DEFAULT:
b12defdc
MD
3046 nobject = default_pager_alloc(NULL, IDX_TO_OFF(size),
3047 VM_PROT_ALL, 0);
5a648714
MD
3048 break;
3049 case OBJT_SWAP:
b12defdc
MD
3050 nobject = swap_pager_alloc(NULL, IDX_TO_OFF(size),
3051 VM_PROT_ALL, 0);
5a648714
MD
3052 break;
3053 default:
3054 /* not reached */
b12defdc 3055 nobject = NULL;
5a648714
MD
3056 KKASSERT(0);
3057 }
b12defdc
MD
3058
3059 if (nobject == NULL) {
3060 if (bobject) {
3061 vm_object_chain_release(bobject);
3062 vm_object_deallocate(bobject);
3063 vm_object_drop(bobject);
3064 }
3065 vm_object_chain_release(oobject);
3066 vm_object_reference_locked(oobject);
3067 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3068 vm_object_drop(oobject);
984263bc 3069 return;
b12defdc 3070 }
984263bc 3071
46754a20 3072 /*
b12defdc
MD
3073 * The new object will replace entry->object.vm_object so it needs
3074 * a second reference (the caller expects an additional ref).
46754a20 3075 */
b12defdc
MD
3076 vm_object_hold(nobject);
3077 vm_object_reference_locked(nobject);
3078 vm_object_chain_acquire(nobject);
46754a20 3079
b12defdc
MD
3080 /*
3081 * nobject shadows bobject (oobject already shadows bobject).
3082 */
3083 if (bobject) {
3084 nobject->backing_object_offset =
3085 oobject->backing_object_offset + IDX_TO_OFF(offidxstart);
3086 nobject->backing_object = bobject;
3087 bobject->shadow_count++;
3088 bobject->generation++;
3089 LIST_INSERT_HEAD(&bobject->shadow_head, nobject, shadow_list);
3090 vm_object_clear_flag(bobject, OBJ_ONEMAPPING); /* XXX? */
3091 vm_object_chain_release(bobject);
3092 vm_object_drop(bobject);
984263bc
MD
3093 }
3094
b12defdc
MD
3095 /*
3096 * Move the VM pages from oobject to nobject
3097 */
984263bc
MD
3098 for (idx = 0; idx < size; idx++) {
3099 vm_page_t m;
3100
b12defdc
MD
3101 m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
3102 TRUE, "vmpg");
2de4f77e 3103 if (m == NULL)
984263bc
MD
3104 continue;
3105
3106 /*
3107 * We must wait for pending I/O to complete before we can
3108 * rename the page.
3109 *
3110 * We do not have to VM_PROT_NONE the page as mappings should
3111 * not be changed by this operation.
b12defdc
MD
3112 *
3113 * NOTE: The act of renaming a page updates chaingen for both
3114 * objects.
984263bc 3115 */
b12defdc 3116 vm_page_rename(m, nobject, idx);
984263bc 3117 /* page automatically made dirty by rename and cache handled */
b12defdc 3118 /* page remains busy */
984263bc
MD
3119 }
3120
b12defdc
MD
3121 if (oobject->type == OBJT_SWAP) {
3122 vm_object_pip_add(oobject, 1);
984263bc 3123 /*
b12defdc
MD
3124 * copy oobject pages into nobject and destroy unneeded
3125 * pages in shadow object.
984263bc 3126 */
b12defdc
MD
3127 swap_pager_copy(oobject, nobject, offidxstart, 0);
3128 vm_object_pip_wakeup(oobject);
984263bc
MD
3129 }
3130
06ecca5a
MD
3131 /*
3132 * Wakeup the pages we played with. No spl protection is needed
3133 * for a simple wakeup.
3134 */
984263bc 3135 for (idx = 0; idx < size; idx++) {
b12defdc
MD
3136 m = vm_page_lookup(nobject, idx);
3137 if (m) {
3138 KKASSERT(m->flags & PG_BUSY);
984263bc 3139 vm_page_wakeup(m);
b12defdc 3140 }
984263bc 3141 }
b12defdc 3142 entry->object.vm_object = nobject;
984263bc 3143 entry->offset = 0LL;
b12defdc
MD
3144
3145 /*
3146 * Cleanup
3147 *
3148 * NOTE: There is no need to remove OBJ_ONEMAPPING from oobject, the
3149 * related pages were moved and are no longer applicable to the
3150 * original object.
3151 *
3152 * NOTE: Deallocate oobject (due to its entry->object.vm_object being
3153 * replaced by nobject).
3154 */
3155 vm_object_chain_release(nobject);
3156 vm_object_drop(nobject);
3157 if (bobject) {
3158 vm_object_chain_release(bobject);
3159 vm_object_drop(bobject);
3160 }
3161 vm_object_chain_release(oobject);
3162 /*vm_object_clear_flag(oobject, OBJ_ONEMAPPING);*/
3163 vm_object_deallocate_locked(oobject);
3164 vm_object_drop(oobject);
3165#endif
984263bc
MD
3166}
3167
3168/*
46754a20
MD
3169 * Copies the contents of the source entry to the destination
3170 * entry. The entries *must* be aligned properly.
984263bc 3171 *
d2d8515b 3172 * The vm_maps must be exclusively locked.
b12defdc 3173 * The vm_map's token must be held.
d2d8515b
MD
3174 *
3175 * Because the maps are locked no faults can be in progress during the
3176 * operation.
984263bc
MD
3177 */
3178static void
a108bf71 3179vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
b12defdc 3180 vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
984263bc
MD
3181{
3182 vm_object_t src_object;
3183
1b874851
MD
3184 if (dst_entry->maptype == VM_MAPTYPE_SUBMAP)
3185 return;
3186 if (src_entry->maptype == VM_MAPTYPE_SUBMAP)
984263bc
MD
3187 return;
3188
3189 if (src_entry->wired_count == 0) {
984263bc
MD
3190 /*
3191 * If the source entry is marked needs_copy, it is already
3192 * write-protected.
3193 */
3194 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3195 pmap_protect(src_map->pmap,
3196 src_entry->start,
3197 src_entry->end,
3198 src_entry->protection & ~VM_PROT_WRITE);
3199 }
3200
3201 /*
3202 * Make a copy of the object.
212f39f5 3203 *
b9469aa4
MD
3204 * The object must be locked prior to checking the object type
3205 * and for the call to vm_object_collapse() and vm_map_split().
3206 * We cannot use *_hold() here because the split code will
3207 * probably try to destroy the object. The lock is a pool
3208 * token and doesn't care.
a2ee730d
MD
3209 *
3210 * We must bump src_map->timestamp when setting
3211 * MAP_ENTRY_NEEDS_COPY to force any concurrent fault
3212 * to retry, otherwise the concurrent fault might improperly
3213 * install a RW pte when it is supposed to be a RO(COW) pte.
3214 * This race can occur because a vnode-backed fault may have
3215 * to temporarily release the map lock.
984263bc 3216 */
b12defdc
MD
3217 if (src_entry->object.vm_object != NULL) {
3218 vm_map_split(src_entry);
3219 src_object = src_entry->object.vm_object;
984263bc 3220 dst_entry->object.vm_object = src_object;
b12defdc
MD
3221 src_entry->eflags |= (MAP_ENTRY_COW |
3222 MAP_ENTRY_NEEDS_COPY);
3223 dst_entry->eflags |= (MAP_ENTRY_COW |
3224 MAP_ENTRY_NEEDS_COPY);
984263bc 3225 dst_entry->offset = src_entry->offset;
a2ee730d 3226 ++src_map->timestamp;
984263bc
MD
3227 } else {
3228 dst_entry->object.vm_object = NULL;
3229 dst_entry->offset = 0;
3230 }
3231
3232 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
3233 dst_entry->end - dst_entry->start, src_entry->start);
3234 } else {
3235 /*
3236 * Of course, wired down pages can't be set copy-on-write.
3237 * Cause wired pages to be copied into the new map by
3238 * simulating faults (the new pages are pageable)
3239 */
3240 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3241 }
3242}
3243
3244/*
3245 * vmspace_fork:
3246 * Create a new process vmspace structure and vm_map
3247 * based on those of an existing process. The new map
3248 * is based on the old map, according to the inheritance
3249 * values on the regions in that map.
3250 *
3251 * The source map must not be locked.
46754a20 3252 * No requirements.
984263bc
MD
3253 */
3254struct vmspace *
a108bf71 3255vmspace_fork(struct vmspace *vm1)
984263bc
MD
3256{
3257 struct vmspace *vm2;
3258 vm_map_t old_map = &vm1->vm_map;
3259 vm_map_t new_map;
3260 vm_map_entry_t old_entry;
3261 vm_map_entry_t new_entry;
3262 vm_object_t object;
a108bf71 3263 int count;
984263bc 3264
b12defdc 3265 lwkt_gettoken(&vm1->vm_map.token);
984263bc 3266 vm_map_lock(old_map);
984263bc 3267
239b4df9
MD
3268 /*
3269 * XXX Note: upcalls are not copied.
3270 */
984263bc 3271 vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
b12defdc 3272 lwkt_gettoken(&vm2->vm_map.token);
984263bc 3273 bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
239b4df9 3274 (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
984263bc
MD
3275 new_map = &vm2->vm_map; /* XXX */
3276 new_map->timestamp = 1;
3277
46754a20
MD
3278 vm_map_lock(new_map);
3279
a108bf71 3280 count = 0;
984263bc 3281 old_entry = old_map->header.next;
a108bf71
MD
3282 while (old_entry != &old_map->header) {
3283 ++count;
3284 old_entry = old_entry->next;
3285 }
984263bc 3286
a108bf71
MD
3287 count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3288
3289 old_entry = old_map->header.next;
984263bc 3290 while (old_entry != &old_map->header) {
1b874851 3291 if (old_entry->maptype == VM_MAPTYPE_SUBMAP)
984263bc
MD
3292 panic("vm_map_fork: encountered a submap");
3293
3294 switch (old_entry->inheritance) {
3295 case VM_INHERIT_NONE:
3296 break;
984263bc
MD
3297 case VM_INHERIT_SHARE:
3298 /*
1b874851
MD
3299 * Clone the entry, creating the shared object if
3300 * necessary.
984263bc 3301 */
b12defdc 3302 if (old_entry->object.vm_object == NULL)
53025830 3303 vm_map_entry_allocate_object(old_entry);
984263bc 3304
984263bc 3305 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
6056eb53
MD
3306 /*
3307 * Shadow a map_entry which needs a copy,
3308 * replacing its object with a new object
3309 * that points to the old one. Ask the
3310 * shadow code to automatically add an
3311 * additional ref. We can't do it afterwards
3312 * because we might race a collapse. The call
3313 * to vm_map_entry_shadow() will also clear
3314 * OBJ_ONEMAPPING.
3315 */
b12defdc
MD
3316 vm_map_entry_shadow(old_entry, 1);
3317 } else {
6056eb53
MD
3318 /*
3319 * We will make a shared copy of the object,
3320 * and must clear OBJ_ONEMAPPING.
3321 *
3322 * XXX assert that object.vm_object != NULL
3323 * since we allocate it above.
3324 */
b12defdc
MD
3325 if (old_entry->object.vm_object) {
3326 object = old_entry->object.vm_object;
3327 vm_object_hold(object);
3328 vm_object_chain_wait(object);
3329 vm_object_reference_locked(object);
6056eb53
MD
3330 vm_object_clear_flag(object,
3331 OBJ_ONEMAPPING);
b12defdc
MD
3332 vm_object_drop(object);
3333 }
984263bc 3334 }
984263bc
MD
3335
3336 /*
b12defdc
MD
3337 * Clone the entry. We've already bumped the ref on
3338 * any vm_object.
984263bc 3339 */
a108bf71 3340 new_entry = vm_map_entry_create(new_map, &count);
984263bc
MD
3341 *new_entry = *old_entry;
3342 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3343 new_entry->wired_count = 0;
3344
3345 /*
3346 * Insert the entry into the new map -- we know we're
3347 * inserting at the end of the new map.
3348 */
3349
3350 vm_map_entry_link(new_map, new_map->header.prev,
46754a20 3351 new_entry);
984263bc
MD
3352
3353 /*
3354 * Update the physical map
3355 */
984263bc
MD
3356 pmap_copy(new_map->pmap, old_map->pmap,
3357 new_entry->start,
3358 (old_entry->end - old_entry->start),
3359 old_entry->start);
3360 break;
984263bc
MD
3361 case VM_INHERIT_COPY:
3362 /*
3363 * Clone the entry and link into the map.
3364 */
a108bf71 3365 new_entry = vm_map_entry_create(new_map, &count);
984263bc
MD
3366 *new_entry = *old_entry;
3367 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3368 new_entry->wired_count = 0;
3369 new_entry->object.vm_object = NULL;
3370 vm_map_entry_link(new_map, new_map->header.prev,
46754a20 3371 new_entry);
984263bc 3372 vm_map_copy_entry(old_map, new_map, old_entry,
46754a20 3373 new_entry);
984263bc
MD
3374 break;
3375 }
3376 old_entry = old_entry->next;
3377 }
3378
3379 new_map->size = old_map->size;
984263bc 3380 vm_map_unlock(old_map);
46754a20 3381 vm_map_unlock(new_map);
a108bf71 3382 vm_map_entry_release(count);
2de4f77e 3383
b12defdc
MD
3384 lwkt_reltoken(&vm2->vm_map.token);
3385 lwkt_reltoken(&vm1->vm_map.token);
984263bc
MD
3386
3387 return (vm2);
3388}
3389
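/*
 * Example (illustrative sketch only, kept under #if 0): fork-time use
 * of vmspace_fork().  Entries marked VM_INHERIT_SHARE share their
 * object, VM_INHERIT_COPY entries become copy-on-write, and
 * VM_INHERIT_NONE entries are not copied at all.  The surrounding
 * process bookkeeping is omitted.
 */
#if 0
static struct vmspace *
example_fork_vmspace(struct proc *p1)
{
	/* The old map must be unlocked when calling vmspace_fork(). */
	return (vmspace_fork(p1->p_vmspace));
}
#endif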
46754a20
MD
3390/*
3391 * Create an auto-grow stack entry
3392 *
3393 * No requirements.
3394 */
984263bc
MD
3395int
3396vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
c809941b 3397 int flags, vm_prot_t prot, vm_prot_t max, int cow)
984263bc 3398{
85d25bcf
MD
3399 vm_map_entry_t prev_entry;
3400 vm_map_entry_t new_stack_entry;
3401 vm_size_t init_ssize;
3402 int rv;
a108bf71 3403 int count;
85d25bcf 3404 vm_offset_t tmpaddr;
984263bc 3405
c809941b 3406 cow |= MAP_IS_STACK;
984263bc
MD
3407
3408 if (max_ssize < sgrowsiz)
3409 init_ssize = max_ssize;
3410 else
3411 init_ssize = sgrowsiz;
3412
a108bf71 3413 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
984263bc
MD
3414 vm_map_lock(map);
3415
85d25bcf
MD
3416 /*
3417 * Find space for the mapping
3418 */
cadb984b 3419 if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
c809941b
MD
3420 if (vm_map_findspace(map, addrbos, max_ssize, 1,
3421 flags, &tmpaddr)) {
85d25bcf
MD
3422 vm_map_unlock(map);
3423 vm_map_entry_release(count);
3424 return (KERN_NO_SPACE);
3425 }
3426 addrbos = tmpaddr;
3427 }
3428
984263bc
MD
3429 /* If addr is already mapped, no go */
3430 if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
3431 vm_map_unlock(map);
a108bf71 3432 vm_map_entry_release(count);
984263bc
MD
3433 return (KERN_NO_SPACE);
3434 }
3435
85d25bcf
MD
3436#if 0
3437 /* XXX already handled by kern_mmap() */
984263bc
MD
3438 /* If we would blow our VMEM resource limit, no go */
3439 if (map->size + init_ssize >
3440 curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3441 vm_map_unlock(map);
a108bf71 3442 vm_map_entry_release(count);
984263bc
MD
3443 return (KERN_NO_SPACE);
3444 }
85d25bcf 3445#endif
984263bc 3446
85d25bcf
MD
3447 /*
3448 * If we can't accomodate max_ssize in the current mapping,
984263bc
MD
3449 * no go. However, we need to be aware that subsequent user
3450 * mappings might map into the space we have reserved for
3451 * stack, and currently this space is not protected.
3452 *
3453 * Hopefully we will at least detect this condition
3454 * when we try to grow the stack.
3455 */
3456 if ((prev_entry->next != &map->header) &&
3457 (prev_entry->next->start < addrbos + max_ssize)) {
3458 vm_map_unlock(map);
a108bf71 3459 vm_map_entry_release(count);
984263bc
MD
3460 return (KERN_NO_SPACE);
3461 }
3462
85d25bcf
MD
3463 /*
3464 * We initially map a stack of only init_ssize. We will
984263bc
MD
3465 * grow as needed later. Since this is to be a grow
3466 * down stack, we map at the top of the range.
3467 *
3468 * Note: we would normally expect prot and max to be
3469 * VM_PROT_ALL, and cow to be 0. Possibly we should
3470 * eliminate these as input parameters, and just
3471 * pass these values here in the insert call.
3472 */
a108bf71
MD
3473 rv = vm_map_insert(map, &count,
3474 NULL, 0, addrbos + max_ssize - init_ssize,
1b874851
MD
3475 addrbos + max_ssize,
3476 VM_MAPTYPE_NORMAL,
3477 prot, max,
3478 cow);
984263bc
MD
3479
3480 /* Now set the avail_ssize amount */
517e1666 3481 if (rv == KERN_SUCCESS) {
984263bc 3482 if (prev_entry != &map->header)
a108bf71 3483 vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
984263bc
MD
3484 new_stack_entry = prev_entry->next;
3485 if (new_stack_entry->end != addrbos + max_ssize ||
3486 new_stack_entry->start != addrbos + max_ssize - init_ssize)
3487 panic ("Bad entry start/end for new stack entry");
3488 else
afeabdca 3489 new_stack_entry->aux.avail_ssize = max_ssize - init_ssize;
984263bc
MD
3490 }
3491
3492 vm_map_unlock(map);
a108bf71 3493 vm_map_entry_release(count);
984263bc
MD
3494 return (rv);
3495}
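
#if 0
/*
 * Illustrative sketch only (not part of this file): a MAP_STACK style
 * caller could reserve a grow-down stack region with vm_map_stack()
 * roughly as below.  The wrapper name and its map/addr/length parameters
 * are hypothetical; prot/max of VM_PROT_ALL and cow of 0 follow the
 * expectation noted in the comment inside vm_map_stack().
 */
static int
example_reserve_stack(vm_map_t map, vm_offset_t addr, vm_size_t length)
{
	/*
	 * Only the top init_ssize bytes are mapped up front; the rest
	 * of the range is recorded in aux.avail_ssize and grown on
	 * demand by vm_map_growstack().
	 */
	return (vm_map_stack(map, addr, length, 0,
			     VM_PROT_ALL, VM_PROT_ALL, 0));
}
#endif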

/*
 * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
 * desired address is already mapped, or if we successfully grow
 * the stack.  Also returns KERN_SUCCESS if addr is outside the
 * stack range (this is strange, but preserves compatibility with
 * the grow function in vm_machdep.c).
 *
 * No requirements.
 */
int
vm_map_growstack (struct proc *p, vm_offset_t addr)
{
	vm_map_entry_t prev_entry;
	vm_map_entry_t stack_entry;
	vm_map_entry_t new_stack_entry;
	struct vmspace *vm = p->p_vmspace;
	vm_map_t map = &vm->vm_map;
	vm_offset_t end;
	int grow_amount;
	int rv = KERN_SUCCESS;
	int is_procstack;
	int use_read_lock = 1;
	int count;

	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
Retry:
	if (use_read_lock)
		vm_map_lock_read(map);
	else
		vm_map_lock(map);

	/* If addr is already in the entry range, no need to grow. */
	if (vm_map_lookup_entry(map, addr, &prev_entry))
		goto done;

	if ((stack_entry = prev_entry->next) == &map->header)
		goto done;
	if (prev_entry == &map->header)
		end = stack_entry->start - stack_entry->aux.avail_ssize;
	else
		end = prev_entry->end;

	/*
	 * This next test mimics the old grow function in vm_machdep.c.
	 * It really doesn't quite make sense, but we do it anyway
	 * for compatibility.
	 *
	 * If the stack is not growable, return success.  This signals
	 * the caller to proceed as it normally would with normal vm.
	 */
	if (stack_entry->aux.avail_ssize < 1 ||
	    addr >= stack_entry->start ||