/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2003-2022 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/serialize.h>
#include <sys/lock.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
#include <sys/shm.h>
#include <sys/tree.h>
#include <sys/malloc.h>
#include <sys/objcache.h>
#include <sys/kern_syscall.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vm/vm_zone.h>

#include <sys/random.h>
#include <sys/sysctl.h>
#include <sys/spinlock.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>

/*
 * Virtual memory maps provide for the mapping, protection, and sharing
 * of virtual memory objects.  In addition, this module provides for an
 * efficient virtual copy of memory from one map to another.
 *
 * Synchronization is required prior to most operations.
 *
 * Maps consist of an ordered doubly-linked list of simple entries.
 * A hint and an RB tree are used to speed up lookups.
 *
 * Callers looking to modify maps specify start/end addresses which cause
 * the related map entry to be clipped if necessary, and then later
 * recombined if the pieces remain compatible.
 *
 * Virtual copy operations are performed by copying VM object references
 * from one map to another, and then marking both regions as copy-on-write.
 */
static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
static void vmspace_dtor(void *obj, void *privdata);
static void vmspace_terminate(struct vmspace *vm, int final);

MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
static struct objcache *vmspace_cache;

/*
 * per-cpu page table cross mappings are initialized in early boot
 * and might require a considerable number of vm_map_entry structures.
 */
#define MAPENTRYBSP_CACHE	(MAXCPU+1)
#define MAPENTRYAP_CACHE	8

/*
 * Partitioning threaded programs with large anonymous memory areas can
 * improve concurrent fault performance.
 */
#define MAP_ENTRY_PARTITION_SIZE	((vm_offset_t)(32 * 1024 * 1024))
#define MAP_ENTRY_PARTITION_MASK	(MAP_ENTRY_PARTITION_SIZE - 1)

#define VM_MAP_ENTRY_WITHIN_PARTITION(entry)	\
	((((entry)->ba.start ^ (entry)->ba.end) & ~MAP_ENTRY_PARTITION_MASK) == 0)

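/*
 * Worked example for the test above (illustrative only, not used by the
 * code): the 32MB partition size means the mask covers the low 25 address
 * bits, so VM_MAP_ENTRY_WITHIN_PARTITION() simply asks whether start and
 * end agree in all bits above the partition.  An entry spanning
 * [0x00100000, 0x00200000) stays inside the first partition, while one
 * spanning [0x01f00000, 0x02100000) crosses the 32MB boundary and fails
 * the test.
 */
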
static struct vm_zone mapentzone_store;
__read_mostly static vm_zone_t mapentzone;

static struct vm_map_entry map_entry_init[MAX_MAPENT];
static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];

__read_mostly static int randomize_mmap;
SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
    "Randomize mmap offsets");
__read_mostly static int vm_map_relock_enable = 1;
SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
    &vm_map_relock_enable, 0, "insert pop pgtable optimization");
__read_mostly static int vm_map_partition_enable = 1;
SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
    &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
__read_mostly static int vm_map_backing_limit = 5;
SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
    &vm_map_backing_limit, 0, "ba.backing_ba link depth");
__read_mostly static int vm_map_backing_shadow_test = 1;
SYSCTL_INT(_vm, OID_AUTO, map_backing_shadow_test, CTLFLAG_RW,
    &vm_map_backing_shadow_test, 0, "ba.object shadow test");

static void vmspace_drop_notoken(struct vmspace *vm);
static void vm_map_entry_shadow(vm_map_entry_t entry);
static vm_map_entry_t vm_map_entry_create(int *);
static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
static void vm_map_entry_dispose_ba (vm_map_entry_t entry, vm_map_backing_t ba);
static void vm_map_backing_replicated(vm_map_t map,
		vm_map_entry_t entry, int flags);
static void vm_map_backing_adjust_start(vm_map_entry_t entry,
		vm_ooffset_t start);
static void vm_map_backing_adjust_end(vm_map_entry_t entry,
		vm_ooffset_t end);
static void vm_map_backing_attach (vm_map_entry_t entry, vm_map_backing_t ba);
static void vm_map_backing_detach (vm_map_entry_t entry, vm_map_backing_t ba);
static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
		vm_map_entry_t);
static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
		vm_offset_t start, vm_offset_t end, int *countp, int flags);
static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
		vm_offset_t vaddr, int *countp);

#define MAP_BACK_CLIPPED	0x0001
#define MAP_BACK_BASEOBJREFD	0x0002

/*
 * Initialize the vm_map module.  Must be called before any other vm_map
 * routines.
 *
 * Map and entry structures are allocated from the general purpose
 * memory pool with some exceptions:
 *
 *	- The kernel map is allocated statically.
 *	- Initial kernel map entries are allocated out of a static pool.
 *	- We must set ZONE_SPECIAL here or the early boot code can get
 *	  stuck if there are >63 cores.
 *
 *	These restrictions are necessary since malloc() uses the
 *	maps and requires map entries.
 *
 * Called from the low level boot code only.
 */
void
vm_map_startup(void)
{
	mapentzone = &mapentzone_store;
	zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
		  map_entry_init, MAX_MAPENT);
	mapentzone_store.zflags |= ZONE_SPECIAL;
}

/*
 * Called prior to any vmspace allocations.
 *
 * Called from the low level boot code only.
 */
void
vm_init2(void)
{
	vmspace_cache = objcache_create_mbacked(M_VMSPACE,
						sizeof(struct vmspace),
						0, ncpus * 4,
						vmspace_ctor, vmspace_dtor,
						NULL);
	zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
	pmap_init2();
	vm_object_init2();
}

/*
 * objcache support.  We leave the pmap root cached as long as possible
 * for performance reasons.
 */
static
boolean_t
vmspace_ctor(void *obj, void *privdata, int ocflags)
{
	struct vmspace *vm = obj;

	bzero(vm, sizeof(*vm));
	vm->vm_refcnt = VM_REF_DELETED;

	return 1;
}

static
void
vmspace_dtor(void *obj, void *privdata)
{
	struct vmspace *vm = obj;

	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
	pmap_puninit(vmspace_pmap(vm));
}

/*
 * Red-black tree functions
 *
 * The caller must hold the related map lock.
 */
static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);

/* a->ba.start is address, and the only field which must be initialized */
static int
rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
{
	if (a->ba.start < b->ba.start)
		return(-1);
	else if (a->ba.start > b->ba.start)
		return(1);
	return(0);
}

/*
 * Initialize the vmspace ref/hold counts for vmspace0.  There is a
 * holdcnt for every refcnt.
 */
void
vmspace_initrefs(struct vmspace *vm)
{
	vm->vm_refcnt = 1;
	vm->vm_holdcnt = 1;
}

/*
 * Allocate a vmspace structure, including a vm_map and pmap.
 * Initialize numerous fields.  While the initial allocation is zeroed,
 * subsequent reuse from the objcache leaves elements of the structure
 * intact (particularly the pmap), so portions must be zeroed.
 *
 * Returns a referenced vmspace.
 *
 * No requirements.
 */
struct vmspace *
vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
	struct vmspace *vm;

	vm = objcache_get(vmspace_cache, M_WAITOK);

	bzero(&vm->vm_startcopy,
	      (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
	vm_map_init(&vm->vm_map, min, max, NULL);	/* initializes token */

	/*
	 * NOTE: hold acquires the token for safety.
	 *
	 * On return vmspace is referenced (refs=1, hold=1).  That is,
	 * each refcnt also has a holdcnt.  There can be additional holds
	 * (holdcnt) above and beyond the refcnt.  Finalization is handled in
	 * two stages, one on refs 1->0, and the second on hold 1->0.
	 */
	KKASSERT(vm->vm_holdcnt == 0);
	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
	vmspace_initrefs(vm);
	vmspace_hold(vm);
	pmap_pinit(vmspace_pmap(vm));		/* (some fields reused) */
	vm->vm_map.pmap = vmspace_pmap(vm);	/* XXX */
	vm->vm_shm = NULL;
	vm->vm_flags = 0;
	cpu_vmspace_alloc(vm);
	vmspace_drop(vm);

	return (vm);
}

/*
 * NOTE: Can return 0 if the vmspace is exiting.
 */
int
vmspace_getrefs(struct vmspace *vm)
{
	int32_t n;

	n = vm->vm_refcnt;
	cpu_ccfence();
	if (n & VM_REF_DELETED)
		n = -1;
	return n;
}

void
vmspace_hold(struct vmspace *vm)
{
	atomic_add_int(&vm->vm_holdcnt, 1);
	lwkt_gettoken(&vm->vm_map.token);
}

/*
 * Drop with final termination interlock.
 */
void
vmspace_drop(struct vmspace *vm)
{
	lwkt_reltoken(&vm->vm_map.token);
	vmspace_drop_notoken(vm);
}

static void
vmspace_drop_notoken(struct vmspace *vm)
{
	if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
		if (vm->vm_refcnt & VM_REF_DELETED)
			vmspace_terminate(vm, 1);
	}
}

/*
 * A vmspace object must not be in a terminated state to be able to obtain
 * additional refs on it.
 *
 * These are official references to the vmspace; the count is used to check
 * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
 *
 * XXX we need to combine hold & ref together into one 64-bit field to allow
 * holds to prevent stage-1 termination.
 */
void
vmspace_ref(struct vmspace *vm)
{
	uint32_t n;

	atomic_add_int(&vm->vm_holdcnt, 1);
	n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
	KKASSERT((n & VM_REF_DELETED) == 0);
}

/*
 * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
 * termination of the vmspace.  Then, on the final drop of the hold we
 * will do stage-2 final termination.
 */
void
vmspace_rel(struct vmspace *vm)
{
	uint32_t n;

	/*
	 * Drop refs.  Each ref also has a hold which is also dropped.
	 *
	 * When refs hits 0 compete to get the VM_REF_DELETED flag (holds
	 * prevent finalization) to start termination processing.
	 * Finalization occurs when the last hold count drops to 0.
	 */
	n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
	while (n == 0) {
		if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
			vmspace_terminate(vm, 0);
			break;
		}
		n = vm->vm_refcnt;
		cpu_ccfence();
	}
	vmspace_drop_notoken(vm);
}

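/*
 * Illustrative lifecycle sketch (a summary of the ref/hold rules above,
 * not code called from this file):
 *
 *	vmspace_ref(vm);	official sharing ref, also gains a hold
 *	vmspace_hold(vm);	foreign accessor; also locks the map token
 *	...access vm...
 *	vmspace_drop(vm);	release the token, drop the hold
 *	vmspace_rel(vm);	refs 1->0 runs stage-1 termination; the
 *				final hold 1->0 runs stage-2 (final)
 *				termination
 */
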
/*
 * This is called during exit indicating that the vmspace is no
 * longer in use by an exiting process, but the process has not yet
 * been reaped.
 *
 * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
 * to prevent stage-2 until the process is reaped.  Note the order of
 * operations: we must hold first.
 *
 * No requirements.
 */
void
vmspace_relexit(struct vmspace *vm)
{
	atomic_add_int(&vm->vm_holdcnt, 1);
	vmspace_rel(vm);
}

/*
 * Called during reap to disconnect the remainder of the vmspace from
 * the process.  On the hold drop the vmspace termination is finalized.
 *
 * No requirements.
 */
void
vmspace_exitfree(struct proc *p)
{
	struct vmspace *vm;

	vm = p->p_vmspace;
	p->p_vmspace = NULL;
	vmspace_drop_notoken(vm);
}

/*
 * Called in two cases:
 *
 * (1) When the last refcnt is dropped and the vmspace becomes inactive,
 *     called with final == 0.  refcnt will be (u_int)-1 at this point,
 *     and holdcnt will still be non-zero.
 *
 * (2) When holdcnt becomes 0, called with final == 1.  There should no
 *     longer be anyone with access to the vmspace.
 *
 * VMSPACE_EXIT1 flags the primary deactivation.
 * VMSPACE_EXIT2 flags the last reap.
 */
static void
vmspace_terminate(struct vmspace *vm, int final)
{
	int count;

	lwkt_gettoken(&vm->vm_map.token);
	if (final == 0) {
		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
		vm->vm_flags |= VMSPACE_EXIT1;

		/*
		 * Get rid of most of the resources.  Leave the kernel pmap
		 * intact.
		 *
		 * If the pmap does not contain wired pages we can bulk-delete
		 * the pmap as a performance optimization before removing the
		 * related mappings.
		 *
		 * If the pmap contains wired pages we cannot do this
		 * pre-optimization because currently vm_fault_unwire()
		 * expects the pmap pages to exist and will not decrement
		 * p->wire_count if they do not.
		 */
		shmexit(vm);
		if (vmspace_pmap(vm)->pm_stats.wired_count) {
			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
				      VM_MAX_USER_ADDRESS);
			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
					  VM_MAX_USER_ADDRESS);
		} else {
			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
					  VM_MAX_USER_ADDRESS);
			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
				      VM_MAX_USER_ADDRESS);
		}
		lwkt_reltoken(&vm->vm_map.token);
	} else {
		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
		KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);

		/*
		 * Get rid of remaining basic resources.
		 */
		vm->vm_flags |= VMSPACE_EXIT2;
		shmexit(vm);

		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
		vm_map_lock(&vm->vm_map);
		cpu_vmspace_free(vm);

		/*
		 * Lock the map, to wait out all other references to it.
		 * Delete all of the mappings and pages they hold, then call
		 * the pmap module to reclaim anything left.
		 */
		vm_map_delete(&vm->vm_map,
			      vm_map_min(&vm->vm_map),
			      vm_map_max(&vm->vm_map),
			      &count);
		vm_map_unlock(&vm->vm_map);
		vm_map_entry_release(count);

		pmap_release(vmspace_pmap(vm));
		lwkt_reltoken(&vm->vm_map.token);
		objcache_put(vmspace_cache, vm);
	}
}

/*
 * Swap usage is determined by taking the proportional swap used by
 * VM objects backing the VM map.  To make up for fractional losses,
 * if the VM object has any swap use at all the associated map entries
 * count for at least 1 swap page.
 *
 * No requirements.
 */
vm_offset_t
vmspace_swap_count(struct vmspace *vm)
{
	vm_map_t map = &vm->vm_map;
	vm_map_entry_t cur;
	vm_object_t object;
	vm_offset_t count = 0;
	vm_offset_t n;

	vmspace_hold(vm);

	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
		switch(cur->maptype) {
		case VM_MAPTYPE_NORMAL:
			if ((object = cur->ba.object) == NULL)
				break;
			if (object->swblock_count) {
				n = (cur->ba.end - cur->ba.start) / PAGE_SIZE;
				count += object->swblock_count *
				    SWAP_META_PAGES * n / object->size + 1;
			}
			break;
		default:
			break;
		}
	}
	vmspace_drop(vm);

	return(count);
}

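/*
 * Worked example for the proportional charge above (hypothetical
 * numbers): an object of size 1024 pages with swblock_count == 2,
 * partially mapped by an entry covering 512 pages (n == 512), adds
 *
 *	2 * SWAP_META_PAGES * 512 / 1024 + 1
 *
 * swap pages to the total, i.e. half of the object's swap use plus
 * the minimum one-page charge.
 */
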
/*
 * Calculate the approximate number of anonymous pages in use by
 * this vmspace.  To make up for fractional losses, we count each
 * VM object as having at least 1 anonymous page.
 *
 * No requirements.
 */
vm_offset_t
vmspace_anonymous_count(struct vmspace *vm)
{
	vm_map_t map = &vm->vm_map;
	vm_map_entry_t cur;
	vm_object_t object;
	vm_offset_t count = 0;

	vmspace_hold(vm);
	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
		switch(cur->maptype) {
		case VM_MAPTYPE_NORMAL:
			if ((object = cur->ba.object) == NULL)
				break;
			if (object->type != OBJT_DEFAULT &&
			    object->type != OBJT_SWAP) {
				break;
			}
			count += object->resident_page_count;
			break;
		default:
			break;
		}
	}
	vmspace_drop(vm);

	return(count);
}

/*
 * Initialize an existing vm_map structure such as that in the vmspace
 * structure.  The pmap is initialized elsewhere.
 *
 * No requirements.
 */
void
vm_map_init(struct vm_map *map, vm_offset_t min_addr, vm_offset_t max_addr,
	    pmap_t pmap)
{
	RB_INIT(&map->rb_root);
	spin_init(&map->ilock_spin, "ilock");
	map->ilock_base = NULL;
	map->nentries = 0;
	map->size = 0;
	map->system_map = 0;
	vm_map_min(map) = min_addr;
	vm_map_max(map) = max_addr;
	map->pmap = pmap;
	map->timestamp = 0;
	map->flags = 0;
	bzero(&map->freehint, sizeof(map->freehint));
	lwkt_token_init(&map->token, "vm_map");
	lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
}

/*
 * Find the first possible free address for the specified request length.
 * Returns 0 if we don't have one cached.
 */
static
vm_offset_t
vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
{
	vm_map_freehint_t *scan;

	scan = &map->freehint[0];
	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
		if (scan->length == length && scan->align == align)
			return(scan->start);
		++scan;
	}
	return 0;
}

/*
 * Unconditionally set the freehint.  Called by vm_map_findspace() after
 * it finds an address.  This will help us iterate optimally on the next
 * similar findspace.
 */
static
void
vm_map_freehint_update(vm_map_t map, vm_offset_t start,
		       vm_size_t length, vm_size_t align)
{
	vm_map_freehint_t *scan;

	scan = &map->freehint[0];
	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
		if (scan->length == length && scan->align == align) {
			scan->start = start;
			return;
		}
		++scan;
	}
	scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
	scan->start = start;
	scan->align = align;
	scan->length = length;
	++map->freehint_newindex;
}

/*
 * Update any existing freehints (for any alignment) for the hole we just
 * added.
 */
static
void
vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
{
	vm_map_freehint_t *scan;

	scan = &map->freehint[0];
	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
		if (scan->length <= length && scan->start > start)
			scan->start = start;
		++scan;
	}
}

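/*
 * Illustrative interplay of the freehint helpers above (sketch): a
 * findspace-style consumer first consults the cache, scans from the
 * hinted address, then records where it ended up:
 *
 *	start = vm_map_freehint_find(map, length, align);
 *	...scan for a hole at/after start (or the map minimum if 0)...
 *	vm_map_freehint_update(map, start, length, align);
 *
 * Code which opens a new hole (e.g. entry deletion) then calls
 * vm_map_freehint_hole() so cached hints can move back down to the
 * newly freed range.
 */
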
/*
 * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
 * object in the entry for COW faults.
 *
 * The entire chain including entry->ba (prior to inserting the fronting
 * object) essentially becomes set in stone... elements of it can be paged
 * in or out, but cannot be further modified.
 *
 * NOTE: If we do not optimize the backing chain then a unique copy is not
 *	 needed.  Note, however, that because portions of the chain are
 *	 shared across pmaps we cannot make any changes to the vm_map_backing
 *	 elements themselves.
 *
 * If the map segment is governed by a virtual page table then it is
 * possible to address offsets beyond the mapped area.  Just allocate
 * a maximally sized object for this case.
 *
 * If addref is non-zero an additional reference is added to the returned
 * entry.  This mechanic exists because the additional reference might have
 * to be added atomically and not after return to prevent a premature
 * collapse.  XXX currently there is no collapse code.
 *
 * The vm_map must be exclusively locked.
 * No other requirements.
 */
static
void
vm_map_entry_shadow(vm_map_entry_t entry)
{
	vm_map_backing_t ba;
	vm_size_t length;
	vm_object_t source;
	vm_object_t result;

	/*
	 * Number of pages we have to shadow
	 */
	length = atop(entry->ba.end - entry->ba.start);

	/*
	 * Don't create the new object if the old object isn't shared.
	 * This case occurs quite often when programs fork/exec/wait.
	 *
	 * Caller ensures source exists (all backing_ba's must have objects),
	 * typically indirectly by virtue of the NEEDS_COPY flag being set.
	 * We have a ref on source by virtue of the entry and do not need
	 * to lock it to do this test.
	 */
	source = entry->ba.object;
	KKASSERT(source);

	if (source->type != OBJT_VNODE) {
		if (source->ref_count == 1 &&
		    source->handle == NULL &&
		    (source->type == OBJT_DEFAULT ||
		     source->type == OBJT_SWAP)) {
			goto done;
		}
	}
	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
	vm_object_hold_shared(source);

	/*
	 * Once it becomes part of a backing_ba chain it can wind up anywhere,
	 * drop the ONEMAPPING flag now.
	 */
	vm_object_clear_flag(source, OBJ_ONEMAPPING);

	/*
	 * Allocate a new object with the given length.  The new object
	 * is returned referenced but we may have to add another one.
	 * If we are adding a second reference we must clear OBJ_ONEMAPPING.
	 * (typically because the caller is about to clone a vm_map_entry).
	 *
	 * The source object currently has an extra reference to prevent
	 * collapses into it while we mess with its shadow list, which
	 * we will remove later in this routine.
	 *
	 * The target object may require a second reference if asked for one
	 * by the caller.
	 */
	result = vm_object_allocate_hold(OBJT_DEFAULT, length);
	if (result == NULL)
		panic("vm_object_shadow: no object for shadowing");

	/*
	 * The new object shadows the source object.
	 *
	 * Try to optimize the result object's page color when shadowing
	 * in order to maintain page coloring consistency in the combined
	 * shadowed object.
	 *
	 * The source object is moved to ba, retaining its existing ref-count.
	 * No additional ref is needed.
	 *
	 * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
	 */
	vm_map_backing_detach(entry, &entry->ba);
	*ba = entry->ba;			/* previous ba */
	entry->ba.object = result;		/* new ba (at head of entry) */
	entry->ba.backing_ba = ba;
	entry->ba.backing_count = ba->backing_count + 1;
	entry->ba.offset = 0;

	/* cpu localization twist */
	result->pg_color = vm_quickcolor();

	vm_map_backing_attach(entry, &entry->ba);
	vm_map_backing_attach(entry, ba);

	/*
	 * Adjust the return storage.  Drop the ref on source before
	 * returning.
	 */
	vm_object_drop(result);
	vm_object_drop(source);
done:
	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
}

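/*
 * Illustrative picture of the transformation performed above (sketch):
 *
 *	before:	entry->ba -----------------> object S (shared)
 *
 *	after:	entry->ba -----------------> object R (new, OBJT_DEFAULT)
 *		entry->ba.backing_ba ------> old ba ---> object S
 *
 * COW faults now populate R while S remains further down the backing
 * chain as a read-only source.
 */
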
/*
 * Allocate an object for a vm_map_entry.
 *
 * Object allocation for anonymous mappings is deferred as long as possible.
 * This function is called when we can defer no longer, generally when a map
 * entry might be split or forked or takes a page fault.
 *
 * If the map segment is governed by a virtual page table then it is
 * possible to address offsets beyond the mapped area.  Just allocate
 * a maximally sized object for this case.
 *
 * The vm_map must be exclusively locked.
 * No other requirements.
 */
void
vm_map_entry_allocate_object(vm_map_entry_t entry)
{
	vm_object_t obj;

	/*
	 * ba.offset is NOT cumulatively added in the backing_ba scan like
	 * it was in the old object chain, so we can assign whatever offset
	 * we like to the new object.
	 *
	 * For now assign a value of 0 to make debugging object sizes
	 * easier.
	 */
	entry->ba.offset = 0;

	obj = vm_object_allocate(OBJT_DEFAULT,
				 atop(entry->ba.end - entry->ba.start) +
				 entry->ba.offset);
	entry->ba.object = obj;
	vm_map_backing_attach(entry, &entry->ba);
}

/*
 * Set an initial negative count so the first attempt to reserve
 * space preloads a bunch of vm_map_entry's for this cpu.  Also
 * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
 * map a new page for vm_map_entry structures.  SMP systems are
 * particularly sensitive.
 *
 * This routine is called in early boot so we cannot just call
 * vm_map_entry_reserve().
 *
 * Called from the low level boot code only (for each cpu).
 *
 * WARNING! Take care not to have too-big a static/BSS structure here
 *	    as MAXCPU can be 256+, otherwise the loader's 64MB heap
 *	    can get blown out by the kernel plus the initrd image.
 */
void
vm_map_entry_reserve_cpu_init(globaldata_t gd)
{
	vm_map_entry_t entry;
	int count;
	int i;

	atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
	if (gd->gd_cpuid == 0) {
		entry = &cpu_map_entry_init_bsp[0];
		count = MAPENTRYBSP_CACHE;
	} else {
		entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
		count = MAPENTRYAP_CACHE;
	}
	for (i = 0; i < count; ++i, ++entry) {
		MAPENT_FREELIST(entry) = gd->gd_vme_base;
		gd->gd_vme_base = entry;
	}
}

/*
 * Reserves vm_map_entry structures so code later on can manipulate
 * map_entry structures within a locked map without blocking trying
 * to allocate a new vm_map_entry.
 *
 * No requirements.
 *
 * WARNING! We must not decrement gd_vme_avail until after we have
 *	    ensured that sufficient entries exist, otherwise we can
 *	    get into an endless call recursion in the zalloc code
 *	    itself.
 */
int
vm_map_entry_reserve(int count)
{
	struct globaldata *gd = mycpu;
	vm_map_entry_t entry;

	/*
	 * Make sure we have enough structures in gd_vme_base to handle
	 * the reservation request.
	 *
	 * Use a critical section to protect against VM faults.  It might
	 * not be needed, but we have to be careful here.
	 */
	if (gd->gd_vme_avail < count) {
		crit_enter();
		while (gd->gd_vme_avail < count) {
			entry = zalloc(mapentzone);
			MAPENT_FREELIST(entry) = gd->gd_vme_base;
			gd->gd_vme_base = entry;
			atomic_add_int(&gd->gd_vme_avail, 1);
		}
		crit_exit();
	}
	atomic_add_int(&gd->gd_vme_avail, -count);

	return(count);
}

/*
 * Releases previously reserved vm_map_entry structures that were not
 * used.  If we have too much junk in our per-cpu cache clean some of
 * it out.
 *
 * No requirements.
 */
void
vm_map_entry_release(int count)
{
	struct globaldata *gd = mycpu;
	vm_map_entry_t entry;
	vm_map_entry_t efree;

	count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
	if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
		efree = NULL;
		crit_enter();
		while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
			entry = gd->gd_vme_base;
			KKASSERT(entry != NULL);
			gd->gd_vme_base = MAPENT_FREELIST(entry);
			atomic_add_int(&gd->gd_vme_avail, -1);
			MAPENT_FREELIST(entry) = efree;
			efree = entry;
		}
		crit_exit();
		while ((entry = efree) != NULL) {
			efree = MAPENT_FREELIST(efree);
			zfree(mapentzone, entry);
		}
	}
}

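/*
 * Typical reservation bracket (illustrative; vmspace_terminate() above
 * uses exactly this pattern):
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	...clip/link/delete entries without blocking on allocation...
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */
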
/*
 * Reserve map entry structures for use in kernel_map itself.  These
 * entries have *ALREADY* been reserved on a per-cpu basis when the map
 * was inited.  This function is used by zalloc() to avoid a recursion
 * when zalloc() itself needs to allocate additional kernel memory.
 *
 * This function works like the normal reserve but does not load the
 * vm_map_entry cache (because that would result in an infinite
 * recursion).  Note that gd_vme_avail may go negative.  This is expected.
 *
 * Any caller of this function must be sure to renormalize after
 * potentially eating entries to ensure that the reserve supply
 * remains intact.
 *
 * No requirements.
 */
int
vm_map_entry_kreserve(int count)
{
	struct globaldata *gd = mycpu;

	atomic_add_int(&gd->gd_vme_avail, -count);
	KASSERT(gd->gd_vme_base != NULL,
		("no reserved entries left, gd_vme_avail = %d",
		 gd->gd_vme_avail));
	return(count);
}

/*
 * Release previously reserved map entries for kernel_map.  We do not
 * attempt to clean up like the normal release function as this would
 * cause an unnecessary (but probably not fatal) deep procedure call.
 *
 * No requirements.
 */
void
vm_map_entry_krelease(int count)
{
	struct globaldata *gd = mycpu;

	atomic_add_int(&gd->gd_vme_avail, count);
}

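/*
 * Illustrative bracket for the kernel_map variants above (sketch):
 * zalloc()-internal code pairs these to dip into the pre-loaded per-cpu
 * reserve without recursing back into zalloc():
 *
 *	count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
 *	...extend kernel_map...
 *	vm_map_entry_krelease(count);
 *
 * The caller must eventually renormalize via the normal reserve path.
 */
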
/*
 * Allocates a VM map entry for insertion.  No entry fields are filled in.
 *
 * The entries should have previously been reserved.  The reservation count
 * is tracked in (*countp).
 *
 * No requirements.
 */
static vm_map_entry_t
vm_map_entry_create(int *countp)
{
	struct globaldata *gd = mycpu;
	vm_map_entry_t entry;

	KKASSERT(*countp > 0);
	--*countp;
	crit_enter();
	entry = gd->gd_vme_base;
	KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
	gd->gd_vme_base = MAPENT_FREELIST(entry);
	crit_exit();

	return(entry);
}

/*
 * Attach and detach backing store elements.
 */
static void
vm_map_backing_attach(vm_map_entry_t entry, vm_map_backing_t ba)
{
	vm_object_t obj;

	switch(entry->maptype) {
	case VM_MAPTYPE_NORMAL:
		obj = ba->object;
		lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
		TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry);
		lockmgr(&obj->backing_lk, LK_RELEASE);
		break;
	case VM_MAPTYPE_UKSMAP:
		ba->uksmap(ba, UKSMAPOP_ADD, entry->aux.dev, NULL);
		break;
	}
}

static void
vm_map_backing_detach(vm_map_entry_t entry, vm_map_backing_t ba)
{
	vm_object_t obj;

	switch(entry->maptype) {
	case VM_MAPTYPE_NORMAL:
		obj = ba->object;
		lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
		TAILQ_REMOVE(&obj->backing_list, ba, entry);
		lockmgr(&obj->backing_lk, LK_RELEASE);
		break;
	case VM_MAPTYPE_UKSMAP:
		ba->uksmap(ba, UKSMAPOP_REM, entry->aux.dev, NULL);
		break;
	}
}

/*
 * Dispose of the dynamically allocated backing_ba chain associated
 * with a vm_map_entry.
 *
 * We decrement the (possibly shared) element and kfree() on the
 * 1->0 transition.  We only iterate to the next backing_ba when
 * the previous one went through a 1->0 transition.
 *
 * These can only be normal vm_object based backings.
 */
static void
vm_map_entry_dispose_ba(vm_map_entry_t entry, vm_map_backing_t ba)
{
	vm_map_backing_t next;

	while (ba) {
		if (ba->map_object) {
			vm_map_backing_detach(entry, ba);
			vm_object_deallocate(ba->object);
		}
		next = ba->backing_ba;
		kfree(ba, M_MAP_BACKING);
		ba = next;
	}
}

/*
 * Dispose of a vm_map_entry that is no longer being referenced.
 *
 * No requirements.
 */
static void
vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
{
	struct globaldata *gd = mycpu;

	/*
	 * Dispose of the base object and the backing link.
	 */
	switch(entry->maptype) {
	case VM_MAPTYPE_NORMAL:
		if (entry->ba.map_object) {
			vm_map_backing_detach(entry, &entry->ba);
			vm_object_deallocate(entry->ba.object);
		}
		break;
	case VM_MAPTYPE_SUBMAP:
		break;
	case VM_MAPTYPE_UKSMAP:
		vm_map_backing_detach(entry, &entry->ba);
		break;
	default:
		break;
	}
	vm_map_entry_dispose_ba(entry, entry->ba.backing_ba);

	/*
	 * Cleanup for safety.
	 */
	entry->ba.backing_ba = NULL;
	entry->ba.object = NULL;
	entry->ba.offset = 0;

	++*countp;
	crit_enter();
	MAPENT_FREELIST(entry) = gd->gd_vme_base;
	gd->gd_vme_base = entry;
	crit_exit();
}

/*
 * Insert/remove entries from maps.
 *
 * The related map must be exclusively locked.
 * The caller must hold map->token.
 * No other requirements.
 */
static __inline void
vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
{
	ASSERT_VM_MAP_LOCKED(map);

	map->nentries++;
	if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
		panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
}

static __inline void
vm_map_entry_unlink(vm_map_t map,
		    vm_map_entry_t entry)
{
	ASSERT_VM_MAP_LOCKED(map);

	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
		panic("vm_map_entry_unlink: attempt to mess with "
		      "locked entry! %p", entry);
	}
	vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
	map->nentries--;
}

/*
 * Finds the map entry containing (or immediately preceding) the specified
 * address in the given map.  The entry is returned in (*entry).
 *
 * The boolean result indicates whether the address is actually contained
 * in the map.
 *
 * The related map must be locked.
 * No other requirements.
 */
boolean_t
vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
{
	vm_map_entry_t tmp;
	vm_map_entry_t last;

	ASSERT_VM_MAP_LOCKED(map);

	/*
	 * Locate the record from the top of the tree.  'last' tracks the
	 * closest prior record and is returned if no match is found, which
	 * in binary tree terms means tracking the most recent right-branch
	 * taken.  If there is no prior record, *entry is set to NULL.
	 */
	last = NULL;
	tmp = RB_ROOT(&map->rb_root);

	while (tmp) {
		if (address >= tmp->ba.start) {
			if (address < tmp->ba.end) {
				*entry = tmp;
				return(TRUE);
			}
			last = tmp;
			tmp = RB_RIGHT(tmp, rb_entry);
		} else {
			tmp = RB_LEFT(tmp, rb_entry);
		}
	}
	*entry = last;
	return (FALSE);
}

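/*
 * Illustrative use of the lookup semantics above (sketch):
 *
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		...addr lies inside [entry->ba.start, entry->ba.end)...
 *	} else if (entry) {
 *		...entry is the closest entry preceding addr...
 *	} else {
 *		...addr precedes every entry in the map...
 *	}
 */
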
/*
 * Inserts the given whole VM object into the target map at the specified
 * address range.  The object's size should match that of the address range.
 *
 * The map must be exclusively locked.
 * The object must be held.
 * The caller must have reserved sufficient vm_map_entry structures.
 *
 * If object is non-NULL, ref count must be bumped by caller prior to
 * making call to account for the new entry.  XXX API is a bit messy.
 */
int
vm_map_insert(vm_map_t map, int *countp,
	      void *map_object, void *map_aux,
	      vm_ooffset_t offset, void *aux_info,
	      vm_offset_t start, vm_offset_t end,
	      vm_maptype_t maptype, vm_subsys_t id,
	      vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_map_entry_t new_entry;
	vm_map_entry_t prev_entry;
	vm_map_entry_t next;
	vm_map_entry_t temp_entry;
	vm_eflags_t protoeflags;
	vm_object_t object;
	int must_drop = 0;

	if (maptype == VM_MAPTYPE_UKSMAP)
		object = NULL;
	else
		object = map_object;

	ASSERT_VM_MAP_LOCKED(map);
	if (object)
		ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));

	/*
	 * Check that the start and end points are not bogus.
	 */
	if ((start < vm_map_min(map)) || (end > vm_map_max(map)) ||
	    (start >= end)) {
		return (KERN_INVALID_ADDRESS);
	}

	/*
	 * Find the entry prior to the proposed starting address; if it's part
	 * of an existing entry, this range is bogus.
	 */
	if (vm_map_lookup_entry(map, start, &temp_entry))
		return (KERN_NO_SPACE);
	prev_entry = temp_entry;

	/*
	 * Assert that the next entry doesn't overlap the end point.
	 */
	if (prev_entry)
		next = vm_map_rb_tree_RB_NEXT(prev_entry);
	else
		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
	if (next && next->ba.start < end)
		return (KERN_NO_SPACE);

	protoeflags = 0;

	if (cow & MAP_COPY_ON_WRITE)
		protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;

	if (cow & MAP_NOFAULT) {
		protoeflags |= MAP_ENTRY_NOFAULT;

		KASSERT(object == NULL,
			("vm_map_insert: paradoxical MAP_NOFAULT request"));
	}
	if (cow & MAP_DISABLE_SYNCER)
		protoeflags |= MAP_ENTRY_NOSYNC;
	if (cow & MAP_DISABLE_COREDUMP)
		protoeflags |= MAP_ENTRY_NOCOREDUMP;
	if (cow & MAP_IS_STACK)
		protoeflags |= MAP_ENTRY_STACK;
	if (cow & MAP_IS_KSTACK)
		protoeflags |= MAP_ENTRY_KSTACK;

	lwkt_gettoken(&map->token);

	if (object) {
		;
	} else if (prev_entry &&
		   (prev_entry->eflags == protoeflags) &&
		   (prev_entry->ba.end == start) &&
		   (prev_entry->wired_count == 0) &&
		   (prev_entry->id == id) &&
		   prev_entry->maptype == maptype &&
		   maptype == VM_MAPTYPE_NORMAL &&
		   prev_entry->ba.backing_ba == NULL &&	/* not backed */
		   ((prev_entry->ba.object == NULL) ||
		    vm_object_coalesce(prev_entry->ba.object,
			OFF_TO_IDX(prev_entry->ba.offset),
			(vm_size_t)(prev_entry->ba.end - prev_entry->ba.start),
			(vm_size_t)(end - prev_entry->ba.end)))) {
		/*
		 * We were able to extend the object.  Determine if we
		 * can extend the previous map entry to include the
		 * new range as well.
		 */
		if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
		    (prev_entry->protection == prot) &&
		    (prev_entry->max_protection == max)) {
			map->size += (end - prev_entry->ba.end);
			vm_map_backing_adjust_end(prev_entry, end);
			vm_map_simplify_entry(map, prev_entry, countp);
			lwkt_reltoken(&map->token);
			return (KERN_SUCCESS);
		}

		/*
		 * If we can extend the object but cannot extend the
		 * map entry, we have to create a new map entry.  We
		 * must bump the ref count on the extended object to
		 * account for it.  object may be NULL.
		 */
		object = prev_entry->ba.object;
		offset = prev_entry->ba.offset +
			 (prev_entry->ba.end - prev_entry->ba.start);
		if (object) {
			vm_object_hold(object);
			vm_object_lock_swap(); /* map->token order */
			vm_object_reference_locked(object);
			map_object = object;
			must_drop = 1;
		}
	}

	/*
	 * NOTE: if conditionals fail, object can be NULL here.  This occurs
	 * in things like the buffer map where we manage kva but do not manage
	 * backing objects.
	 */

	/*
	 * Create a new entry
	 */
	new_entry = vm_map_entry_create(countp);
	new_entry->ba.pmap = map->pmap;
	new_entry->ba.start = start;
	new_entry->ba.end = end;
	new_entry->id = id;

	new_entry->maptype = maptype;
	new_entry->eflags = protoeflags;
	new_entry->aux.master_pde = 0;		/* in case size is different */
	new_entry->aux.map_aux = map_aux;
	new_entry->ba.map_object = map_object;
	new_entry->ba.backing_ba = NULL;
	new_entry->ba.backing_count = 0;
	new_entry->ba.offset = offset;
	new_entry->ba.aux_info = aux_info;
	new_entry->ba.flags = 0;
	new_entry->ba.pmap = map->pmap;

	new_entry->inheritance = VM_INHERIT_DEFAULT;
	new_entry->protection = prot;
	new_entry->max_protection = max;
	new_entry->wired_count = 0;

	/*
	 * Insert the new entry into the list
	 */
	vm_map_backing_replicated(map, new_entry, MAP_BACK_BASEOBJREFD);
	vm_map_entry_link(map, new_entry);
	map->size += new_entry->ba.end - new_entry->ba.start;

	/*
	 * Don't worry about updating freehint[] when inserting; allow
	 * addresses to be lower than the actual first free spot.
	 */
#if 0
	/*
	 * Temporarily removed to avoid MAP_STACK panic, due to
	 * MAP_STACK being a huge hack.  Will be added back in
	 * when MAP_STACK (and the user stack mapping) is fixed.
	 */
	/*
	 * It may be possible to simplify the entry
	 */
	vm_map_simplify_entry(map, new_entry, countp);
#endif

	/*
	 * Try to pre-populate the page table.  Mappings governed by virtual
	 * page tables cannot be prepopulated without a lot of work, so
	 * don't try.
	 */
	if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
	    maptype != VM_MAPTYPE_UKSMAP) {
		int dorelock = 0;

		if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
			dorelock = 1;
			vm_object_lock_swap();
			vm_object_drop(object);
		}
		pmap_object_init_pt(map->pmap, new_entry,
				    new_entry->ba.start,
				    new_entry->ba.end - new_entry->ba.start,
				    cow & MAP_PREFAULT_PARTIAL);
		if (dorelock) {
			vm_object_hold(object);
			vm_object_lock_swap();
		}
	}
	lwkt_reltoken(&map->token);
	if (must_drop)
		vm_object_drop(object);

	return (KERN_SUCCESS);
}

1437 | ||
1438 | /* | |
1439 | * Find sufficient space for `length' bytes in the given map, starting at | |
46754a20 | 1440 | * `start'. Returns 0 on success, 1 on no space. |
e9bb90e8 MD |
1441 | * |
1442 | * This function will return an arbitrarily aligned pointer. If no | |
1443 | * particular alignment is required you should pass align as 1. Note that | |
1444 | * the map may return PAGE_SIZE aligned pointers if all the lengths used in | |
1445 | * the map are a multiple of PAGE_SIZE, even if you pass a smaller align | |
1446 | * argument. | |
1447 | * | |
1448 | * 'align' should be a power of 2 but is not required to be. | |
46754a20 MD |
1449 | * |
1450 | * The map must be exclusively locked. | |
1451 | * No other requirements. | |
984263bc MD |
1452 | */ |
1453 | int | |
c809941b | 1454 | vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length, |
9388fcaa | 1455 | vm_size_t align, int flags, vm_offset_t *addr) |
984263bc | 1456 | { |
47ec0953 | 1457 | vm_map_entry_t entry; |
e6b81333 MD |
1458 | vm_map_entry_t tmp; |
1459 | vm_offset_t hole_start; | |
984263bc | 1460 | vm_offset_t end; |
e9bb90e8 | 1461 | vm_offset_t align_mask; |
984263bc | 1462 | |
47ec0953 MD |
1463 | if (start < vm_map_min(map)) |
1464 | start = vm_map_min(map); | |
1465 | if (start > vm_map_max(map)) | |
984263bc MD |
1466 | return (1); |
1467 | ||
e9bb90e8 MD |
1468 | /* |
1469 | * If the alignment is not a power of 2 we will have to use | |
1470 | * a mod/division, set align_mask to a special value. | |
1471 | */ | |
1472 | if ((align | (align - 1)) + 1 != (align << 1)) | |
1473 | align_mask = (vm_offset_t)-1; | |
1474 | else | |
1475 | align_mask = align - 1; | |
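/*
 * Worked example (added, not original source): for align a power
 * of 2, (align | (align - 1)) sets every low bit, so:
 *
 *	align = 8:  (8 | 7) + 1   = 16 == (8 << 1)   -> mask = 7
 *	align = 12: (12 | 11) + 1 = 16 != (12 << 1)  -> mask = -1
 *
 * The -1 sentinel routes non-power-of-2 alignments through the
 * roundup() (mod/division) path in the loop below.
 */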
1476 | ||
984263bc | 1477 | /* |
e6b81333 MD |
1478 | * Use freehint to adjust the start point, hopefully reducing |
1479 | * the iteration to O(1). | |
984263bc | 1480 | */ |
e6b81333 MD |
1481 | hole_start = vm_map_freehint_find(map, length, align); |
1482 | if (start < hole_start) | |
1483 | start = hole_start; | |
1484 | if (vm_map_lookup_entry(map, start, &tmp)) | |
67e7cb85 | 1485 | start = tmp->ba.end; |
47ec0953 | 1486 | entry = tmp; /* may be NULL */ |
984263bc MD |
1487 | |
1488 | /* | |
1489 | * Look through the rest of the map, trying to fit a new region in the | |
1490 | * gap between existing regions, or after the very last region. | |
1491 | */ | |
47ec0953 | 1492 | for (;;) { |
e9bb90e8 MD |
1493 | /* |
1494 | * Adjust the proposed start by the requested alignment, | |
1495 | * be sure that we didn't wrap the address. | |
1496 | */ | |
1497 | if (align_mask == (vm_offset_t)-1) | |
a77a893a | 1498 | end = roundup(start, align); |
e9bb90e8 MD |
1499 | else |
1500 | end = (start + align_mask) & ~align_mask; | |
1501 | if (end < start) | |
1502 | return (1); | |
1503 | start = end; | |
e6b81333 | 1504 | |
984263bc MD |
1505 | /* |
1506 | * Find the end of the proposed new region. Be sure we didn't | |
e9bb90e8 MD |
1507 | * go beyond the end of the map, or wrap around the address. |
1508 | * Then check to see if this is the last entry or if the | |
1509 | * proposed end fits in the gap between this and the next | |
1510 | * entry. | |
984263bc MD |
1511 | */ |
1512 | end = start + length; | |
47ec0953 | 1513 | if (end > vm_map_max(map) || end < start) |
984263bc | 1514 | return (1); |
c809941b MD |
1515 | |
1516 | /* | |
47ec0953 MD |
1517 | * Locate the next entry. We can stop if this is the |
1518 | * last entry (we know we are in-bounds, so that would | |
1519 | * be a success). | |
c809941b | 1520 | */ |
47ec0953 MD |
1521 | if (entry) |
1522 | entry = vm_map_rb_tree_RB_NEXT(entry); | |
1523 | else | |
1524 | entry = RB_MIN(vm_map_rb_tree, &map->rb_root); | |
1525 | if (entry == NULL) | |
984263bc | 1526 | break; |
47ec0953 MD |
1527 | |
1528 | /* | |
1529 | * Determine if the proposed area would overlap the | |
1530 | * next entry. | |
4837705e MD |
1531 | * |
1532 | * When matching against a STACK entry, only allow the | |
1533 | * memory map to intrude on the ungrown portion of the | |
1534 | * STACK entry when MAP_TRYFIXED is set. | |
47ec0953 | 1535 | */ |
67e7cb85 | 1536 | if (entry->ba.start >= end) { |
47ec0953 | 1537 | if ((entry->eflags & MAP_ENTRY_STACK) == 0) |
c809941b | 1538 | break; |
4837705e | 1539 | if (flags & MAP_TRYFIXED) |
c809941b | 1540 | break; |
67e7cb85 | 1541 | if (entry->ba.start - entry->aux.avail_ssize >= end) |
c809941b MD |
1542 | break; |
1543 | } | |
67e7cb85 | 1544 | start = entry->ba.end; |
984263bc | 1545 | } |
e6b81333 MD |
1546 | |
1547 | /* | |
1548 | * Update the freehint | |
1549 | */ | |
1550 | vm_map_freehint_update(map, start, length, align); | |
a8cf2878 MD |
1551 | |
1552 | /* | |
1553 | * Grow the kernel_map if necessary. pmap_growkernel() will panic | |
1554 | * if it fails. The kernel_map is locked and nothing can steal | |
1555 | * our address space if pmap_growkernel() blocks. | |
1556 | * | |
1557 | * NOTE: This may be unconditionally called for kldload areas on | |
1558 | * x86_64 because these do not bump kernel_vm_end (which would | |
1559 | * fill 128G worth of page tables!). Therefore we must not | |
1560 | * retry. | |
1561 | */ | |
1eeaf6b2 | 1562 | if (map == kernel_map) { |
a8cf2878 MD |
1563 | vm_offset_t kstop; |
1564 | ||
1565 | kstop = round_page(start + length); | |
1566 | if (kstop > kernel_vm_end) | |
1567 | pmap_growkernel(start, kstop); | |
984263bc | 1568 | } |
a108bf71 | 1569 | *addr = start; |
984263bc MD |
1570 | return (0); |
1571 | } | |
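/*
 * Hypothetical usage sketch (added, not part of the original file).
 * It mirrors the pattern used by vm_map_find() below; the fragment
 * itself is illustrative only and is not compiled.
 */
#if 0
	vm_offset_t addr;

	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), length, PAGE_SIZE,
			     0, &addr)) {
		vm_map_unlock(map);
		return (KERN_NO_SPACE);		/* returned 1: no space */
	}
	/* addr now holds a first-fit hole of at least 'length' bytes */
	vm_map_unlock(map);
#endif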
1572 | ||
1573 | /* | |
46754a20 | 1574 | * vm_map_find finds an unallocated region in the target address map with |
b12defdc MD |
1575 | * the given length and allocates it. The search is defined to be first-fit |
1576 | * from the specified address; the region found is returned in the same | |
1577 | * parameter. | |
984263bc | 1578 | * |
46754a20 MD |
1579 | * If object is non-NULL, ref count must be bumped by caller |
1580 | * prior to making call to account for the new entry. | |
1581 | * | |
1582 | * No requirements. This function will lock the map temporarily. | |
984263bc MD |
1583 | */ |
1584 | int | |
0adbcbd6 MD |
1585 | vm_map_find(vm_map_t map, void *map_object, void *map_aux, |
1586 | vm_ooffset_t offset, vm_offset_t *addr, | |
3091de50 MD |
1587 | vm_size_t length, vm_size_t align, boolean_t fitit, |
1588 | vm_maptype_t maptype, vm_subsys_t id, | |
1589 | vm_prot_t prot, vm_prot_t max, int cow) | |
984263bc MD |
1590 | { |
1591 | vm_offset_t start; | |
0adbcbd6 | 1592 | vm_object_t object; |
64b5a8a5 | 1593 | void *aux_info; |
03aa8d99 | 1594 | int result; |
a108bf71 | 1595 | int count; |
984263bc | 1596 | |
64b5a8a5 | 1597 | /* |
4aa6d05c MD |
1598 | * Certain UKSMAPs may need aux_info. |
1599 | * | |
1600 | * (map_object is the callback function, aux_info is the process | |
1601 | * or thread, if necessary). | |
64b5a8a5 MD |
1602 | */ |
1603 | aux_info = NULL; | |
1604 | if (maptype == VM_MAPTYPE_UKSMAP) { | |
4aa6d05c MD |
1605 | KKASSERT(map_aux != NULL && map_object != NULL); |
1606 | ||
1607 | switch(minor(((struct cdev *)map_aux))) { | |
1608 | case 5: | |
1609 | /* | |
1610 | * /dev/upmap | |
1611 | */ | |
1612 | aux_info = curproc; | |
1613 | break; | |
1614 | case 6: | |
1615 | /* | |
1616 | * /dev/kpmap | |
1617 | */ | |
1618 | break; | |
1619 | case 7: | |
1620 | /* | |
1621 | * /dev/lpmap | |
1622 | */ | |
1623 | aux_info = curthread->td_lwp; | |
1624 | break; | |
1625 | } | |
0adbcbd6 | 1626 | object = NULL; |
64b5a8a5 | 1627 | } else { |
0adbcbd6 | 1628 | object = map_object; |
64b5a8a5 | 1629 | } |
0adbcbd6 | 1630 | |
984263bc MD |
1631 | start = *addr; |
1632 | ||
a108bf71 | 1633 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
984263bc | 1634 | vm_map_lock(map); |
b12defdc | 1635 | if (object) |
2734d278 | 1636 | vm_object_hold_shared(object); |
c809941b | 1637 | if (fitit) { |
9388fcaa | 1638 | if (vm_map_findspace(map, start, length, align, 0, addr)) { |
552112a0 MD |
1639 | if (object) |
1640 | vm_object_drop(object); | |
984263bc | 1641 | vm_map_unlock(map); |
a108bf71 | 1642 | vm_map_entry_release(count); |
984263bc MD |
1643 | return (KERN_NO_SPACE); |
1644 | } | |
1645 | start = *addr; | |
1646 | } | |
64b5a8a5 MD |
1647 | result = vm_map_insert(map, &count, |
1648 | map_object, map_aux, | |
1649 | offset, aux_info, | |
1650 | start, start + length, | |
3091de50 | 1651 | maptype, id, prot, max, cow); |
b12defdc MD |
1652 | if (object) |
1653 | vm_object_drop(object); | |
984263bc | 1654 | vm_map_unlock(map); |
a108bf71 | 1655 | vm_map_entry_release(count); |
984263bc | 1656 | |
984263bc MD |
1657 | return (result); |
1658 | } | |
1659 | ||
1660 | /* | |
46754a20 MD |
1661 | * Simplify the given map entry by merging with either neighbor. This |
1662 | * routine also has the ability to merge with both neighbors. | |
984263bc | 1663 | * |
46754a20 MD |
1664 | * This routine guarantees that the passed entry remains valid (though |
1665 | * possibly extended). When merging, this routine may delete one or | |
1666 | * both neighbors. No action is taken on entries which have their | |
1667 | * in-transition flag set. | |
984263bc | 1668 | * |
46754a20 | 1669 | * The map must be exclusively locked. |
984263bc MD |
1670 | */ |
1671 | void | |
a108bf71 | 1672 | vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp) |
984263bc MD |
1673 | { |
1674 | vm_map_entry_t next, prev; | |
1675 | vm_size_t prevsize, esize; | |
1676 | ||
1b874851 | 1677 | if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { |
12e4aaff | 1678 | ++mycpu->gd_cnt.v_intrans_coll; |
984263bc MD |
1679 | return; |
1680 | } | |
1681 | ||
1b874851 MD |
1682 | if (entry->maptype == VM_MAPTYPE_SUBMAP) |
1683 | return; | |
0adbcbd6 MD |
1684 | if (entry->maptype == VM_MAPTYPE_UKSMAP) |
1685 | return; | |
1b874851 | 1686 | |
47ec0953 MD |
1687 | prev = vm_map_rb_tree_RB_PREV(entry); |
1688 | if (prev) { | |
67e7cb85 MD |
1689 | prevsize = prev->ba.end - prev->ba.start; |
1690 | if ( (prev->ba.end == entry->ba.start) && | |
1b874851 | 1691 | (prev->maptype == entry->maptype) && |
9de48ead MD |
1692 | (prev->ba.object == entry->ba.object) && |
1693 | (prev->ba.backing_ba == entry->ba.backing_ba) && | |
1694 | (!prev->ba.object || | |
1695 | (prev->ba.offset + prevsize == entry->ba.offset)) && | |
984263bc MD |
1696 | (prev->eflags == entry->eflags) && |
1697 | (prev->protection == entry->protection) && | |
1698 | (prev->max_protection == entry->max_protection) && | |
1699 | (prev->inheritance == entry->inheritance) && | |
3091de50 | 1700 | (prev->id == entry->id) && |
984263bc | 1701 | (prev->wired_count == entry->wired_count)) { |
67e7cb85 MD |
1702 | /* |
1703 | * NOTE: order important. Unlink before gumming up | |
1704 | * the RBTREE w/adjust, adjust before disposal | |
1705 | * of prior entry, to avoid pmap snafus. | |
1706 | */ | |
984263bc | 1707 | vm_map_entry_unlink(map, prev); |
67e7cb85 MD |
1708 | vm_map_backing_adjust_start(entry, prev->ba.start); |
1709 | if (entry->ba.object == NULL) | |
1710 | entry->ba.offset = 0; | |
a108bf71 | 1711 | vm_map_entry_dispose(map, prev, countp); |
984263bc MD |
1712 | } |
1713 | } | |
1714 | ||
47ec0953 MD |
1715 | next = vm_map_rb_tree_RB_NEXT(entry); |
1716 | if (next) { | |
67e7cb85 MD |
1717 | esize = entry->ba.end - entry->ba.start; |
1718 | if ((entry->ba.end == next->ba.start) && | |
1b874851 | 1719 | (next->maptype == entry->maptype) && |
9de48ead MD |
1720 | (next->ba.object == entry->ba.object) && |
1721 | (next->ba.backing_ba == entry->ba.backing_ba) && | |
1722 | (!entry->ba.object || | |
1723 | (entry->ba.offset + esize == next->ba.offset)) && | |
984263bc MD |
1724 | (next->eflags == entry->eflags) && |
1725 | (next->protection == entry->protection) && | |
1726 | (next->max_protection == entry->max_protection) && | |
1727 | (next->inheritance == entry->inheritance) && | |
3091de50 | 1728 | (next->id == entry->id) && |
984263bc | 1729 | (next->wired_count == entry->wired_count)) { |
67e7cb85 MD |
1730 | /* |
1731 | * NOTE: order important. Unlink before gumming up | |
1732 | * the RBTREE w/adjust, adjust before disposal | |
1733 | * of prior entry, to avoid pmap snafus. | |
1734 | */ | |
984263bc | 1735 | vm_map_entry_unlink(map, next); |
67e7cb85 | 1736 | vm_map_backing_adjust_end(entry, next->ba.end); |
a108bf71 | 1737 | vm_map_entry_dispose(map, next, countp); |
984263bc MD |
1738 | } |
1739 | } | |
1740 | } | |
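/*
 * Illustrative note (added, not original source): a successful merge
 * collapses neighbors whose backing store lines up exactly, e.g.:
 *
 *	before:	[A 0x1000-0x2000 obj X off 0][B 0x2000-0x3000 obj X off 0x1000]
 *	after:	[A 0x1000-0x3000 obj X off 0]
 *
 * eflags, protections, inheritance, id and wired_count must all match
 * or the entries are left untouched.
 */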
46754a20 | 1741 | |
984263bc | 1742 | /* |
46754a20 MD |
1743 | * Asserts that the given entry begins at or after the specified address. |
1744 | * If necessary, it splits the entry into two. | |
984263bc | 1745 | */ |
46754a20 MD |
1746 | #define vm_map_clip_start(map, entry, startaddr, countp) \ |
1747 | { \ | |
67e7cb85 | 1748 | if (startaddr > entry->ba.start) \ |
46754a20 | 1749 | _vm_map_clip_start(map, entry, startaddr, countp); \ |
984263bc MD |
1750 | } |
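/*
 * Illustrative note (added, not original source): clipping splits one
 * entry so later range operations land on exact boundaries, e.g.
 * vm_map_clip_start() at 0x2000:
 *
 *	before:	[       0x1000-0x3000        ]
 *	after:	[0x1000-0x2000][0x2000-0x3000]
 *	                       ^ 'entry' now starts here
 */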
1751 | ||
1752 | /* | |
46754a20 MD |
1753 | * This routine is called only when it is known that the entry must be split. |
1754 | * | |
1755 | * The map must be exclusively locked. | |
984263bc MD |
1756 | */ |
1757 | static void | |
46754a20 MD |
1758 | _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start, |
1759 | int *countp) | |
984263bc MD |
1760 | { |
1761 | vm_map_entry_t new_entry; | |
1762 | ||
1763 | /* | |
1764 | * Split off the front portion -- note that we must insert the new | |
1765 | * entry BEFORE this one, so that this entry has the specified | |
1766 | * starting address. | |
1767 | */ | |
1768 | ||
a108bf71 | 1769 | vm_map_simplify_entry(map, entry, countp); |
984263bc MD |
1770 | |
1771 | /* | |
1772 | * If there is no object backing this entry, we might as well create | |
1773 | * one now. If we defer it, an object can get created after the map | |
1774 | * is clipped, and individual objects will be created for the split-up | |
1775 | * map. This is a bit of a hack, but is also about the best place to | |
1776 | * put this improvement. | |
1777 | */ | |
9de48ead | 1778 | if (entry->ba.object == NULL && !map->system_map && |
ce5d7a1c | 1779 | VM_MAP_ENTRY_WITHIN_PARTITION(entry)) { |
53025830 | 1780 | vm_map_entry_allocate_object(entry); |
984263bc MD |
1781 | } |
1782 | ||
67e7cb85 MD |
1783 | /* |
1784 | * NOTE: The replicated function will adjust start, end, and offset | |
1785 | * for the remainder of the backing_ba linkages. We must fixup | |
1786 | * the embedded ba. | |
1787 | */ | |
5b329e62 | 1788 | new_entry = vm_map_entry_create(countp); |
984263bc | 1789 | *new_entry = *entry; |
67e7cb85 | 1790 | new_entry->ba.end = start; |
984263bc | 1791 | |
67e7cb85 MD |
1792 | /* |
1793 | * Ordering is important: make sure the new entry is replicated | |
1794 | * before we cut the existing entry. | |
1795 | */ | |
5b329e62 | 1796 | vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED); |
67e7cb85 | 1797 | vm_map_backing_adjust_start(entry, start); |
47ec0953 | 1798 | vm_map_entry_link(map, new_entry); |
984263bc MD |
1799 | } |
1800 | ||
1801 | /* | |
46754a20 MD |
1802 | * Asserts that the given entry ends at or before the specified address. |
1803 | * If necessary, it splits the entry into two. | |
984263bc | 1804 | * |
46754a20 | 1805 | * The map must be exclusively locked. |
984263bc | 1806 | */ |
46754a20 MD |
1807 | #define vm_map_clip_end(map, entry, endaddr, countp) \ |
1808 | { \ | |
67e7cb85 | 1809 | if (endaddr < entry->ba.end) \ |
46754a20 | 1810 | _vm_map_clip_end(map, entry, endaddr, countp); \ |
984263bc MD |
1811 | } |
1812 | ||
1813 | /* | |
46754a20 MD |
1814 | * This routine is called only when it is known that the entry must be split. |
1815 | * | |
1816 | * The map must be exclusively locked. | |
984263bc MD |
1817 | */ |
1818 | static void | |
46754a20 MD |
1819 | _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end, |
1820 | int *countp) | |
984263bc MD |
1821 | { |
1822 | vm_map_entry_t new_entry; | |
1823 | ||
1824 | /* | |
1825 | * If there is no object backing this entry, we might as well create | |
1826 | * one now. If we defer it, an object can get created after the map | |
1827 | * is clipped, and individual objects will be created for the split-up | |
1828 | * map. This is a bit of a hack, but is also about the best place to | |
1829 | * put this improvement. | |
1830 | */ | |
1831 | ||
9de48ead | 1832 | if (entry->ba.object == NULL && !map->system_map && |
ce5d7a1c | 1833 | VM_MAP_ENTRY_WITHIN_PARTITION(entry)) { |
53025830 | 1834 | vm_map_entry_allocate_object(entry); |
984263bc MD |
1835 | } |
1836 | ||
1837 | /* | |
1838 | * Create a new entry and insert it AFTER the specified entry | |
67e7cb85 MD |
1839 | * |
1840 | * NOTE: The replicated function will adjust start, end, and offset | |
1841 | * for the remainder of the backing_ba linkages. We must fixup | |
1842 | * the embedded ba. | |
984263bc | 1843 | */ |
5b329e62 | 1844 | new_entry = vm_map_entry_create(countp); |
984263bc | 1845 | *new_entry = *entry; |
67e7cb85 MD |
1846 | new_entry->ba.start = end; |
1847 | new_entry->ba.offset += (new_entry->ba.start - entry->ba.start); | |
984263bc | 1848 | |
67e7cb85 MD |
1849 | /* |
1850 | * Ordering is important: make sure the new entry is replicated | |
1851 | * before we cut the existing entry. | |
1852 | */ | |
5b329e62 | 1853 | vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED); |
67e7cb85 | 1854 | vm_map_backing_adjust_end(entry, end); |
47ec0953 | 1855 | vm_map_entry_link(map, new_entry); |
984263bc MD |
1856 | } |
1857 | ||
1858 | /* | |
46754a20 MD |
1859 | * Asserts that the starting and ending region addresses fall within the |
1860 | * valid range for the map. | |
984263bc | 1861 | */ |
46754a20 MD |
1862 | #define VM_MAP_RANGE_CHECK(map, start, end) \ |
1863 | { \ | |
1864 | if (start < vm_map_min(map)) \ | |
1865 | start = vm_map_min(map); \ | |
1866 | if (end > vm_map_max(map)) \ | |
1867 | end = vm_map_max(map); \ | |
1868 | if (start > end) \ | |
1869 | start = end; \ | |
1870 | } | |
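/*
 * Illustrative usage (added, not original source): callers apply this
 * to the raw user-supplied range right after locking the map, e.g.
 *
 *	vm_map_lock(map);
 *	VM_MAP_RANGE_CHECK(map, start, end);
 *
 * after which start <= end and both lie within the map bounds.
 */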
984263bc MD |
1871 | |
1872 | /* | |
46754a20 MD |
1873 | * Used to block when an in-transition collision occurs. The map |
1874 | * is unlocked for the sleep and relocked before the return. | |
984263bc | 1875 | */ |
984263bc | 1876 | void |
641f3b0a | 1877 | vm_map_transition_wait(vm_map_t map, int relock) |
984263bc | 1878 | { |
ff13bc52 | 1879 | tsleep_interlock(map, 0); |
984263bc | 1880 | vm_map_unlock(map); |
ff13bc52 | 1881 | tsleep(map, PINTERLOCKED, "vment", 0); |
641f3b0a MD |
1882 | if (relock) |
1883 | vm_map_lock(map); | |
984263bc MD |
1884 | } |
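/*
 * Illustrative note (added commentary): tsleep_interlock() registers
 * the sleep address before the map lock is dropped, so a wakeup(map)
 * issued between vm_map_unlock() and tsleep() is not lost -- the
 * PINTERLOCKED tsleep consumes the pending wakeup instead of
 * sleeping forever.
 */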
1885 | ||
1886 | /* | |
46754a20 MD |
1887 | * When we do blocking operations with the map lock held it is |
1888 | * possible that a clip might have occurred on our in-transit entry, | |
1889 | * requiring an adjustment to the entry in our loop. These macros | |
1890 | * help the pageable and clip_range code deal with the case. The | |
1891 | * conditional costs virtually nothing if no clipping has occurred. | |
984263bc MD |
1892 | */ |
1893 | ||
47ec0953 MD |
1894 | #define CLIP_CHECK_BACK(entry, save_start) \ |
1895 | do { \ | |
67e7cb85 | 1896 | while (entry->ba.start != save_start) { \ |
47ec0953 MD |
1897 | entry = vm_map_rb_tree_RB_PREV(entry); \ |
1898 | KASSERT(entry, ("bad entry clip")); \ | |
1899 | } \ | |
984263bc MD |
1900 | } while(0) |
1901 | ||
47ec0953 MD |
1902 | #define CLIP_CHECK_FWD(entry, save_end) \ |
1903 | do { \ | |
67e7cb85 | 1904 | while (entry->ba.end != save_end) { \ |
47ec0953 MD |
1905 | entry = vm_map_rb_tree_RB_NEXT(entry); \ |
1906 | KASSERT(entry, ("bad entry clip")); \ | |
1907 | } \ | |
984263bc MD |
1908 | } while(0) |
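/*
 * Illustrative note (added, not original source): if the entry we
 * slept on was clipped while the map was unlocked, the fragment we
 * care about may be several entries away:
 *
 *	before sleep:	[ save_start ................. save_end ]
 *	after clips:	[ save_start ..][....][.... save_end ]
 *
 * CLIP_CHECK_BACK() walks backwards to the fragment starting at
 * save_start; CLIP_CHECK_FWD() walks forwards to the fragment
 * ending at save_end.
 */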
1909 | ||
1910 | ||
1911 | /* | |
46754a20 MD |
1912 | * Clip the specified range and return the base entry. The |
1913 | * range may cover several entries starting at the returned base | |
1914 | * and the first and last entry in the covering sequence will be | |
1915 | * properly clipped to the requested start and end address. | |
1916 | * | |
1917 | * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES | |
1918 | * flag. | |
1919 | * | |
1920 | * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries | |
1921 | * covered by the requested range. | |
1922 | * | |
1923 | * The map must be exclusively locked on entry and will remain locked | |
1924 | * on return. If no range exists or the range contains holes and you | |
1925 | * specified that no holes were allowed, NULL will be returned. This | |
1926 | * routine may temporarily unlock the map in order to avoid a deadlock when | |
1927 | * sleeping. | |
984263bc MD |
1928 | */ |
1929 | static | |
1930 | vm_map_entry_t | |
a108bf71 | 1931 | vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end, |
46754a20 | 1932 | int *countp, int flags) |
984263bc MD |
1933 | { |
1934 | vm_map_entry_t start_entry; | |
1935 | vm_map_entry_t entry; | |
47ec0953 | 1936 | vm_map_entry_t next; |
984263bc MD |
1937 | |
1938 | /* | |
1939 | * Locate the entry and effect initial clipping. The in-transition | |
1940 | * case does not occur very often so do not try to optimize it. | |
1941 | */ | |
1942 | again: | |
1943 | if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) | |
1944 | return (NULL); | |
1945 | entry = start_entry; | |
1946 | if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { | |
1947 | entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; | |
12e4aaff MD |
1948 | ++mycpu->gd_cnt.v_intrans_coll; |
1949 | ++mycpu->gd_cnt.v_intrans_wait; | |
641f3b0a | 1950 | vm_map_transition_wait(map, 1); |
984263bc MD |
1951 | /* |
1952 | * entry and/or start_entry may have been clipped while | |
1953 | * we slept, or may have gone away entirely. We have | |
1954 | * to restart from the lookup. | |
1955 | */ | |
1956 | goto again; | |
1957 | } | |
46754a20 | 1958 | |
984263bc MD |
1959 | /* |
1960 | * Since we hold an exclusive map lock we do not have to restart | |
1961 | * after clipping, even though clipping may block in zalloc. | |
1962 | */ | |
a108bf71 MD |
1963 | vm_map_clip_start(map, entry, start, countp); |
1964 | vm_map_clip_end(map, entry, end, countp); | |
984263bc MD |
1965 | entry->eflags |= MAP_ENTRY_IN_TRANSITION; |
1966 | ||
1967 | /* | |
1968 | * Scan entries covered by the range. When working on the next | |
1969 | * entry a restart need only re-loop on the current entry which | |
1970 | * we have already locked, since 'next' may have changed. Also, | |
1971 | * even though entry is safe, it may have been clipped so we | |
1972 | * have to iterate forwards through the clip after sleeping. | |
1973 | */ | |
47ec0953 MD |
1974 | for (;;) { |
1975 | next = vm_map_rb_tree_RB_NEXT(entry); | |
67e7cb85 | 1976 | if (next == NULL || next->ba.start >= end) |
47ec0953 | 1977 | break; |
984263bc | 1978 | if (flags & MAP_CLIP_NO_HOLES) { |
67e7cb85 | 1979 | if (next->ba.start > entry->ba.end) { |
984263bc | 1980 | vm_map_unclip_range(map, start_entry, |
67e7cb85 | 1981 | start, entry->ba.end, countp, flags); |
984263bc MD |
1982 | return(NULL); |
1983 | } | |
1984 | } | |
1985 | ||
1986 | if (next->eflags & MAP_ENTRY_IN_TRANSITION) { | |
67e7cb85 | 1987 | vm_offset_t save_end = entry->ba.end; |
984263bc | 1988 | next->eflags |= MAP_ENTRY_NEEDS_WAKEUP; |
12e4aaff MD |
1989 | ++mycpu->gd_cnt.v_intrans_coll; |
1990 | ++mycpu->gd_cnt.v_intrans_wait; | |
641f3b0a | 1991 | vm_map_transition_wait(map, 1); |
984263bc MD |
1992 | |
1993 | /* | |
1994 | * clips might have occurred while we blocked. | |
1995 | */ | |
1996 | CLIP_CHECK_FWD(entry, save_end); | |
1997 | CLIP_CHECK_BACK(start_entry, start); | |
1998 | continue; | |
1999 | } | |
641f3b0a | 2000 | |
984263bc MD |
2001 | /* |
2002 | * No restart necessary even though clip_end may block, we | |
2003 | * are holding the map lock. | |
2004 | */ | |
a108bf71 | 2005 | vm_map_clip_end(map, next, end, countp); |
984263bc MD |
2006 | next->eflags |= MAP_ENTRY_IN_TRANSITION; |
2007 | entry = next; | |
2008 | } | |
2009 | if (flags & MAP_CLIP_NO_HOLES) { | |
67e7cb85 | 2010 | if (entry->ba.end != end) { |
984263bc | 2011 | vm_map_unclip_range(map, start_entry, |
67e7cb85 | 2012 | start, entry->ba.end, countp, flags); |
984263bc MD |
2013 | return(NULL); |
2014 | } | |
2015 | } | |
2016 | return(start_entry); | |
2017 | } | |
2018 | ||
2019 | /* | |
46754a20 MD |
2020 | * Undo the effect of vm_map_clip_range(). You should pass the same |
2021 | * flags and the same range that you passed to vm_map_clip_range(). | |
2022 | * This code will clear the in-transition flag on the entries and | |
2023 | * wake up anyone waiting. This code will also simplify the sequence | |
2024 | * and attempt to merge it with entries before and after the sequence. | |
2025 | * | |
2026 | * The map must be locked on entry and will remain locked on return. | |
2027 | * | |
2028 | * Note that you should also pass the start_entry returned by | |
2029 | * vm_map_clip_range(). However, if you block between the two calls | |
2030 | * with the map unlocked, please be aware that the start_entry may | |
2031 | * have been clipped and you may need to scan it backwards to find | |
2032 | * the entry corresponding with the original start address. You are | |
2033 | * responsible for this; vm_map_unclip_range() expects the correct | |
2034 | * start_entry to be passed to it and will KASSERT otherwise. | |
984263bc MD |
2035 | */ |
2036 | static | |
2037 | void | |
46754a20 MD |
2038 | vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry, |
2039 | vm_offset_t start, vm_offset_t end, | |
2040 | int *countp, int flags) | |
984263bc MD |
2041 | { |
2042 | vm_map_entry_t entry; | |
2043 | ||
2044 | entry = start_entry; | |
2045 | ||
67e7cb85 MD |
2046 | KASSERT(entry->ba.start == start, ("unclip_range: illegal base entry")); |
2047 | while (entry && entry->ba.start < end) { | |
46754a20 MD |
2048 | KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, |
2049 | ("in-transition flag not set during unclip on: %p", | |
2050 | entry)); | |
67e7cb85 | 2051 | KASSERT(entry->ba.end <= end, |
46754a20 | 2052 | ("unclip_range: tail wasn't clipped")); |
984263bc MD |
2053 | entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; |
2054 | if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { | |
2055 | entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; | |
2056 | wakeup(map); | |
2057 | } | |
47ec0953 | 2058 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2059 | } |
2060 | ||
2061 | /* | |
2062 | * Simplification does not block so there is no restart case. | |
2063 | */ | |
2064 | entry = start_entry; | |
67e7cb85 | 2065 | while (entry && entry->ba.start < end) { |
a108bf71 | 2066 | vm_map_simplify_entry(map, entry, countp); |
47ec0953 | 2067 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2068 | } |
2069 | } | |
2070 | ||
2071 | /* | |
46754a20 | 2072 | * Mark the given range as handled by a subordinate map. |
984263bc | 2073 | * |
46754a20 MD |
2074 | * This range must have been created with vm_map_find(), and no other |
2075 | * operations may have been performed on this range prior to calling | |
2076 | * vm_map_submap(). | |
984263bc | 2077 | * |
46754a20 | 2078 | * Submappings cannot be removed. |
984263bc | 2079 | * |
46754a20 | 2080 | * No requirements. |
984263bc MD |
2081 | */ |
2082 | int | |
a108bf71 | 2083 | vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) |
984263bc MD |
2084 | { |
2085 | vm_map_entry_t entry; | |
2086 | int result = KERN_INVALID_ARGUMENT; | |
a108bf71 | 2087 | int count; |
984263bc | 2088 | |
a108bf71 | 2089 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
984263bc MD |
2090 | vm_map_lock(map); |
2091 | ||
2092 | VM_MAP_RANGE_CHECK(map, start, end); | |
2093 | ||
2094 | if (vm_map_lookup_entry(map, start, &entry)) { | |
a108bf71 | 2095 | vm_map_clip_start(map, entry, start, &count); |
47ec0953 MD |
2096 | } else if (entry) { |
2097 | entry = vm_map_rb_tree_RB_NEXT(entry); | |
984263bc | 2098 | } else { |
47ec0953 | 2099 | entry = RB_MIN(vm_map_rb_tree, &map->rb_root); |
984263bc MD |
2100 | } |
2101 | ||
a108bf71 | 2102 | vm_map_clip_end(map, entry, end, &count); |
984263bc | 2103 | |
67e7cb85 | 2104 | if ((entry->ba.start == start) && (entry->ba.end == end) && |
984263bc | 2105 | ((entry->eflags & MAP_ENTRY_COW) == 0) && |
9de48ead MD |
2106 | (entry->ba.object == NULL)) { |
2107 | entry->ba.sub_map = submap; | |
1b874851 | 2108 | entry->maptype = VM_MAPTYPE_SUBMAP; |
984263bc MD |
2109 | result = KERN_SUCCESS; |
2110 | } | |
2111 | vm_map_unlock(map); | |
a108bf71 | 2112 | vm_map_entry_release(count); |
984263bc MD |
2113 | |
2114 | return (result); | |
2115 | } | |
2116 | ||
2117 | /* | |
1b874851 MD |
2118 | * Sets the protection of the specified address region in the target map. |
2119 | * If "set_max" is specified, the maximum protection is to be set; | |
2120 | * otherwise, only the current protection is affected. | |
2121 | * | |
2122 | * The protection is not applicable to submaps, but is applicable to normal | |
2123 | * maps and maps governed by virtual page tables. For example, when operating | |
2124 | * on a virtual page table our protection basically controls how COW occurs | |
2125 | * on the backing object, whereas the virtual page table itself | |
2126 | * is an abstraction provided to userland. | |
46754a20 MD |
2127 | * |
2128 | * No requirements. | |
984263bc MD |
2129 | */ |
2130 | int | |
2131 | vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, | |
2132 | vm_prot_t new_prot, boolean_t set_max) | |
2133 | { | |
2134 | vm_map_entry_t current; | |
2135 | vm_map_entry_t entry; | |
a108bf71 | 2136 | int count; |
984263bc | 2137 | |
a108bf71 | 2138 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
984263bc MD |
2139 | vm_map_lock(map); |
2140 | ||
2141 | VM_MAP_RANGE_CHECK(map, start, end); | |
2142 | ||
2143 | if (vm_map_lookup_entry(map, start, &entry)) { | |
a108bf71 | 2144 | vm_map_clip_start(map, entry, start, &count); |
47ec0953 MD |
2145 | } else if (entry) { |
2146 | entry = vm_map_rb_tree_RB_NEXT(entry); | |
984263bc | 2147 | } else { |
47ec0953 | 2148 | entry = RB_MIN(vm_map_rb_tree, &map->rb_root); |
984263bc MD |
2149 | } |
2150 | ||
2151 | /* | |
2152 | * Make a first pass to check for protection violations. | |
2153 | */ | |
984263bc | 2154 | current = entry; |
67e7cb85 | 2155 | while (current && current->ba.start < end) { |
1b874851 | 2156 | if (current->maptype == VM_MAPTYPE_SUBMAP) { |
984263bc | 2157 | vm_map_unlock(map); |
a108bf71 | 2158 | vm_map_entry_release(count); |
984263bc MD |
2159 | return (KERN_INVALID_ARGUMENT); |
2160 | } | |
2161 | if ((new_prot & current->max_protection) != new_prot) { | |
2162 | vm_map_unlock(map); | |
a108bf71 | 2163 | vm_map_entry_release(count); |
984263bc MD |
2164 | return (KERN_PROTECTION_FAILURE); |
2165 | } | |
fa4a12c4 MD |
2166 | |
2167 | /* | |
2168 | * When making a SHARED+RW file mmap writable, update | |
2169 | * v_lastwrite_ts. | |
2170 | */ | |
2171 | if (new_prot & PROT_WRITE && | |
2172 | (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && | |
4d4f84f5 | 2173 | current->maptype == VM_MAPTYPE_NORMAL && |
9de48ead MD |
2174 | current->ba.object && |
2175 | current->ba.object->type == OBJT_VNODE) { | |
fa4a12c4 MD |
2176 | struct vnode *vp; |
2177 | ||
9de48ead | 2178 | vp = current->ba.object->handle; |
fa4a12c4 MD |
2179 | if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) { |
2180 | vfs_timestamp(&vp->v_lastwrite_ts); | |
2181 | vsetflags(vp, VLASTWRITETS); | |
2182 | vn_unlock(vp); | |
2183 | } | |
2184 | } | |
47ec0953 | 2185 | current = vm_map_rb_tree_RB_NEXT(current); |
984263bc MD |
2186 | } |
2187 | ||
2188 | /* | |
2189 | * Go back and fix up protections. [Note that clipping is not | |
2190 | * necessary the second time.] | |
2191 | */ | |
984263bc MD |
2192 | current = entry; |
2193 | ||
67e7cb85 | 2194 | while (current && current->ba.start < end) { |
984263bc MD |
2195 | vm_prot_t old_prot; |
2196 | ||
a108bf71 | 2197 | vm_map_clip_end(map, current, end, &count); |
984263bc MD |
2198 | |
2199 | old_prot = current->protection; | |
1b874851 | 2200 | if (set_max) { |
62cc5940 MD |
2201 | current->max_protection = new_prot; |
2202 | current->protection = new_prot & old_prot; | |
1b874851 | 2203 | } else { |
984263bc | 2204 | current->protection = new_prot; |
1b874851 | 2205 | } |
984263bc MD |
2206 | |
2207 | /* | |
2208 | * Update physical map if necessary. Worry about copy-on-write | |
2209 | * here -- CHECK THIS XXX | |
2210 | */ | |
984263bc MD |
2211 | if (current->protection != old_prot) { |
2212 | #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ | |
2213 | VM_PROT_ALL) | |
2214 | ||
67e7cb85 MD |
2215 | pmap_protect(map->pmap, current->ba.start, |
2216 | current->ba.end, | |
984263bc MD |
2217 | current->protection & MASK(current)); |
2218 | #undef MASK | |
2219 | } | |
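/*
 * Illustrative note (added commentary): a COW entry must not be given
 * write permission in the pmap even when the entry allows it -- the
 * write fault is what triggers the copy.  MASK() therefore strips
 * VM_PROT_WRITE for COW entries, e.g. protection = RW on a COW entry
 * yields a read-only pmap entry until vm_fault performs the copy.
 */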
2220 | ||
a108bf71 | 2221 | vm_map_simplify_entry(map, current, &count); |
984263bc | 2222 | |
47ec0953 | 2223 | current = vm_map_rb_tree_RB_NEXT(current); |
984263bc | 2224 | } |
984263bc | 2225 | vm_map_unlock(map); |
a108bf71 | 2226 | vm_map_entry_release(count); |
984263bc MD |
2227 | return (KERN_SUCCESS); |
2228 | } | |
2229 | ||
2230 | /* | |
46754a20 MD |
2231 | * This routine traverses a process's map handling the madvise |
2232 | * system call. Advisories are classified as either those affecting | |
2233 | * the vm_map_entry structure, or those affecting the underlying | |
2234 | * objects. | |
984263bc | 2235 | * |
46754a20 | 2236 | * The <value> argument is used for extended madvise calls. |
afeabdca | 2237 | * |
46754a20 | 2238 | * No requirements. |
984263bc | 2239 | */ |
984263bc | 2240 | int |
afeabdca MD |
2241 | vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end, |
2242 | int behav, off_t value) | |
984263bc MD |
2243 | { |
2244 | vm_map_entry_t current, entry; | |
2245 | int modify_map = 0; | |
afeabdca | 2246 | int error = 0; |
a108bf71 | 2247 | int count; |
984263bc MD |
2248 | |
2249 | /* | |
2250 | * Some madvise calls directly modify the vm_map_entry, in which case | |
2251 | * we need to use an exclusive lock on the map and we need to perform | |
2252 | * various clipping operations. Otherwise we only need a read-lock | |
2253 | * on the map. | |
2254 | */ | |
a108bf71 MD |
2255 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
2256 | ||
984263bc MD |
2257 | switch(behav) { |
2258 | case MADV_NORMAL: | |
2259 | case MADV_SEQUENTIAL: | |
2260 | case MADV_RANDOM: | |
2261 | case MADV_NOSYNC: | |
2262 | case MADV_AUTOSYNC: | |
2263 | case MADV_NOCORE: | |
2264 | case MADV_CORE: | |
afeabdca | 2265 | case MADV_SETMAP: |
984263bc MD |
2266 | modify_map = 1; |
2267 | vm_map_lock(map); | |
2268 | break; | |
76f1911e | 2269 | case MADV_INVAL: |
984263bc MD |
2270 | case MADV_WILLNEED: |
2271 | case MADV_DONTNEED: | |
2272 | case MADV_FREE: | |
2273 | vm_map_lock_read(map); | |
2274 | break; | |
2275 | default: | |
a108bf71 | 2276 | vm_map_entry_release(count); |
afeabdca | 2277 | return (EINVAL); |
984263bc MD |
2278 | } |
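/*
 * Illustrative note (added, not original source): the split above
 * means e.g. madvise(addr, len, MADV_NOSYNC) takes the exclusive
 * map lock because it rewrites entry->eflags, while MADV_WILLNEED
 * leaves the entries intact and can run under the shared read lock.
 */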
2279 | ||
2280 | /* | |
2281 | * Locate starting entry and clip if necessary. | |
2282 | */ | |
2283 | ||
2284 | VM_MAP_RANGE_CHECK(map, start, end); | |
2285 | ||
2286 | if (vm_map_lookup_entry(map, start, &entry)) { | |
2287 | if (modify_map) | |
a108bf71 | 2288 | vm_map_clip_start(map, entry, start, &count); |
47ec0953 MD |
2289 | } else if (entry) { |
2290 | entry = vm_map_rb_tree_RB_NEXT(entry); | |
984263bc | 2291 | } else { |
47ec0953 | 2292 | entry = RB_MIN(vm_map_rb_tree, &map->rb_root); |
984263bc MD |
2293 | } |
2294 | ||
2295 | if (modify_map) { | |
2296 | /* | |
2297 | * madvise behaviors that are implemented in the vm_map_entry. | |
2298 | * | |
2299 | * We clip the vm_map_entry so that behavioral changes are | |
2300 | * limited to the specified address range. | |
2301 | */ | |
2302 | for (current = entry; | |
67e7cb85 | 2303 | current && current->ba.start < end; |
47ec0953 MD |
2304 | current = vm_map_rb_tree_RB_NEXT(current)) { |
2305 | /* | |
2306 | * Ignore submaps | |
2307 | */ | |
1b874851 | 2308 | if (current->maptype == VM_MAPTYPE_SUBMAP) |
984263bc MD |
2309 | continue; |
2310 | ||
a108bf71 | 2311 | vm_map_clip_end(map, current, end, &count); |
984263bc MD |
2312 | |
2313 | switch (behav) { | |
2314 | case MADV_NORMAL: | |
2315 | vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); | |
2316 | break; | |
2317 | case MADV_SEQUENTIAL: | |
2318 | vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); | |
2319 | break; | |
2320 | case MADV_RANDOM: | |
2321 | vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); | |
2322 | break; | |
2323 | case MADV_NOSYNC: | |
2324 | current->eflags |= MAP_ENTRY_NOSYNC; | |
2325 | break; | |
2326 | case MADV_AUTOSYNC: | |
2327 | current->eflags &= ~MAP_ENTRY_NOSYNC; | |
2328 | break; | |
2329 | case MADV_NOCORE: | |
2330 | current->eflags |= MAP_ENTRY_NOCOREDUMP; | |
2331 | break; | |
2332 | case MADV_CORE: | |
2333 | current->eflags &= ~MAP_ENTRY_NOCOREDUMP; | |
2334 | break; | |
afeabdca MD |
2335 | case MADV_SETMAP: |
2336 | /* | |
2337 | * Set the page directory page for a map | |
4d4f84f5 | 2338 | * governed by a virtual page table. |
afeabdca | 2339 | * |
4d4f84f5 MD |
2340 | * Software virtual page table support has |
2341 | * been removed, this MADV is no longer | |
2342 | * supported. | |
afeabdca | 2343 | */ |
4d4f84f5 | 2344 | error = EINVAL; |
afeabdca | 2345 | break; |
76f1911e MD |
2346 | case MADV_INVAL: |
2347 | /* | |
2348 | * Invalidate the related pmap entries, used | |
2349 | * to flush portions of the real kernel's | |
2350 | * pmap when the caller has removed or | |
2351 | * modified existing mappings in a virtual | |
2352 | * page table. | |
2353 | * | |
fc531fbc MD |
2354 | * (exclusive locked map version does not |
2355 | * need the range interlock). | |
76f1911e MD |
2356 | */ |
2357 | pmap_remove(map->pmap, | |
67e7cb85 | 2358 | current->ba.start, current->ba.end); |
76f1911e | 2359 | break; |
984263bc | 2360 | default: |
afeabdca | 2361 | error = EINVAL; |
984263bc MD |
2362 | break; |
2363 | } | |
a108bf71 | 2364 | vm_map_simplify_entry(map, current, &count); |
984263bc MD |
2365 | } |
2366 | vm_map_unlock(map); | |
2367 | } else { | |
2368 | vm_pindex_t pindex; | |
76f1911e | 2369 | vm_pindex_t delta; |
984263bc MD |
2370 | |
2371 | /* | |
2372 | * madvise behaviors that are implemented in the underlying | |
2373 | * vm_object. | |
2374 | * | |
2375 | * Since we don't clip the vm_map_entry, we have to clip | |
2376 | * the vm_object pindex and count. | |
1b874851 | 2377 | * |
4d4f84f5 | 2378 | * NOTE! These functions are only supported on normal maps. |
9de48ead MD |
2379 | * |
2380 | * NOTE! These functions only apply to the top-most object. | |
2381 | * It is not applicable to backing objects. | |
984263bc MD |
2382 | */ |
2383 | for (current = entry; | |
67e7cb85 | 2384 | current && current->ba.start < end; |
47ec0953 | 2385 | current = vm_map_rb_tree_RB_NEXT(current)) { |
984263bc MD |
2386 | vm_offset_t useStart; |
2387 | ||
4d4f84f5 | 2388 | if (current->maptype != VM_MAPTYPE_NORMAL) |
984263bc MD |
2389 | continue; |
2390 | ||
9de48ead | 2391 | pindex = OFF_TO_IDX(current->ba.offset); |
67e7cb85 MD |
2392 | delta = atop(current->ba.end - current->ba.start); |
2393 | useStart = current->ba.start; | |
984263bc | 2394 | |
67e7cb85 MD |
2395 | if (current->ba.start < start) { |
2396 | pindex += atop(start - current->ba.start); | |
2397 | delta -= atop(start - current->ba.start); | |
984263bc MD |
2398 | useStart = start; |
2399 | } | |
67e7cb85 MD |
2400 | if (current->ba.end > end) |
2401 | delta -= atop(current->ba.end - end); | |
984263bc | 2402 | |
76f1911e | 2403 | if ((vm_spindex_t)delta <= 0) |
984263bc MD |
2404 | continue; |
2405 | ||
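/*
 * Worked example (added, not original source): entry covering
 * [0x10000,0x18000) with ba.offset 0x2000, madvise range
 * [0x12000,0x14000), 4K pages:
 *
 *	pindex  = OFF_TO_IDX(0x2000)		-> 2
 *	delta   = atop(0x18000 - 0x10000)	-> 8
 *	pindex += atop(0x12000 - 0x10000)	-> 4
 *	delta  -= atop(0x12000 - 0x10000)	-> 6
 *	delta  -= atop(0x18000 - 0x14000)	-> 2
 *
 * so object pages [4,6) receive the advice and useStart = 0x12000.
 */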
76f1911e MD |
2406 | if (behav == MADV_INVAL) { |
2407 | /* | |
2408 | * Invalidate the related pmap entries, used | |
2409 | * to flush portions of the real kernel's | |
2410 | * pmap when the caller has removed or | |
2411 | * modified existing mappings in a virtual | |
2412 | * page table. | |
2413 | * | |
fc531fbc MD |
2414 | * (shared locked map version needs the |
2415 | * interlock, see vm_fault()). | |
76f1911e | 2416 | */ |
fc531fbc MD |
2417 | struct vm_map_ilock ilock; |
2418 | ||
76f1911e MD |
2419 | KASSERT(useStart >= VM_MIN_USER_ADDRESS && |
2420 | useStart + ptoa(delta) <= | |
2421 | VM_MAX_USER_ADDRESS, | |
2422 | ("Bad range %016jx-%016jx (%016jx)", | |
2423 | useStart, useStart + ptoa(delta), | |
2424 | delta)); | |
fc531fbc MD |
2425 | vm_map_interlock(map, &ilock, |
2426 | useStart, | |
2427 | useStart + ptoa(delta)); | |
76f1911e MD |
2428 | pmap_remove(map->pmap, |
2429 | useStart, | |
2430 | useStart + ptoa(delta)); | |
fc531fbc | 2431 | vm_map_deinterlock(map, &ilock); |
76f1911e | 2432 | } else { |
9de48ead | 2433 | vm_object_madvise(current->ba.object, |
76f1911e MD |
2434 | pindex, delta, behav); |
2435 | } | |
afeabdca MD |
2436 | |
2437 | /* | |
4d4f84f5 | 2438 | * Try to pre-populate the page table. |
afeabdca | 2439 | */ |
4d4f84f5 | 2440 | if (behav == MADV_WILLNEED) { |
984263bc | 2441 | pmap_object_init_pt( |
530e94fc | 2442 | map->pmap, current, |
984263bc | 2443 | useStart, |
530e94fc | 2444 | (delta << PAGE_SHIFT), |
984263bc MD |
2445 | MAP_PREFAULT_MADVISE |
2446 | ); | |
2447 | } | |
2448 | } | |
2449 | vm_map_unlock_read(map); | |
2450 | } | |
a108bf71 | 2451 | vm_map_entry_release(count); |
afeabdca | 2452 | return(error); |
984263bc MD |
2453 | } |
2454 | ||
2455 | ||
2456 | /* | |
46754a20 MD |
2457 | * Sets the inheritance of the specified address range in the target map. |
2458 | * Inheritance affects how the map will be shared with child maps at the | |
2459 | * time of vm_map_fork. | |
984263bc MD |
2460 | */ |
2461 | int | |
2462 | vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, | |
2463 | vm_inherit_t new_inheritance) | |
2464 | { | |
2465 | vm_map_entry_t entry; | |
2466 | vm_map_entry_t temp_entry; | |
a108bf71 | 2467 | int count; |
984263bc MD |
2468 | |
2469 | switch (new_inheritance) { | |
2470 | case VM_INHERIT_NONE: | |
2471 | case VM_INHERIT_COPY: | |
2472 | case VM_INHERIT_SHARE: | |
2473 | break; | |
2474 | default: | |
2475 | return (KERN_INVALID_ARGUMENT); | |
2476 | } | |
2477 | ||
a108bf71 | 2478 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
984263bc MD |
2479 | vm_map_lock(map); |
2480 | ||
2481 | VM_MAP_RANGE_CHECK(map, start, end); | |
2482 | ||
2483 | if (vm_map_lookup_entry(map, start, &temp_entry)) { | |
2484 | entry = temp_entry; | |
a108bf71 | 2485 | vm_map_clip_start(map, entry, start, &count); |
47ec0953 MD |
2486 | } else if (temp_entry) { |
2487 | entry = vm_map_rb_tree_RB_NEXT(temp_entry); | |
2488 | } else { | |
2489 | entry = RB_MIN(vm_map_rb_tree, &map->rb_root); | |
2490 | } | |
984263bc | 2491 | |
67e7cb85 | 2492 | while (entry && entry->ba.start < end) { |
a108bf71 | 2493 | vm_map_clip_end(map, entry, end, &count); |
984263bc MD |
2494 | |
2495 | entry->inheritance = new_inheritance; | |
2496 | ||
a108bf71 | 2497 | vm_map_simplify_entry(map, entry, &count); |
984263bc | 2498 | |
47ec0953 | 2499 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc | 2500 | } |
984263bc | 2501 | vm_map_unlock(map); |
a108bf71 | 2502 | vm_map_entry_release(count); |
984263bc MD |
2503 | return (KERN_SUCCESS); |
2504 | } | |
2505 | ||
2506 | /* | |
2eda01c0 MD |
2507 | * Wiring/Unwiring of memory for user-related operation. |
2508 | * | |
984263bc MD |
2509 | * Implement the semantics of mlock |
2510 | */ | |
2511 | int | |
949c56f8 MD |
2512 | vm_map_user_wiring(vm_map_t map, vm_offset_t start, vm_offset_t real_end, |
2513 | boolean_t new_pageable) | |
984263bc MD |
2514 | { |
2515 | vm_map_entry_t entry; | |
2516 | vm_map_entry_t start_entry; | |
2517 | vm_offset_t end; | |
2518 | int rv = KERN_SUCCESS; | |
a108bf71 | 2519 | int count; |
984263bc | 2520 | |
a108bf71 | 2521 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
984263bc MD |
2522 | vm_map_lock(map); |
2523 | VM_MAP_RANGE_CHECK(map, start, real_end); | |
2524 | end = real_end; | |
2525 | ||
46754a20 MD |
2526 | start_entry = vm_map_clip_range(map, start, end, &count, |
2527 | MAP_CLIP_NO_HOLES); | |
984263bc MD |
2528 | if (start_entry == NULL) { |
2529 | vm_map_unlock(map); | |
a108bf71 | 2530 | vm_map_entry_release(count); |
984263bc MD |
2531 | return (KERN_INVALID_ADDRESS); |
2532 | } | |
2533 | ||
2534 | if (new_pageable == 0) { | |
2535 | entry = start_entry; | |
67e7cb85 | 2536 | while (entry && entry->ba.start < end) { |
984263bc MD |
2537 | vm_offset_t save_start; |
2538 | vm_offset_t save_end; | |
2539 | ||
2540 | /* | |
2541 | * Already user wired or hard wired (trivial cases) | |
2542 | */ | |
2543 | if (entry->eflags & MAP_ENTRY_USER_WIRED) { | |
47ec0953 | 2544 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2545 | continue; |
2546 | } | |
2547 | if (entry->wired_count != 0) { | |
2548 | entry->wired_count++; | |
2549 | entry->eflags |= MAP_ENTRY_USER_WIRED; | |
47ec0953 | 2550 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2551 | continue; |
2552 | } | |
2553 | ||
2554 | /* | |
2555 | * A new wiring requires instantiation of appropriate | |
2556 | * management structures and the faulting in of the | |
2557 | * page. | |
2558 | */ | |
4d4f84f5 | 2559 | if (entry->maptype == VM_MAPTYPE_NORMAL) { |
46754a20 MD |
2560 | int copyflag = entry->eflags & |
2561 | MAP_ENTRY_NEEDS_COPY; | |
2562 | if (copyflag && ((entry->protection & | |
2563 | VM_PROT_WRITE) != 0)) { | |
5b329e62 | 2564 | vm_map_entry_shadow(entry); |
9de48ead | 2565 | } else if (entry->ba.object == NULL && |
984263bc | 2566 | !map->system_map) { |
53025830 | 2567 | vm_map_entry_allocate_object(entry); |
984263bc MD |
2568 | } |
2569 | } | |
2570 | entry->wired_count++; | |
2571 | entry->eflags |= MAP_ENTRY_USER_WIRED; | |
2572 | ||
2573 | /* | |
f2d22ebf MD |
2574 | * Now fault in the area. Note that vm_fault_wire() |
2575 | * may release the map lock temporarily; it will be | |
2576 | * relocked on return. The in-transition | |
984263bc MD |
2577 | * flag protects the entries. |
2578 | */ | |
67e7cb85 MD |
2579 | save_start = entry->ba.start; |
2580 | save_end = entry->ba.end; | |
06c66eb2 | 2581 | rv = vm_fault_wire(map, entry, TRUE, 0); |
984263bc MD |
2582 | if (rv) { |
2583 | CLIP_CHECK_BACK(entry, save_start); | |
2584 | for (;;) { | |
2585 | KASSERT(entry->wired_count == 1, ("bad wired_count on entry")); | |
2586 | entry->eflags &= ~MAP_ENTRY_USER_WIRED; | |
2587 | entry->wired_count = 0; | |
67e7cb85 | 2588 | if (entry->ba.end == save_end) |
984263bc | 2589 | break; |
47ec0953 MD |
2590 | entry = vm_map_rb_tree_RB_NEXT(entry); |
2591 | KASSERT(entry, | |
2592 | ("bad entry clip during backout")); | |
984263bc MD |
2593 | } |
2594 | end = save_start; /* unwire the rest */ | |
2595 | break; | |
2596 | } | |
2597 | /* | |
2598 | * note that even though the entry might have been | |
2599 | * clipped, the USER_WIRED flag we set prevents | |
2600 | * duplication so we do not have to do a | |
2601 | * clip check. | |
2602 | */ | |
47ec0953 | 2603 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2604 | } |
2605 | ||
2606 | /* | |
2607 | * If we failed fall through to the unwiring section to | |
2608 | * unwire what we had wired so far. 'end' has already | |
2609 | * been adjusted. | |
2610 | */ | |
2611 | if (rv) | |
2612 | new_pageable = 1; | |
2613 | ||
2614 | /* | |
2615 | * start_entry might have been clipped if we unlocked the | |
2616 | * map and blocked. No matter how clipped it has gotten | |
2617 | * there should be a fragment that is on our start boundary. | |
2618 | */ | |
2619 | CLIP_CHECK_BACK(start_entry, start); | |
2620 | } | |
2621 | ||
2622 | /* | |
2623 | * Deal with the unwiring case. | |
2624 | */ | |
2625 | if (new_pageable) { | |
2626 | /* | |
2627 | * This is the unwiring case. We must first ensure that the | |
2628 | * range to be unwired is really wired down. We know there | |
2629 | * are no holes. | |
2630 | */ | |
2631 | entry = start_entry; | |
67e7cb85 | 2632 | while (entry && entry->ba.start < end) { |
984263bc MD |
2633 | if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { |
2634 | rv = KERN_INVALID_ARGUMENT; | |
2635 | goto done; | |
2636 | } | |
47ec0953 MD |
2637 | KASSERT(entry->wired_count != 0, |
2638 | ("wired count was 0 with USER_WIRED set! %p", | |
2639 | entry)); | |
2640 | entry = vm_map_rb_tree_RB_NEXT(entry); | |
984263bc MD |
2641 | } |
2642 | ||
2643 | /* | |
2644 | * Now decrement the wiring count for each region. If a region | |
2645 | * becomes completely unwired, unwire its physical pages and | |
2646 | * mappings. | |
2647 | */ | |
b4eddbac DR |
2648 | /* |
2649 | * The map entries are processed in a loop, checking to | |
2650 | * make sure the entry is wired and asserting it has a wired | |
2651 | * count. However, another loop was inserted more-or-less in | |
2652 | * the middle of the unwiring path. This loop picks up the | |
2653 | * "entry" loop variable from the first loop without first | |
2654 | * setting it to start_entry. Naturally, the second loop | |
2655 | * is never entered and the pages backing the entries are | |
2656 | * never unwired. This can lead to a leak of wired pages. | |
2657 | */ | |
2658 | entry = start_entry; | |
67e7cb85 | 2659 | while (entry && entry->ba.start < end) { |
f2d22ebf MD |
2660 | KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED, |
2661 | ("expected USER_WIRED on entry %p", entry)); | |
984263bc MD |
2662 | entry->eflags &= ~MAP_ENTRY_USER_WIRED; |
2663 | entry->wired_count--; | |
2664 | if (entry->wired_count == 0) | |
f2d22ebf | 2665 | vm_fault_unwire(map, entry); |
47ec0953 | 2666 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2667 | } |
2668 | } | |
2669 | done: | |
a108bf71 | 2670 | vm_map_unclip_range(map, start_entry, start, real_end, &count, |
c936cb6f | 2671 | MAP_CLIP_NO_HOLES); |
984263bc | 2672 | vm_map_unlock(map); |
a108bf71 | 2673 | vm_map_entry_release(count); |
641f3b0a | 2674 | |
984263bc MD |
2675 | return (rv); |
2676 | } | |
2677 | ||
2678 | /* | |
2eda01c0 MD |
2679 | * Wiring/Unwiring of memory for kernel-related operation. |
2680 | * | |
46754a20 MD |
2681 | * Sets the pageability of the specified address range in the target map. |
2682 | * Regions specified as not pageable require locked-down physical | |
2683 | * memory and physical page maps. | |
984263bc | 2684 | * |
46754a20 MD |
2685 | * The map must not be locked, but a reference must remain to the map |
2686 | * throughout the call. | |
984263bc | 2687 | * |
46754a20 MD |
2688 | * This function may be called via the zalloc path and must properly |
2689 | * reserve map entries for kernel_map. | |
a108bf71 | 2690 | * |
46754a20 | 2691 | * No requirements. |
984263bc MD |
2692 | */ |
2693 | int | |
949c56f8 MD |
2694 | vm_map_kernel_wiring(vm_map_t map, vm_offset_t start, |
2695 | vm_offset_t real_end, int kmflags) | |
984263bc MD |
2696 | { |
2697 | vm_map_entry_t entry; | |
2698 | vm_map_entry_t start_entry; | |
2699 | vm_offset_t end; | |
2700 | int rv = KERN_SUCCESS; | |
a108bf71 | 2701 | int count; |
984263bc | 2702 | |
e1359933 | 2703 | if (kmflags & KM_KRESERVE) |
a108bf71 | 2704 | count = vm_map_entry_kreserve(MAP_RESERVE_COUNT); |
a108bf71 MD |
2705 | else |
2706 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); | |
984263bc MD |
2707 | vm_map_lock(map); |
2708 | VM_MAP_RANGE_CHECK(map, start, real_end); | |
2709 | end = real_end; | |
2710 | ||
46754a20 MD |
2711 | start_entry = vm_map_clip_range(map, start, end, &count, |
2712 | MAP_CLIP_NO_HOLES); | |
984263bc MD |
2713 | if (start_entry == NULL) { |
2714 | vm_map_unlock(map); | |
a108bf71 MD |
2715 | rv = KERN_INVALID_ADDRESS; |
2716 | goto failure; | |
984263bc | 2717 | } |
e1359933 | 2718 | if ((kmflags & KM_PAGEABLE) == 0) { |
984263bc MD |
2719 | /* |
2720 | * Wiring. | |
2721 | * | |
2722 | * 1. Holding the write lock, we create any shadow or zero-fill | |
2723 | * objects that need to be created. Then we clip each map | |
2724 | * entry to the region to be wired and increment its wiring | |
2725 | * count. We create objects before clipping the map entries | |
2726 | * to avoid object proliferation. | |
2727 | * | |
2728 | * 2. We downgrade to a read lock, and call vm_fault_wire to | |
2729 | * fault in the pages for any newly wired area (wired_count is | |
2730 | * 1). | |
2731 | * | |
2732 | * Downgrading to a read lock for vm_fault_wire avoids a | |
2733 | * possible deadlock with another process that may have faulted | |
2734 | * on one of the pages to be wired (it would mark the page busy, | |
2735 | * blocking us, then in turn block on the map lock that we | |
2736 | * hold). Because of problems in the recursive lock package, | |
2737 | * we cannot upgrade to a write lock in vm_map_lookup. Thus, | |
2738 | * any actions that require the write lock must be done | |
2739 | * beforehand. Because we keep the read lock on the map, the | |
2740 | * copy-on-write status of the entries we modify here cannot | |
2741 | * change. | |
2742 | */ | |
984263bc | 2743 | entry = start_entry; |
67e7cb85 | 2744 | while (entry && entry->ba.start < end) { |
984263bc MD |
2745 | /* |
2746 | * Trivial case if the entry is already wired | |
2747 | */ | |
2748 | if (entry->wired_count) { | |
2749 | entry->wired_count++; | |
47ec0953 | 2750 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2751 | continue; |
2752 | } | |
2753 | ||
2754 | /* | |
2755 | * The entry is being newly wired, we have to setup | |
2756 | * appropriate management structures. A shadow | |
2757 | * object is required for a copy-on-write region, | |
2758 | * or a normal object for a zero-fill region. We | |
2759 | * do not have to do this for entries that point to sub | |
2760 | * maps because we won't hold the lock on the sub map. | |
2761 | */ | |
4d4f84f5 | 2762 | if (entry->maptype == VM_MAPTYPE_NORMAL) { |
46754a20 MD |
2763 | int copyflag = entry->eflags & |
2764 | MAP_ENTRY_NEEDS_COPY; | |
2765 | if (copyflag && ((entry->protection & | |
2766 | VM_PROT_WRITE) != 0)) { | |
5b329e62 | 2767 | vm_map_entry_shadow(entry); |
9de48ead | 2768 | } else if (entry->ba.object == NULL && |
984263bc | 2769 | !map->system_map) { |
53025830 | 2770 | vm_map_entry_allocate_object(entry); |
984263bc MD |
2771 | } |
2772 | } | |
984263bc | 2773 | entry->wired_count++; |
47ec0953 | 2774 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2775 | } |
2776 | ||
2777 | /* | |
2778 | * Pass 2. | |
2779 | */ | |
2780 | ||
2781 | /* | |
2782 | * HACK HACK HACK HACK | |
2783 | * | |
46754a20 MD |
2784 | * vm_fault_wire() temporarily unlocks the map to avoid |
2785 | * deadlocks. The in-transition flag from the | |
2786 | * vm_map_clip_range() call should protect us from | |
2787 | * changes while the map is unlocked. | |
2788 | * | |
2789 | * NOTE: Previously this comment stated that clipping might | |
2790 | * still occur while the entry is unlocked, but from | |
2791 | * what I can tell it actually cannot. | |
2792 | * | |
2793 | * It is unclear whether the CLIP_CHECK_*() calls | |
2794 | * are still needed but we keep them in anyway. | |
984263bc MD |
2795 | * |
2796 | * HACK HACK HACK HACK | |
2797 | */ | |
2798 | ||
984263bc | 2799 | entry = start_entry; |
67e7cb85 | 2800 | while (entry && entry->ba.start < end) { |
984263bc MD |
2801 | /* |
2802 | * If vm_fault_wire fails for any page we need to undo | |
2803 | * what has been done. We decrement the wiring count | |
2804 | * for those pages which have not yet been wired (now) | |
2805 | * and unwire those that have (later). | |
2806 | */ | |
67e7cb85 MD |
2807 | vm_offset_t save_start = entry->ba.start; |
2808 | vm_offset_t save_end = entry->ba.end; | |
984263bc MD |
2809 | |
2810 | if (entry->wired_count == 1) | |
06c66eb2 | 2811 | rv = vm_fault_wire(map, entry, FALSE, kmflags); |
984263bc MD |
2812 | if (rv) { |
2813 | CLIP_CHECK_BACK(entry, save_start); | |
2814 | for (;;) { | |
47ec0953 MD |
2815 | KASSERT(entry->wired_count == 1, |
2816 | ("wired_count changed unexpectedly")); | |
984263bc | 2817 | entry->wired_count = 0; |
67e7cb85 | 2818 | if (entry->ba.end == save_end) |
984263bc | 2819 | break; |
47ec0953 MD |
2820 | entry = vm_map_rb_tree_RB_NEXT(entry); |
2821 | KASSERT(entry, | |
2822 | ("bad entry clip during backout")); | |
984263bc MD |
2823 | } |
2824 | end = save_start; | |
2825 | break; | |
2826 | } | |
2827 | CLIP_CHECK_FWD(entry, save_end); | |
47ec0953 | 2828 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc | 2829 | } |
984263bc | 2830 | |
984263bc MD |
2831 | /* |
2832 | * If a failure occurred, undo everything by falling through
2833 | * to the unwiring code. 'end' has already been adjusted | |
2834 | * appropriately. | |
2835 | */ | |
2836 | if (rv) | |
e1359933 | 2837 | kmflags |= KM_PAGEABLE; |
984263bc MD |
2838 | |
2839 | /* | |
f2d22ebf MD |
2840 | * start_entry is still IN_TRANSITION but may have been |
2841 | * clipped since vm_fault_wire() unlocks and relocks the | |
2842 | * map. No matter how clipped it has gotten, there should
2843 | * be a fragment that is on our start boundary. | |
984263bc MD |
2844 | */ |
2845 | CLIP_CHECK_BACK(start_entry, start); | |
2846 | } | |
2847 | ||
e1359933 | 2848 | if (kmflags & KM_PAGEABLE) { |
984263bc MD |
2849 | /* |
2850 | * This is the unwiring case. We must first ensure that the | |
2851 | * range to be unwired is really wired down. We know there | |
2852 | * are no holes. | |
2853 | */ | |
2854 | entry = start_entry; | |
67e7cb85 | 2855 | while (entry && entry->ba.start < end) { |
984263bc MD |
2856 | if (entry->wired_count == 0) { |
2857 | rv = KERN_INVALID_ARGUMENT; | |
2858 | goto done; | |
2859 | } | |
47ec0953 | 2860 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2861 | } |
2862 | ||
2863 | /* | |
2864 | * Now decrement the wiring count for each region. If a region | |
2865 | * becomes completely unwired, unwire its physical pages and | |
2866 | * mappings. | |
2867 | */ | |
2868 | entry = start_entry; | |
67e7cb85 | 2869 | while (entry && entry->ba.start < end) { |
984263bc MD |
2870 | entry->wired_count--; |
2871 | if (entry->wired_count == 0) | |
f2d22ebf | 2872 | vm_fault_unwire(map, entry); |
47ec0953 | 2873 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc MD |
2874 | } |
2875 | } | |
2876 | done: | |
46754a20 MD |
2877 | vm_map_unclip_range(map, start_entry, start, real_end, |
2878 | &count, MAP_CLIP_NO_HOLES); | |
984263bc | 2879 | vm_map_unlock(map); |
a108bf71 | 2880 | failure: |
e1359933 | 2881 | if (kmflags & KM_KRESERVE) |
a108bf71 | 2882 | vm_map_entry_krelease(count); |
a108bf71 MD |
2883 | else |
2884 | vm_map_entry_release(count); | |
984263bc MD |
2885 | return (rv); |
2886 | } | |
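/*
 * Illustrative sketch, not part of the kernel: the routine above (its
 * name precedes this excerpt, hence the placeholder) selects wiring
 * vs unwiring and the entry reserve pool purely through kmflags:
 *
 *	rv = <wiring routine>(map, start, end, 0);		wire
 *	rv = <wiring routine>(map, start, end, KM_PAGEABLE);	unwire
 *	rv = <wiring routine>(map, start, end, KM_KRESERVE);	wire using
 *					the kernel-critical entry reserve
 */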
2887 | ||
a108bf71 | 2888 | /* |
46754a20 MD |
2889 | * Mark a newly allocated address range as wired but do not fault in |
2890 | * the pages. The caller is expected to load the pages into the object. | |
a108bf71 | 2891 | * |
46754a20 MD |
2892 | * The map must be locked on entry and will remain locked on return. |
2893 | * No other requirements. | |
a108bf71 MD |
2894 | */ |
2895 | void | |
46754a20 MD |
2896 | vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size, |
2897 | int *countp) | |
a108bf71 MD |
2898 | { |
2899 | vm_map_entry_t scan; | |
2900 | vm_map_entry_t entry; | |
2901 | ||
46754a20 MD |
2902 | entry = vm_map_clip_range(map, addr, addr + size, |
2903 | countp, MAP_CLIP_NO_HOLES); | |
47ec0953 | 2904 | scan = entry; |
67e7cb85 | 2905 | while (scan && scan->ba.start < addr + size) { |
47ec0953 MD |
2906 | KKASSERT(scan->wired_count == 0); |
2907 | scan->wired_count = 1; | |
2908 | scan = vm_map_rb_tree_RB_NEXT(scan); | |
a108bf71 | 2909 | } |
46754a20 MD |
2910 | vm_map_unclip_range(map, entry, addr, addr + size, |
2911 | countp, MAP_CLIP_NO_HOLES); | |
a108bf71 MD |
2912 | } |
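/*
 * Illustrative sketch, not part of the kernel: a typical caller has
 * just created a kernel mapping it will populate itself, holds the
 * exclusive map lock as required above, and owns an entry
 * reservation:
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	... insert the entry covering [addr, addr + size) ...
 *	vm_map_set_wired_quick(map, addr, size, &count);
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */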
2913 | ||
984263bc | 2914 | /* |
984263bc MD |
2915 | * Push any dirty cached pages in the address range to their pager. |
2916 | * If syncio is TRUE, dirty pages are written synchronously. | |
2917 | * If invalidate is TRUE, any cached pages are freed as well. | |
2918 | * | |
2bc7505b MD |
2919 | * This routine is called by sys_msync() |
2920 | * | |
984263bc | 2921 | * Returns an error if any part of the specified range is not mapped. |
46754a20 MD |
2922 | * |
2923 | * No requirements. | |
984263bc MD |
2924 | */ |
2925 | int | |
2bc7505b MD |
2926 | vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end, |
2927 | boolean_t syncio, boolean_t invalidate) | |
984263bc MD |
2928 | { |
2929 | vm_map_entry_t current; | |
47ec0953 | 2930 | vm_map_entry_t next; |
984263bc | 2931 | vm_map_entry_t entry; |
44293a80 | 2932 | vm_map_backing_t ba; |
984263bc MD |
2933 | vm_size_t size; |
2934 | vm_object_t object; | |
2935 | vm_ooffset_t offset; | |
2936 | ||
2937 | vm_map_lock_read(map); | |
2938 | VM_MAP_RANGE_CHECK(map, start, end); | |
2939 | if (!vm_map_lookup_entry(map, start, &entry)) { | |
2940 | vm_map_unlock_read(map); | |
2941 | return (KERN_INVALID_ADDRESS); | |
2942 | } | |
b12defdc MD |
2943 | lwkt_gettoken(&map->token); |
2944 | ||
984263bc MD |
2945 | /* |
2946 | * Make a first pass to check for holes. | |
2947 | */ | |
47ec0953 | 2948 | current = entry; |
67e7cb85 | 2949 | while (current && current->ba.start < end) { |
1b874851 | 2950 | if (current->maptype == VM_MAPTYPE_SUBMAP) { |
6730ca37 | 2951 | lwkt_reltoken(&map->token); |
984263bc MD |
2952 | vm_map_unlock_read(map); |
2953 | return (KERN_INVALID_ARGUMENT); | |
2954 | } | |
47ec0953 | 2955 | next = vm_map_rb_tree_RB_NEXT(current); |
67e7cb85 | 2956 | if (end > current->ba.end && |
47ec0953 | 2957 | (next == NULL || |
67e7cb85 | 2958 | current->ba.end != next->ba.start)) { |
6730ca37 | 2959 | lwkt_reltoken(&map->token); |
984263bc MD |
2960 | vm_map_unlock_read(map); |
2961 | return (KERN_INVALID_ADDRESS); | |
2962 | } | |
47ec0953 | 2963 | current = next; |
984263bc MD |
2964 | } |
2965 | ||
2966 | if (invalidate) | |
2967 | pmap_remove(vm_map_pmap(map), start, end); | |
46754a20 | 2968 | |
984263bc MD |
2969 | /* |
2970 | * Make a second pass, cleaning/uncaching pages from the indicated | |
2971 | * objects as we go. | |
2972 | */ | |
47ec0953 | 2973 | current = entry; |
67e7cb85 MD |
2974 | while (current && current->ba.start < end) { |
2975 | offset = current->ba.offset + (start - current->ba.start); | |
2976 | size = (end <= current->ba.end ? end : current->ba.end) - start; | |
0adbcbd6 MD |
2977 | |
2978 | switch(current->maptype) { | |
2979 | case VM_MAPTYPE_SUBMAP: | |
2980 | { | |
984263bc MD |
2981 | vm_map_t smap; |
2982 | vm_map_entry_t tentry; | |
2983 | vm_size_t tsize; | |
2984 | ||
9de48ead | 2985 | smap = current->ba.sub_map; |
984263bc | 2986 | vm_map_lock_read(smap); |
418ff780 | 2987 | vm_map_lookup_entry(smap, offset, &tentry); |
47ec0953 MD |
2988 | if (tentry == NULL) { |
2989 | tsize = vm_map_max(smap) - offset; | |
9de48ead | 2990 | ba = NULL; |
47ec0953 MD |
2991 | offset = 0 + (offset - vm_map_min(smap)); |
2992 | } else { | |
67e7cb85 | 2993 | tsize = tentry->ba.end - offset; |
9de48ead MD |
2994 | ba = &tentry->ba; |
2995 | offset = tentry->ba.offset + | |
67e7cb85 | 2996 | (offset - tentry->ba.start); |
47ec0953 MD |
2997 | } |
2998 | vm_map_unlock_read(smap); | |
984263bc MD |
2999 | if (tsize < size) |
3000 | size = tsize; | |
0adbcbd6 MD |
3001 | break; |
3002 | } | |
3003 | case VM_MAPTYPE_NORMAL: | |
9de48ead | 3004 | ba = ¤t->ba; |
0adbcbd6 MD |
3005 | break; |
3006 | default: | |
9de48ead | 3007 | ba = NULL; |
0adbcbd6 | 3008 | break; |
984263bc | 3009 | } |
9de48ead MD |
3010 | if (ba) { |
3011 | object = ba->object; | |
3012 | if (object) | |
3013 | vm_object_hold(object); | |
3014 | } else { | |
3015 | object = NULL; | |
3016 | } | |
b12defdc | 3017 | |
984263bc MD |
3018 | /* |
3019 | * Note that there is absolutely no sense in writing out | |
3020 | * anonymous objects, so we track down the vnode object | |
3021 | * to write out. | |
3022 | * We invalidate (remove) all pages from the address space | |
3023 | * anyway, for semantic correctness. | |
3024 | * | |
3025 | * note: certain anonymous maps, such as MAP_NOSYNC maps, | |
3026 | * may start out with a NULL object. | |
9de48ead MD |
3027 | * |
3028 | * XXX do we really want to stop at the first backing store | |
3029 | * here if there are more? XXX | |
984263bc | 3030 | */ |
9de48ead MD |
3031 | if (ba) { |
3032 | vm_object_t tobj; | |
3033 | ||
3034 | tobj = object; | |
3035 | while (ba->backing_ba != NULL) { | |
67e7cb85 | 3036 | offset -= ba->offset; |
9de48ead MD |
3037 | ba = ba->backing_ba; |
3038 | offset += ba->offset; | |
3039 | tobj = ba->object; | |
3040 | if (tobj->size < OFF_TO_IDX(offset + size)) | |
3041 | size = IDX_TO_OFF(tobj->size) - offset; | |
3042 | break; /* XXX this break is not correct */ | |
3043 | } | |
3044 | if (object != tobj) { | |
3045 | if (object) | |
3046 | vm_object_drop(object); | |
b12defdc | 3047 | object = tobj; |
9de48ead | 3048 | vm_object_hold(object); |
b12defdc | 3049 | } |
984263bc | 3050 | } |
9de48ead | 3051 | |
984263bc | 3052 | if (object && (object->type == OBJT_VNODE) && |
2bc7505b MD |
3053 | (current->protection & VM_PROT_WRITE) && |
3054 | (object->flags & OBJ_NOMSYNC) == 0) { | |
984263bc MD |
3055 | /* |
3056 | * Flush pages if writing is allowed, invalidate them | |
3057 | * if invalidation requested. Pages undergoing I/O | |
3058 | * will be ignored by vm_object_page_remove(). | |
3059 | * | |
3060 | * We cannot lock the vnode and then wait for paging | |
3061 | * to complete without deadlocking against vm_fault. | |
3062 | * Instead we simply call vm_object_page_remove() and | |
3063 | * allow it to block internally on a page-by-page | |
3064 | * basis when it encounters pages undergoing async | |
3065 | * I/O. | |
3066 | */ | |
3067 | int flags; | |
3068 | ||
b12defdc | 3069 | /* no chain wait needed for vnode objects */ |
2de4f77e | 3070 | vm_object_reference_locked(object); |
ca466bae | 3071 | vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY); |
984263bc MD |
3072 | flags = (syncio || invalidate) ? OBJPC_SYNC : 0; |
3073 | flags |= invalidate ? OBJPC_INVAL : 0; | |
1b874851 | 3074 | |
4d4f84f5 | 3075 | if (current->maptype == VM_MAPTYPE_NORMAL) { |
1b874851 MD |
3076 | vm_object_page_clean(object, |
3077 | OFF_TO_IDX(offset), | |
3078 | OFF_TO_IDX(offset + size + PAGE_MASK), | |
3079 | flags); | |
1b874851 | 3080 | } |
a11aaa81 | 3081 | vn_unlock(((struct vnode *)object->handle)); |
2de4f77e | 3082 | vm_object_deallocate_locked(object); |
984263bc MD |
3083 | } |
3084 | if (object && invalidate && | |
3085 | ((object->type == OBJT_VNODE) || | |
f2c2051e JH |
3086 | (object->type == OBJT_DEVICE) || |
3087 | (object->type == OBJT_MGTDEVICE))) { | |
2f1821ca | 3088 | int clean_only = |
f2c2051e JH |
3089 | ((object->type == OBJT_DEVICE) || |
3090 | (object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE; | |
b12defdc | 3091 | /* no chain wait needed for vnode/device objects */ |
2de4f77e | 3092 | vm_object_reference_locked(object); |
4d4f84f5 | 3093 | if (current->maptype == VM_MAPTYPE_NORMAL) { |
1b874851 MD |
3094 | vm_object_page_remove(object, |
3095 | OFF_TO_IDX(offset), | |
3096 | OFF_TO_IDX(offset + size + PAGE_MASK), | |
3097 | clean_only); | |
1b874851 | 3098 | } |
2de4f77e | 3099 | vm_object_deallocate_locked(object); |
984263bc MD |
3100 | } |
3101 | start += size; | |
b12defdc MD |
3102 | if (object) |
3103 | vm_object_drop(object); | |
47ec0953 | 3104 | current = vm_map_rb_tree_RB_NEXT(current); |
984263bc | 3105 | } |
2de4f77e | 3106 | |
b12defdc | 3107 | lwkt_reltoken(&map->token); |
2de4f77e | 3108 | vm_map_unlock_read(map); |
46754a20 | 3109 | |
984263bc MD |
3110 | return (KERN_SUCCESS); |
3111 | } | |
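/*
 * Illustrative sketch, not part of the kernel: the sys_msync() path
 * mentioned above conceptually reduces to
 *
 *	rv = vm_map_clean(map, start, end, syncio, invalidate);
 *
 * with MS_SYNC-style requests mapping to syncio = TRUE and
 * MS_INVALIDATE-style requests to invalidate = TRUE; the actual
 * syscall plumbing is outside this excerpt.
 */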
3112 | ||
3113 | /* | |
46754a20 | 3114 | * Make the region specified by this entry pageable. |
984263bc | 3115 | * |
46754a20 | 3116 | * The vm_map must be exclusively locked. |
984263bc MD |
3117 | */ |
3118 | static void | |
a108bf71 | 3119 | vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) |
984263bc | 3120 | { |
f2d22ebf | 3121 | entry->eflags &= ~MAP_ENTRY_USER_WIRED; |
984263bc | 3122 | entry->wired_count = 0; |
f2d22ebf | 3123 | vm_fault_unwire(map, entry); |
984263bc MD |
3124 | } |
3125 | ||
3126 | /* | |
46754a20 | 3127 | * Deallocate the given entry from the target map. |
984263bc | 3128 | * |
46754a20 | 3129 | * The vm_map must be exclusively locked. |
984263bc MD |
3130 | */ |
3131 | static void | |
a108bf71 | 3132 | vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp) |
984263bc MD |
3133 | { |
3134 | vm_map_entry_unlink(map, entry); | |
67e7cb85 | 3135 | map->size -= entry->ba.end - entry->ba.start; |
a108bf71 | 3136 | vm_map_entry_dispose(map, entry, countp); |
984263bc MD |
3137 | } |
3138 | ||
3139 | /* | |
46754a20 | 3140 | * Deallocates the given address range from the target map. |
984263bc | 3141 | * |
46754a20 | 3142 | * The vm_map must be exclusively locked. |
984263bc MD |
3143 | */ |
3144 | int | |
a108bf71 | 3145 | vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp) |
984263bc MD |
3146 | { |
3147 | vm_object_t object; | |
3148 | vm_map_entry_t entry; | |
3149 | vm_map_entry_t first_entry; | |
e6b81333 | 3150 | vm_offset_t hole_start; |
984263bc | 3151 | |
46754a20 | 3152 | ASSERT_VM_MAP_LOCKED(map); |
b12defdc | 3153 | lwkt_gettoken(&map->token); |
686dbf64 | 3154 | again: |
984263bc | 3155 | /* |
686dbf64 MD |
3156 | * Find the start of the region, and clip it. Set entry to point |
3157 | * at the first record containing the requested address or, if no | |
3158 | * such record exists, the next record with a greater address. The | |
3159 | * loop will run from this point until a record beyond the termination | |
3160 | * address is encountered. | |
3161 | * | |
e6b81333 | 3162 | * Adjust freehint[] for either the clip case or the extension case. |
686dbf64 MD |
3163 | * |
3164 | * GGG see other GGG comment. | |
984263bc | 3165 | */ |
686dbf64 | 3166 | if (vm_map_lookup_entry(map, start, &first_entry)) { |
984263bc | 3167 | entry = first_entry; |
a108bf71 | 3168 | vm_map_clip_start(map, entry, start, countp); |
e6b81333 | 3169 | hole_start = start; |
686dbf64 | 3170 | } else { |
47ec0953 MD |
3171 | if (first_entry) { |
3172 | entry = vm_map_rb_tree_RB_NEXT(first_entry); | |
3173 | if (entry == NULL) | |
67e7cb85 | 3174 | hole_start = first_entry->ba.start; |
47ec0953 | 3175 | else |
67e7cb85 | 3176 | hole_start = first_entry->ba.end; |
47ec0953 MD |
3177 | } else { |
3178 | entry = RB_MIN(vm_map_rb_tree, &map->rb_root); | |
3179 | if (entry == NULL) | |
3180 | hole_start = vm_map_min(map); | |
3181 | else | |
3182 | hole_start = vm_map_max(map); | |
3183 | } | |
984263bc MD |
3184 | } |
3185 | ||
3186 | /* | |
3187 | * Step through all entries in this region | |
3188 | */ | |
67e7cb85 | 3189 | while (entry && entry->ba.start < end) { |
984263bc MD |
3190 | vm_map_entry_t next; |
3191 | vm_offset_t s, e; | |
3192 | vm_pindex_t offidxstart, offidxend, count; | |
3193 | ||
3194 | /* | |
3195 | * If we hit an in-transition entry we have to sleep and | |
3196 | * retry. It's easier (and not really slower) to just retry | |
3197 | * since this case occurs so rarely and the hint is already | |
3198 | * pointing at the right place. We have to reset the | |
3199 | * start offset so as not to accidentally delete an entry
3200 | * that another process just created in vacated space.
3201 | */ | |
3202 | if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { | |
3203 | entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; | |
67e7cb85 | 3204 | start = entry->ba.start; |
12e4aaff MD |
3205 | ++mycpu->gd_cnt.v_intrans_coll; |
3206 | ++mycpu->gd_cnt.v_intrans_wait; | |
641f3b0a | 3207 | vm_map_transition_wait(map, 1); |
984263bc MD |
3208 | goto again; |
3209 | } | |
a108bf71 | 3210 | vm_map_clip_end(map, entry, end, countp); |
984263bc | 3211 | |
67e7cb85 MD |
3212 | s = entry->ba.start; |
3213 | e = entry->ba.end; | |
47ec0953 | 3214 | next = vm_map_rb_tree_RB_NEXT(entry); |
984263bc | 3215 | |
9de48ead | 3216 | offidxstart = OFF_TO_IDX(entry->ba.offset); |
984263bc | 3217 | count = OFF_TO_IDX(e - s); |
0adbcbd6 MD |
3218 | |
3219 | switch(entry->maptype) { | |
3220 | case VM_MAPTYPE_NORMAL: | |
0adbcbd6 | 3221 | case VM_MAPTYPE_SUBMAP: |
9de48ead | 3222 | object = entry->ba.object; |
0adbcbd6 MD |
3223 | break; |
3224 | default: | |
3225 | object = NULL; | |
3226 | break; | |
3227 | } | |
984263bc MD |
3228 | |
3229 | /* | |
3230 | * Unwire before removing addresses from the pmap; otherwise, | |
3231 | * unwiring will put the entries back in the pmap. | |
5eab490e MD |
3232 | * |
3233 | * Generally speaking, doing a bulk pmap_remove() before | |
3234 | * removing the pages from the VM object is better at | |
3235 | * reducing unnecessary IPIs. The pmap code is now optimized | |
3236 | * to not blindly iterate the range when pt and pd pages | |
3237 | * are missing. | |
984263bc | 3238 | */ |
f2d22ebf | 3239 | if (entry->wired_count != 0) |
984263bc | 3240 | vm_map_entry_unwire(map, entry); |
984263bc MD |
3241 | |
3242 | offidxend = offidxstart + count; | |
3243 | ||
712b6620 | 3244 | if (object == kernel_object) { |
5eab490e | 3245 | pmap_remove(map->pmap, s, e); |
b12defdc | 3246 | vm_object_hold(object); |
46754a20 MD |
3247 | vm_object_page_remove(object, offidxstart, |
3248 | offidxend, FALSE); | |
b12defdc MD |
3249 | vm_object_drop(object); |
3250 | } else if (object && object->type != OBJT_DEFAULT && | |
3251 | object->type != OBJT_SWAP) { | |
3252 | /* | |
ce94514e MD |
3253 | * vnode object routines cannot be chain-locked, |
3254 | * but since we aren't removing pages from the | |
3255 | * object here we can use a shared hold. | |
b12defdc | 3256 | */ |
ce94514e | 3257 | vm_object_hold_shared(object); |
b12defdc MD |
3258 | pmap_remove(map->pmap, s, e); |
3259 | vm_object_drop(object); | |
3260 | } else if (object) { | |
3261 | vm_object_hold(object); | |
984263bc | 3262 | pmap_remove(map->pmap, s, e); |
2de4f77e | 3263 | |
984263bc MD |
3264 | if (object != NULL && |
3265 | object->ref_count != 1 && | |
46754a20 MD |
3266 | (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == |
3267 | OBJ_ONEMAPPING && | |
3268 | (object->type == OBJT_DEFAULT || | |
3269 | object->type == OBJT_SWAP)) { | |
641f3b0a MD |
3270 | /* |
3271 | * When ONEMAPPING is set we can destroy the | |
3272 | * pages underlying the entry's range. | |
3273 | */ | |
46754a20 MD |
3274 | vm_object_page_remove(object, offidxstart, |
3275 | offidxend, FALSE); | |
984263bc | 3276 | if (object->type == OBJT_SWAP) { |
46754a20 MD |
3277 | swap_pager_freespace(object, |
3278 | offidxstart, | |
3279 | count); | |
984263bc MD |
3280 | } |
3281 | if (offidxend >= object->size && | |
3282 | offidxstart < object->size) { | |
3283 | object->size = offidxstart; | |
3284 | } | |
3285 | } | |
b12defdc | 3286 | vm_object_drop(object); |
c450821c MD |
3287 | } else if (entry->maptype == VM_MAPTYPE_UKSMAP) { |
3288 | pmap_remove(map->pmap, s, e); | |
984263bc | 3289 | } |
b4460ab3 | 3290 | |
984263bc MD |
3291 | /* |
3292 | * Delete the entry (which may delete the object) only after | |
3293 | * removing all pmap entries pointing to its pages. | |
3294 | * (Otherwise, its page frames may be reallocated, and any | |
3295 | * modify bits will be set in the wrong object!) | |
3296 | */ | |
a108bf71 | 3297 | vm_map_entry_delete(map, entry, countp); |
984263bc MD |
3298 | entry = next; |
3299 | } | |
47ec0953 MD |
3300 | |
3301 | /* | |
3302 | * We either reached the end and use vm_map_max as the end | |
3303 | * address, or we didn't and we use the next entry as the | |
3304 | * end address. | |
3305 | */ | |
3306 | if (entry == NULL) { | |
3307 | vm_map_freehint_hole(map, hole_start, | |
3308 | vm_map_max(map) - hole_start); | |
3309 | } else { | |
e6b81333 | 3310 | vm_map_freehint_hole(map, hole_start, |
67e7cb85 | 3311 | entry->ba.start - hole_start); |
47ec0953 | 3312 | } |
e6b81333 | 3313 | |
b12defdc | 3314 | lwkt_reltoken(&map->token); |
e6b81333 | 3315 | |
984263bc MD |
3316 | return (KERN_SUCCESS); |
3317 | } | |
3318 | ||
3319 | /* | |
46754a20 MD |
3320 | * Remove the given address range from the target map. |
3321 | * This is the exported form of vm_map_delete. | |
984263bc | 3322 | * |
46754a20 | 3323 | * No requirements. |
984263bc MD |
3324 | */ |
3325 | int | |
a108bf71 | 3326 | vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) |
984263bc | 3327 | { |
03aa8d99 | 3328 | int result; |
a108bf71 | 3329 | int count; |
984263bc | 3330 | |
a108bf71 | 3331 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
984263bc MD |
3332 | vm_map_lock(map); |
3333 | VM_MAP_RANGE_CHECK(map, start, end); | |
a108bf71 | 3334 | result = vm_map_delete(map, start, end, &count); |
984263bc | 3335 | vm_map_unlock(map); |
a108bf71 | 3336 | vm_map_entry_release(count); |
984263bc | 3337 | |
984263bc MD |
3338 | return (result); |
3339 | } | |
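/*
 * Illustrative sketch, not part of the kernel: because this exported
 * wrapper handles the map lock and the entry reservation itself,
 * unmapping a range reduces to a single call:
 *
 *	if (vm_map_remove(map, addr, addr + size) != KERN_SUCCESS)
 *		... the range could not be fully deallocated ...
 */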
3340 | ||
3341 | /* | |
46754a20 MD |
3342 | * Assert that the target map allows the specified privilege on the |
3343 | * entire address region given. The entire region must be allocated. | |
984263bc | 3344 | * |
46754a20 | 3345 | * The caller must specify whether the vm_map is already locked or not. |
984263bc MD |
3346 | */ |
3347 | boolean_t | |
3348 | vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, | |
46754a20 | 3349 | vm_prot_t protection, boolean_t have_lock) |
984263bc MD |
3350 | { |
3351 | vm_map_entry_t entry; | |
3352 | vm_map_entry_t tmp_entry; | |
46754a20 MD |
3353 | boolean_t result; |
3354 | ||
3355 | if (have_lock == FALSE) | |
3356 | vm_map_lock_read(map); | |
984263bc MD |
3357 | |
3358 | if (!vm_map_lookup_entry(map, start, &tmp_entry)) { | |
46754a20 MD |
3359 | if (have_lock == FALSE) |
3360 | vm_map_unlock_read(map); | |
984263bc MD |
3361 | return (FALSE); |
3362 | } | |
3363 | entry = tmp_entry; | |
3364 | ||
46754a20 | 3365 | result = TRUE; |
984263bc | 3366 | while (start < end) { |
47ec0953 | 3367 | if (entry == NULL) { |
46754a20 MD |
3368 | result = FALSE; |
3369 | break; | |
984263bc | 3370 | } |
47ec0953 | 3371 | |
984263bc MD |
3372 | /* |
3373 | * No holes allowed! | |
3374 | */ | |
3375 | ||
67e7cb85 | 3376 | if (start < entry->ba.start) { |
46754a20 MD |
3377 | result = FALSE; |
3378 | break; | |
984263bc MD |
3379 | } |
3380 | /* | |
3381 | * Check protection associated with entry. | |
3382 | */ | |
3383 | ||
3384 | if ((entry->protection & protection) != protection) { | |
46754a20 MD |
3385 | result = FALSE; |
3386 | break; | |
984263bc MD |
3387 | } |
3388 | /* go to next entry */ | |
67e7cb85 | 3389 | start = entry->ba.end; |
47ec0953 | 3390 | entry = vm_map_rb_tree_RB_NEXT(entry); |
984263bc | 3391 | } |
46754a20 MD |
3392 | if (have_lock == FALSE) |
3393 | vm_map_unlock_read(map); | |
3394 | return (result); | |
984263bc MD |
3395 | } |
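/*
 * Illustrative sketch, not part of the kernel: asserting that an
 * unlocked user range is readable before operating on it, letting
 * the routine take its own read lock via have_lock == FALSE:
 *
 *	if (!vm_map_check_protection(map, start, end,
 *				     VM_PROT_READ, FALSE)) {
 *		return (KERN_PROTECTION_FAILURE);
 *	}
 */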
3396 | ||
5b329e62 | 3397 | /* |
67e7cb85 MD |
3398 | * vm_map_backing structures are not shared across forks and must be |
3399 | * replicated. | |
3400 | * | |
3401 | * Generally speaking we must reallocate the backing_ba sequence and | |
3402 | * also adjust it for any changes made to the base entry->ba.start and | |
3403 | * entry->ba.end. The first ba in the chain is of course &entry->ba, | |
3404 | * so we only need to adjust subsequent ba's start, end, and offset. | |
5b329e62 MD |
3405 | * |
3406 | * MAP_BACK_CLIPPED - Called as part of a clipping replication. | |
3407 | * Do not clear OBJ_ONEMAPPING. | |
3408 | * | |
3409 | * MAP_BACK_BASEOBJREFD - Called from vm_map_insert(). The base object | |
3410 | * has already been referenced. | |
3411 | */ | |
3412 | static | |
3413 | void | |
3414 | vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags) | |
3415 | { | |
3416 | vm_map_backing_t ba; | |
3417 | vm_map_backing_t nba; | |
3418 | vm_object_t object; | |
3419 | ||
3420 | ba = &entry->ba; | |
3421 | for (;;) { | |
67e7cb85 | 3422 | ba->pmap = map->pmap; |
4aa6d05c MD |
3423 | |
3424 | if (ba->map_object) { | |
3425 | switch(entry->maptype) { | |
4aa6d05c MD |
3426 | case VM_MAPTYPE_NORMAL: |
3427 | object = ba->object; | |
3428 | if (ba != &entry->ba || | |
3429 | (flags & MAP_BACK_BASEOBJREFD) == 0) { | |
3430 | vm_object_reference_quick(object); | |
3431 | } | |
3432 | vm_map_backing_attach(entry, ba); | |
3433 | if ((flags & MAP_BACK_CLIPPED) == 0 && | |
3434 | object->ref_count > 1) { | |
3435 | vm_object_clear_flag(object, | |
3436 | OBJ_ONEMAPPING); | |
3437 | } | |
3438 | break; | |
3439 | case VM_MAPTYPE_UKSMAP: | |
3440 | vm_map_backing_attach(entry, ba); | |
3441 | break; | |
3442 | default: | |
3443 | break; | |
5b329e62 MD |
3444 | } |
3445 | } | |
3446 | if (ba->backing_ba == NULL) | |
3447 | break; | |
64b5a8a5 MD |
3448 | |
3449 | /* | |
3450 | * NOTE: The aux_info field is retained. | |
3451 | */ | |
5b329e62 MD |
3452 | nba = kmalloc(sizeof(*nba), M_MAP_BACKING, M_INTWAIT); |
3453 | *nba = *ba->backing_ba; | |
67e7cb85 MD |
3454 | nba->offset += (ba->start - nba->start); /* += (new - old) */ |
3455 | nba->start = ba->start; | |
3456 | nba->end = ba->end; | |
5b329e62 MD |
3457 | ba->backing_ba = nba; |
3458 | ba = nba; | |
67e7cb85 | 3459 | /* pmap is replaced at the top of the loop */ |
5b329e62 | 3460 | } |
5b329e62 MD |
3461 | } |
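/*
 * Worked example of the offset adjustment above (hypothetical
 * numbers): suppose the backing ba being duplicated covered
 * [0x1000, 0x9000) at offset 0 and the front ba was clipped to
 * [0x3000, 0x5000).  Then
 *
 *	nba->offset = 0 + (0x3000 - 0x1000) = 0x2000
 *	nba->start  = 0x3000
 *	nba->end    = 0x5000
 *
 * so the replicated backing still resolves to the same object pages
 * under the clipped virtual range.
 */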
3462 | ||
67e7cb85 MD |
3463 | static |
3464 | void | |
3465 | vm_map_backing_adjust_start(vm_map_entry_t entry, vm_ooffset_t start) | |
3466 | { | |
3467 | vm_map_backing_t ba; | |
3468 | ||
4d4f84f5 | 3469 | if (entry->maptype == VM_MAPTYPE_NORMAL) { |
67e7cb85 MD |
3470 | for (ba = &entry->ba; ba; ba = ba->backing_ba) { |
3471 | if (ba->object) { | |
567a6398 | 3472 | lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE); |
67e7cb85 MD |
3473 | ba->offset += (start - ba->start); |
3474 | ba->start = start; | |
567a6398 | 3475 | lockmgr(&ba->object->backing_lk, LK_RELEASE); |
67e7cb85 MD |
3476 | } else { |
3477 | ba->offset += (start - ba->start); | |
3478 | ba->start = start; | |
3479 | } | |
3480 | } | |
3481 | } else { | |
3482 | /* not an object and can't be shadowed */ | |
3483 | } | |
3484 | } | |
3485 | ||
3486 | static | |
3487 | void | |
3488 | vm_map_backing_adjust_end(vm_map_entry_t entry, vm_ooffset_t end) | |
3489 | { | |
3490 | vm_map_backing_t ba; | |
3491 | ||
4d4f84f5 | 3492 | if (entry->maptype == VM_MAPTYPE_NORMAL) { |
67e7cb85 MD |
3493 | for (ba = &entry->ba; ba; ba = ba->backing_ba) { |
3494 | if (ba->object) { | |
567a6398 | 3495 | lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE); |
67e7cb85 | 3496 | ba->end = end; |
567a6398 | 3497 | lockmgr(&ba->object->backing_lk, LK_RELEASE); |
67e7cb85 MD |
3498 | } else { |
3499 | ba->end = end; | |
3500 | } | |
3501 | } | |
4d4f84f5 | 3502 | } /* else not an object and/or can't be shadowed */ |
67e7cb85 MD |
3503 | } |
3504 | ||
984263bc | 3505 | /* |
9de48ead | 3506 | * Handles the dirty work of making src_entry and dst_entry copy-on-write |
5b329e62 | 3507 | * after src_entry has been cloned to dst_entry. For normal entries only. |
984263bc | 3508 | * |
d2d8515b | 3509 | * The vm_maps must be exclusively locked. |
b12defdc | 3510 | * The vm_map's token must be held. |
d2d8515b MD |
3511 | * |
3512 | * Because the maps are locked no faults can be in progress during the | |
3513 | * operation. | |
984263bc MD |
3514 | */ |
3515 | static void | |
a108bf71 | 3516 | vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map, |
b12defdc | 3517 | vm_map_entry_t src_entry, vm_map_entry_t dst_entry) |
984263bc | 3518 | { |
5b329e62 | 3519 | vm_object_t obj; |
984263bc | 3520 | |
4d4f84f5 | 3521 | KKASSERT(dst_entry->maptype == VM_MAPTYPE_NORMAL); |
984263bc | 3522 | |
4d4f84f5 | 3523 | if (src_entry->wired_count) { |
984263bc | 3524 | /* |
44293a80 MD |
3525 | * Of course, wired down pages can't be set copy-on-write. |
3526 | * Cause wired pages to be copied into the new map by | |
3527 | * simulating faults (the new pages are pageable) | |
641f3b0a | 3528 | * |
44293a80 MD |
3529 | * Scrap ba.object (its ref-count has not yet been adjusted |
3530 | * so we can just NULL out the field). Remove the backing | |
3531 | * store. | |
3532 | * | |
3533 | * Then call vm_fault_copy_entry() to create a new object | |
3534 | * in dst_entry and copy the wired pages from src to dst. | |
175f5a88 MD |
3535 | * |
3536 | * The fault-copy code doesn't work with virtual page | |
3537 | * tables. | |
64b5a8a5 MD |
3538 | * |
3539 | * NOTE: obj is not actually an object for all MAPTYPEs, | |
3540 | * just test against NULL. | |
984263bc | 3541 | */ |
64b5a8a5 MD |
3542 | if (dst_entry->ba.map_object != NULL) { |
3543 | vm_map_backing_detach(dst_entry, &dst_entry->ba); | |
3544 | dst_entry->ba.map_object = NULL; | |
3545 | vm_map_entry_dispose_ba(dst_entry, | |
3546 | dst_entry->ba.backing_ba); | |
5b329e62 MD |
3547 | dst_entry->ba.backing_ba = NULL; |
3548 | dst_entry->ba.backing_count = 0; | |
3549 | } | |
44293a80 MD |
3550 | vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); |
3551 | } else { | |
984263bc | 3552 | if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { |
44293a80 MD |
3553 | /* |
3554 | * If the source entry is not already marked NEEDS_COPY | |
3555 | * we need to write-protect the PTEs. | |
3556 | */ | |
984263bc | 3557 | pmap_protect(src_map->pmap, |
67e7cb85 MD |
3558 | src_entry->ba.start, |
3559 | src_entry->ba.end, | |
44293a80 | 3560 | src_entry->protection & ~VM_PROT_WRITE); |
984263bc MD |
3561 | } |
3562 | ||
3563 | /* | |
44293a80 MD |
3564 | * dst_entry.ba_object might be stale. Update it (its |
3565 | * ref-count has not yet been updated so just overwrite | |
3566 | * the field). | |
9de48ead MD |
3567 | * |
3568 | * If there is no object then we are golden. Also, in | |
3569 | * this situation if there are no backing_ba linkages then | |
67e7cb85 MD |
3570 | * we can set ba.offset to whatever we want. For now we |
3571 | * set the offset to 0 to make debugging object sizes
3572 | * easier. | |
984263bc | 3573 | */ |
5b329e62 | 3574 | obj = src_entry->ba.object; |
641f3b0a | 3575 | |
5b329e62 | 3576 | if (obj) { |
b12defdc MD |
3577 | src_entry->eflags |= (MAP_ENTRY_COW | |
3578 | MAP_ENTRY_NEEDS_COPY); | |
3579 | dst_entry->eflags |= (MAP_ENTRY_COW | | |
3580 | MAP_ENTRY_NEEDS_COPY); | |
44293a80 | 3581 | KKASSERT(dst_entry->ba.offset == src_entry->ba.offset); |
984263bc | 3582 | } else { |
67e7cb85 | 3583 | dst_entry->ba.offset = 0; |
984263bc | 3584 | } |
44293a80 | 3585 | |
984263bc | 3586 | /* |
44293a80 MD |
3587 | * Normal, allow the backing_ba link depth to |
3588 | * increase. | |
984263bc | 3589 | */ |
44293a80 | 3590 | pmap_copy(dst_map->pmap, src_map->pmap, |
67e7cb85 MD |
3591 | dst_entry->ba.start, |
3592 | dst_entry->ba.end - dst_entry->ba.start, | |
3593 | src_entry->ba.start); | |
984263bc MD |
3594 | } |
3595 | } | |
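/*
 * Illustrative fault sequence (hypothetical, for clarity): after the
 * unwired path above, both maps' PTEs for the range are read-only
 * and both entries carry MAP_ENTRY_NEEDS_COPY.
 *
 *	parent writes page N -> write fault -> private copy of N
 *	child reads page N   -> still sees the original page
 *
 * Only touched pages are ever duplicated.
 */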
3596 | ||
3597 | /* | |
8492a2fe MD |
3598 | * Create a vmspace for a new process and its related vm_map based on an |
3599 | * existing vmspace. The new map inherits information from the old map | |
3600 | * according to inheritance settings. | |
984263bc MD |
3601 | * |
3602 | * The source map must not be locked. | |
46754a20 | 3603 | * No requirements. |
984263bc | 3604 | */ |
0adbcbd6 MD |
3605 | static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map, |
3606 | vm_map_entry_t old_entry, int *countp); | |
4aa6d05c MD |
3607 | static void vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2, |
3608 | vm_map_t old_map, vm_map_t new_map, | |
0adbcbd6 MD |
3609 | vm_map_entry_t old_entry, int *countp); |
3610 | ||
984263bc | 3611 | struct vmspace * |
4aa6d05c | 3612 | vmspace_fork(struct vmspace *vm1, struct proc *p2, struct lwp *lp2) |
984263bc MD |
3613 | { |
3614 | struct vmspace *vm2; | |
3615 | vm_map_t old_map = &vm1->vm_map; | |
3616 | vm_map_t new_map; | |
3617 | vm_map_entry_t old_entry; | |
a108bf71 | 3618 | int count; |
984263bc | 3619 | |
b12defdc | 3620 | lwkt_gettoken(&vm1->vm_map.token); |
984263bc | 3621 | vm_map_lock(old_map); |
984263bc | 3622 | |
47ec0953 | 3623 | vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map)); |
b12defdc | 3624 | lwkt_gettoken(&vm2->vm_map.token); |
641f3b0a MD |
3625 | |
3626 | /* | |
3627 | * We must bump the timestamp to force any concurrent fault | |
3628 | * to retry. | |
3629 | */ | |
984263bc | 3630 | bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, |
641f3b0a | 3631 | (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy); |
984263bc MD |
3632 | new_map = &vm2->vm_map; /* XXX */ |
3633 | new_map->timestamp = 1; | |
3634 | ||
46754a20 MD |
3635 | vm_map_lock(new_map); |
3636 | ||
47ec0953 | 3637 | count = old_map->nentries; |
a108bf71 MD |
3638 | count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT); |
3639 | ||
47ec0953 | 3640 | RB_FOREACH(old_entry, vm_map_rb_tree, &old_map->rb_root) { |
0adbcbd6 MD |
3641 | switch(old_entry->maptype) { |
3642 | case VM_MAPTYPE_SUBMAP: | |
984263bc | 3643 | panic("vm_map_fork: encountered a submap"); |
984263bc | 3644 | break; |
0adbcbd6 | 3645 | case VM_MAPTYPE_UKSMAP: |
4aa6d05c MD |
3646 | vmspace_fork_uksmap_entry(p2, lp2, |
3647 | old_map, new_map, | |
0adbcbd6 | 3648 | old_entry, &count); |
984263bc | 3649 | break; |
0adbcbd6 | 3650 | case VM_MAPTYPE_NORMAL: |
0adbcbd6 MD |
3651 | vmspace_fork_normal_entry(old_map, new_map, |
3652 | old_entry, &count); | |
984263bc | 3653 | break; |
4d4f84f5 MD |
3654 | default: |
3655 | /* nothing to do */ | |
3656 | break; | |
984263bc | 3657 | } |
984263bc MD |
3658 | } |
3659 | ||
3660 | new_map->size = old_map->size; | |
46754a20 | 3661 | vm_map_unlock(new_map); |
9de48ead | 3662 | vm_map_unlock(old_map); |
a108bf71 | 3663 | vm_map_entry_release(count); |
2de4f77e | 3664 | |
b12defdc MD |
3665 | lwkt_reltoken(&vm2->vm_map.token); |
3666 | lwkt_reltoken(&vm1->vm_map.token); | |
984263bc MD |
3667 | |
3668 | return (vm2); | |
3669 | } | |
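/*
 * Illustrative sketch (the fork call site is outside this excerpt):
 * a fork conceptually hands the child its copied address space via
 *
 *	vm2 = vmspace_fork(p1->p_vmspace, p2, lp2);
 *
 * whereas vmspace_exec()/vmspace_unshare() below pass NULL for
 * p2/lp2 because no child process is being constructed.
 */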
3670 | ||
0adbcbd6 MD |
3671 | static |
3672 | void | |
3673 | vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map, | |
3674 | vm_map_entry_t old_entry, int *countp) | |
3675 | { | |
3676 | vm_map_entry_t new_entry; | |
1c024bc6 | 3677 | vm_map_backing_t ba; |
0adbcbd6 MD |
3678 | vm_object_t object; |
3679 | ||
1c024bc6 MD |
3680 | /* |
3681 | * If the backing_ba link list gets too long then fault it | |
3682 | * all into the head object and dispose of the list. We do | |
3683 | * this in old_entry prior to cloning in order to benefit both | |
3684 | * parent and child. | |
3685 | * | |
3686 | * We can test our fronting object's size against its | |
3687 | * resident_page_count for a really cheap (but probably not perfect) | |
3688 | * all-shadowed test, allowing us to disconnect the backing_ba | |
3689 | * link list early. | |
3690 | */ | |
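/*
 * Illustrative example of the cheap test (hypothetical numbers): a
 * fronting object with size == 100 pages and resident_page_count ==
 * 100 cannot expose anything from its backing chain, so the chain
 * may be collapsed into the head object and disposed of before the
 * clone.  A partially-resident object might still be fully
 * shadowing, which is why the test is cheap but not perfect.
 */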
3691 | object = old_entry->ba.object; | |
3692 | if (old_entry->ba.backing_ba && | |
3693 | (old_entry->ba.backing_count >= vm_map_backing_limit || | |
3694 | (vm_map_backing_shadow_test && object && | |
3695 | object->size == object->resident_page_count))) { | |
3696 | /* | |
3697 | * If there are too many backing_ba linkages we | |
3698 | * collapse everything into the head | |
3699 | * | |
3700 | * This will also remove all the pte's. | |
3701 | */ | |
3702 | if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) | |
5b329e62 | 3703 | vm_map_entry_shadow(old_entry); |
1c024bc6 MD |
3704 | if (object == NULL) |
3705 | vm_map_entry_allocate_object(old_entry); | |
3706 | if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) { | |
3707 | ba = old_entry->ba.backing_ba; | |
3708 | old_entry->ba.backing_ba = NULL; | |
3709 | old_entry->ba.backing_count = 0; | |
64b5a8a5 | 3710 | vm_map_entry_dispose_ba(old_entry, ba); |
1c024bc6 MD |
3711 | } |
3712 | } | |
3713 | object = NULL; /* object variable is now invalid */ | |
3714 | ||
3715 | /* | |
3716 | * Fork the entry | |
3717 | */ | |
0adbcbd6 MD |
3718 | switch (old_entry->inheritance) { |
3719 | case VM_INHERIT_NONE: | |
3720 | break; | |
3721 | case VM_INHERIT_SHARE: | |
3722 | /* | |
9de48ead MD |
3723 | * Clone the entry as a shared entry. This will look like |
3724 | * shared memory across the old and the new process. We must | |
3725 | * ensure that the object is allocated. | |
0adbcbd6 | 3726 | */ |
9de48ead | 3727 | if (old_entry->ba.object == NULL) |
0adbcbd6 MD |
3728 | vm_map_entry_allocate_object(old_entry); |
3729 | ||
3730 | if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { | |
3731 | /* | |
9de48ead MD |
3732 | * Create the fronting vm_map_backing for |
3733 | * an entry which needs a copy, plus an extra | |
3734 | * ref because we are going to duplicate it | |
3735 | * in the fork. | |
3736 | * | |
3737 | * The call to vm_map_entry_shadow() will also clear | |
0adbcbd6 | 3738 | * OBJ_ONEMAPPING. |
9de48ead MD |
3739 | * |
3740 | * XXX no more collapse. Still need extra ref | |
3741 | * for the fork. | |
0adbcbd6 | 3742 | */ |
5b329e62 | 3743 | vm_map_entry_shadow(old_entry); |
9de48ead | 3744 | } else if (old_entry->ba.object) { |
9de48ead | 3745 | object = old_entry->ba.object; |
0adbcbd6 MD |
3746 | } |
3747 | ||
3748 | /* | |
3749 | * Clone the entry. We've already bumped the ref on | |
9de48ead | 3750 | * the vm_object for our new entry. |
0adbcbd6 | 3751 | */ |
5b329e62 | 3752 | new_entry = vm_map_entry_create(countp); |
0adbcbd6 | 3753 | *new_entry = *old_entry; |
9de48ead | 3754 | |
0adbcbd6 MD |
3755 | new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; |
3756 | new_entry->wired_count = 0; | |
3757 | ||
3758 | /* | |
5b329e62 MD |
3759 | * Replicate and index the vm_map_backing. Don't share |
3760 | * the vm_map_backing across vm_map's (only across clips). | |
3761 | * | |
0adbcbd6 MD |
3762 | * Insert the entry into the new map -- we know we're |
3763 | * inserting at the end of the new map. | |
3764 | */ | |
5b329e62 | 3765 | vm_map_backing_replicated(new_map, new_entry, 0); |
47ec0953 | 3766 | vm_map_entry_link(new_map, new_entry); |
0adbcbd6 MD |
3767 | |
3768 | /* | |
3769 | * Update the physical map | |
3770 | */ | |
3771 | pmap_copy(new_map->pmap, old_map->pmap, | |
67e7cb85 MD |
3772 | new_entry->ba.start, |
3773 | (old_entry->ba.end - old_entry->ba.start), | |
3774 | old_entry->ba.start); | |
0adbcbd6 MD |
3775 | break; |
3776 | case VM_INHERIT_COPY: | |
3777 | /* | |
9de48ead MD |
3778 | * Clone the entry and link the copy into the new map. |
3779 | * | |
3780 | * Note that ref-counting adjustment for old_entry->ba.object | |
3781 | * (if it isn't a special map that is) is handled by | |
3782 | * vm_map_copy_entry(). | |
0adbcbd6 | 3783 | */ |
5b329e62 | 3784 | new_entry = vm_map_entry_create(countp); |
0adbcbd6 | 3785 | *new_entry = *old_entry; |
9de48ead | 3786 | |
0adbcbd6 MD |
3787 | new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; |
3788 | new_entry->wired_count = 0; | |
9de48ead | 3789 | |
5b329e62 | 3790 | vm_map_backing_replicated(new_map, new_entry, 0); |
47ec0953 | 3791 | vm_map_entry_link(new_map, new_entry); |
9de48ead MD |
3792 | |
3793 | /* | |
3794 | * This does the actual dirty work of making both entries | |
3795 | * copy-on-write, and will also handle the fronting object. | |
3796 | */ | |
3797 | vm_map_copy_entry(old_map, new_map, old_entry, new_entry); | |
0adbcbd6 MD |
3798 | break; |
3799 | } | |
3800 | } | |
3801 | ||
3802 | /* | |
3803 | * When forking user-kernel shared maps, the map might change in the | |
3804 | * child so do not try to copy the underlying pmap entries. | |
3805 | */ | |
3806 | static | |
3807 | void | |
4aa6d05c MD |
3808 | vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2, |
3809 | vm_map_t old_map, vm_map_t new_map, | |
0adbcbd6 MD |
3810 | vm_map_entry_t old_entry, int *countp) |
3811 | { | |
3812 | vm_map_entry_t new_entry; | |
3813 | ||
4aa6d05c MD |
3814 | /* |
3815 | * Do not fork lpmap entries whose TIDs do not match lp2's tid.
3816 | * | |
3817 | * XXX if p2 is NULL and lp2 is non-NULL, we retain the lpmap entry | |
3818 | * (this is for e.g. resident'ing vmspace's) but set the field | |
3819 | * to NULL. Upon restore it should be restored. XXX NOT IMPL YET | |
3820 | */ | |
3821 | if (old_entry->aux.dev) { | |
3822 | switch(minor(old_entry->aux.dev)) { | |
3823 | case 5: | |
3824 | break; | |
3825 | case 6: | |
3826 | break; | |
3827 | case 7: | |
3828 | if (lp2 == NULL) | |
3829 | return; | |
3830 | if (old_entry->ba.aux_info == NULL) | |
3831 | return; | |
3832 | if (((struct lwp *)old_entry->ba.aux_info)->lwp_tid != | |
3833 | lp2->lwp_tid) | |
3834 | return; | |
3835 | break; | |
3836 | } | |
3837 | } | |
3838 | ||
5b329e62 | 3839 | new_entry = vm_map_entry_create(countp); |
0adbcbd6 | 3840 | *new_entry = *old_entry; |
9de48ead | 3841 | |
0adbcbd6 MD |
3842 | new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; |
3843 | new_entry->wired_count = 0; | |
5b329e62 | 3844 | KKASSERT(new_entry->ba.backing_ba == NULL); |
4aa6d05c MD |
3845 | |
3846 | if (new_entry->aux.dev) { | |
3847 | switch(minor(new_entry->aux.dev)) { | |
3848 | case 5: | |
3849 | /* | |
3850 | * upmap | |
3851 | */ | |
3852 | new_entry->ba.aux_info = p2; | |
3853 | break; | |
3854 | case 6: | |
3855 | /* | |
3856 | * kpmap | |
3857 | */ | |
3858 | new_entry->ba.aux_info = NULL; | |
3859 | break; | |
3860 | case 7: | |
3861 | /* | |
3862 | * lpmap | |
3863 | */ | |
3864 | new_entry->ba.aux_info = lp2; | |
3865 | break; | |
3866 | } | |
3867 | } else { | |
3868 | new_entry->ba.aux_info = NULL; | |
3869 | } | |
3870 | ||
5b329e62 | 3871 | vm_map_backing_replicated(new_map, new_entry, 0); |
9de48ead | 3872 | |
47ec0953 | 3873 | vm_map_entry_link(new_map, new_entry); |
0adbcbd6 MD |
3874 | } |
3875 | ||
46754a20 MD |
3876 | /* |
3877 | * Create an auto-grow stack entry | |
3878 | * | |
3879 | * No requirements. | |
3880 | */ | |
984263bc | 3881 | int |
d6924570 | 3882 | vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize, |
c809941b | 3883 | int flags, vm_prot_t prot, vm_prot_t max, int cow) |
984263bc | 3884 | { |
85d25bcf | 3885 | vm_map_entry_t prev_entry; |
47ec0953 | 3886 | vm_map_entry_t next; |
85d25bcf MD |
3887 | vm_size_t init_ssize; |
3888 | int rv; | |
a108bf71 | 3889 | int count; |
85d25bcf | 3890 | vm_offset_t tmpaddr; |
984263bc | 3891 | |
c809941b | 3892 | cow |= MAP_IS_STACK; |
984263bc MD |
3893 | |
3894 | if (max_ssize < sgrowsiz) | |
3895 | init_ssize = max_ssize; | |
3896 | else | |
3897 | init_ssize = sgrowsiz; | |
3898 | ||
a108bf71 | 3899 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
984263bc MD |
3900 | vm_map_lock(map); |
3901 | ||
85d25bcf MD |
3902 | /* |
3903 | * Find space for the mapping | |
3904 | */ | |
cadb984b | 3905 | if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) { |
d6924570 | 3906 | if (vm_map_findspace(map, *addrbos, max_ssize, 1, |
c809941b | 3907 | flags, &tmpaddr)) { |
85d25bcf MD |
3908 | vm_map_unlock(map); |
3909 | vm_map_entry_release(count); | |
3910 | return (KERN_NO_SPACE); | |
3911 | } | |
d6924570 | 3912 | *addrbos = tmpaddr; |
85d25bcf MD |
3913 | } |
3914 | ||
984263bc | 3915 | /* If addr is already mapped, no go */ |
d6924570 | 3916 | if (vm_map_lookup_entry(map, *addrbos, &prev_entry)) { |
984263bc | 3917 | vm_map_unlock(map); |
a108bf71 | 3918 | vm_map_entry_release(count); |
984263bc MD |
3919 | return (KERN_NO_SPACE); |
3920 | } | |
3921 | ||
85d25bcf MD |
3922 | #if 0 |
3923 | /* XXX already handled by kern_mmap() */ | |
984263bc MD |
3924 | /* If we would blow our VMEM resource limit, no go */ |
3925 | if (map->size + init_ssize > | |
3926 | curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) { | |
3927 | vm_map_unlock(map); | |
a108bf71 | 3928 | vm_map_entry_release(count); |
984263bc MD |
3929 | return (KERN_NO_SPACE); |
3930 | } | |
85d25bcf | 3931 | #endif |
984263bc | 3932 | |
85d25bcf MD |
3933 | /* |
3934 | * If we can't accommodate max_ssize in the current mapping,
984263bc MD |
3935 | * no go. However, we need to be aware that subsequent user |
3936 | * mappings might map into the space we have reserved for | |
3937 | * stack, and currently this space is not protected. | |
3938 | * | |
3939 | * Hopefully we will at least detect this condition | |
3940 | * when we try to grow the stack. | |
3941 | */ | |
47ec0953 MD |
3942 | if (prev_entry) |
3943 | next = vm_map_rb_tree_RB_NEXT(prev_entry); | |
3944 | else | |
3945 | next = RB_MIN(vm_map_rb_tree, &map->rb_root); | |
3946 | ||
67e7cb85 | 3947 | if (next && next->ba.start < *addrbos + max_ssize) { |
984263bc | 3948 | vm_map_unlock(map); |
a108bf71 | 3949 | vm_map_entry_release(count); |
984263bc MD |
3950 | return (KERN_NO_SPACE); |
3951 | } | |
3952 | ||
85d25bcf MD |
3953 | /* |
3954 | * We initially map a stack of only init_ssize. We will | |
984263bc MD |
3955 | * grow as needed later. Since this is to be a grow |
3956 | * down stack, we map at the top of the range. | |
3957 | * | |
3958 | * Note: we would normally expect prot and max to be | |
3959 | * VM_PROT_ALL, and cow to be 0. Possibly we should | |
3960 | * eliminate these as input parameters, and just | |
3961 | * pass these values here in the insert call. | |
3962 | */ | |
64b5a8a5 MD |
3963 | rv = vm_map_insert(map, &count, |
3964 | NULL, NULL, | |
3965 | 0, NULL, | |
3966 | *addrbos + max_ssize - init_ssize, | |
d6924570 | 3967 | *addrbos + max_ssize, |
1b874851 | 3968 | VM_MAPTYPE_NORMAL, |
3091de50 | 3969 | VM_SUBSYS_STACK, prot, max, cow); |
984263bc MD |
3970 | |
3971 | /* Now set the avail_ssize amount */ | |
517e1666 | 3972 | if (rv == KERN_SUCCESS) { |
47ec0953 MD |
3973 | if (prev_entry) |
3974 | next = vm_map_rb_tree_RB_NEXT(prev_entry); | |
3975 | else | |
3976 | next = RB_MIN(vm_map_rb_tree, &map->rb_root); | |
3977 | if (prev_entry != NULL) { | |
3978 | vm_map_clip_end(map, | |
3979 | prev_entry, | |
d6924570 | 3980 | *addrbos + max_ssize - init_ssize, |
47ec0953 MD |
3981 | &count); |
3982 | } | |
67e7cb85 MD |
3983 | if (next->ba.end != *addrbos + max_ssize || |
3984 | next->ba.start != *addrbos + max_ssize - init_ssize){ | |
984263bc | 3985 | panic ("Bad entry start/end for new stack entry"); |
47ec0953 MD |
3986 | } else { |
3987 | next->aux.avail_ssize = max_ssize - init_ssize; | |
3988 | } | |
984263bc MD |
3989 | } |
3990 | ||
3991 | vm_map_unlock(map); | |
a108bf71 | 3992 | vm_map_entry_release(count); |
984263bc MD |
3993 | return (rv); |
3994 | } | |
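/*
 * Illustrative sketch, not part of the kernel: creating an auto-grow
 * stack with the conventional parameters noted above, letting the
 * map pick the address (max_size stands in for the caller's stack
 * size limit):
 *
 *	vm_offset_t addr = 0;
 *
 *	rv = vm_map_stack(map, &addr, max_size, 0,
 *			  VM_PROT_ALL, VM_PROT_ALL, 0);
 */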
3995 | ||
46754a20 MD |
3996 | /* |
3997 | * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the | |
984263bc MD |
3998 | * desired address is already mapped, or if we successfully grow |
3999 | * the stack. Also returns KERN_SUCCESS if addr is outside the | |
4000 | * stack range (this is strange, but preserves compatibility with | |
4001 | * the grow function in vm_machdep.c). | |
46754a20 MD |
4002 | * |
4003 | * No requirements. | |
984263bc MD |
4004 | */ |
4005 | int | |
95270b7e | 4006 | vm_map_growstack (vm_map_t map, vm_offset_t addr) |
984263bc MD |
4007 | { |
4008 | vm_map_entry_t prev_entry; | |
4009 | vm_map_entry_t stack_entry; | |
47ec0953 | 4010 | vm_map_entry_t next; |
95270b7e MD |
4011 | struct vmspace *vm; |
4012 | struct lwp *lp; | |
4013 | struct proc *p; | |
984263bc | 4014 | vm_offset_t end; |
a108bf71 MD |
4015 | int grow_amount; |
4016 | int rv = KERN_SUCCESS; | |
4017 | int is_procstack; | |
4018 | int use_read_lock = 1; | |
4019 | int count; | |
984263bc | 4020 | |
95270b7e MD |
4021 | /* |
4022 | * Find the vm | |
4023 | */ | |
4024 | lp = curthread->td_lwp; | |
4025 | p = curthread->td_proc; | |
4026 | KKASSERT(lp != NULL); | |
4027 | vm = lp->lwp_vmspace; | |
0caf6628 MD |
4028 | |
4029 | /* | |
4030 | * Growstack is only allowed on the current process. We disallow | |
4031 | * other use cases, e.g. trying to access memory via procfs that | |
4032 | * the stack hasn't grown into. | |
4033 | */ | |
4034 | if (map != &vm->vm_map) { | |
4035 | return KERN_FAILURE; | |
4036 | } | |
95270b7e | 4037 | |
a108bf71 | 4038 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
984263bc MD |
4039 | Retry: |
4040 | if (use_read_lock) | |
4041 | vm_map_lock_read(map); | |
4042 | else | |
4043 | vm_map_lock(map); | |
4044 | ||
47ec0953 MD |
4045 | /* |
4046 | * If addr is already in the entry range, no need to grow. | |
4047 | * prev_entry returns NULL if addr is at the head. | |
4048 | */ | |
984263bc MD |
4049 | if (vm_map_lookup_entry(map, addr, &prev_entry)) |
4050 | goto done; | |
47ec0953 MD |
4051 | if (prev_entry) |
4052 | stack_entry = vm_map_rb_tree_RB_NEXT(prev_entry); | |
4053 | else | |
4054 | stack_entry = RB_MIN(vm_map_rb_tree, &map->rb_root); | |
984263bc | 4055 | |
47ec0953 | 4056 | if (stack_entry == NULL) |
984263bc | 4057 | goto done; |
47ec0953 | 4058 | if (prev_entry == NULL) |
67e7cb85 | 4059 | end = stack_entry->ba.start - stack_entry->aux.avail_ssize; |
984263bc | 4060 | else |
67e7cb85 | 4061 | end = prev_entry->ba.end; |
984263bc | 4062 | |
c809941b MD |
4063 | /* |
4064 | * This next test mimics the old grow function in vm_machdep.c. | |
984263bc MD |
4065 | * It really doesn't quite make sense, but we do it anyway |
4066 | * for compatibility. | |
4067 | * | |
4068 | * If not growable stack, return success. This signals the | |
4069 | * caller to proceed as it normally would with normal vm.
4070 | */ | |
afeabdca | 4071 | if (stack_entry->aux.avail_ssize < 1 || |
67e7cb85 MD |
4072 | addr >= stack_entry->ba.start || |
4073 | addr < stack_entry->ba.start - stack_entry->aux.avail_ssize) { | |
984263bc MD |
4074 | goto done; |
4075 | } | |
4076 | ||
4077 | /* Find the minimum grow amount */ | |
67e7cb85 | 4078 | grow_amount = roundup (stack_entry->ba.start - addr, PAGE_SIZE); |
afeabdca | 4079 | if (grow_amount > stack_entry->aux.avail_ssize) { |
984263bc MD |
4080 | rv = KERN_NO_SPACE; |
4081 | goto done; | |
4082 | } | |
4083 | ||
c809941b MD |
4084 | /* |
4085 | * If there is no longer enough space between the entries,
984263bc MD |
4086 | * no go; adjust the available space. Note: this
4087 | * should only happen if the user has mapped into the | |
4088 | * stack area after the stack was created, and is | |
4089 | * probably an error. | |
4090 | * | |
4091 | * This also effectively destroys any guard page the user | |
4092 | * might have intended by limiting the stack size. | |
4093 | */ | |
67e7cb85 | 4094 | if (grow_amount > stack_entry->ba.start - end) { |
984263bc | 4095 | if (use_read_lock && vm_map_lock_upgrade(map)) { |
aacb506b | 4096 | /* lost lock */ |
984263bc MD |
4097 | use_read_lock = 0; |
4098 | goto Retry; | |
4099 | } | |
4100 | use_read_lock = 0; | |
67e7cb85 | 4101 | stack_entry->aux.avail_ssize = stack_entry->ba.start - end; |
984263bc MD |
4102 | rv = KERN_NO_SPACE; |
4103 | goto done; | |
4104 | } | |
4105 | ||
4106 | is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr; | |
4107 | ||
4108 | /* If this is the main process stack, see if we're over the | |
4109 | * stack limit. | |
4110 | */ | |
4b566556 | 4111 | if (is_procstack && (vm->vm_ssize + grow_amount > |
984263bc MD |
4112 | p->p_rlimit[RLIMIT_STACK].rlim_cur)) { |
4113 | rv = KERN_NO_SPACE; | |
4114 | goto done; | |
4115 | } | |
4116 | ||
4117 | /* Round up the grow amount modulo SGROWSIZ */ | |
4118 | grow_amount = roundup (grow_amount, sgrowsiz); | |
afeabdca MD |
4119 | if (grow_amount > stack_entry->aux.avail_ssize) { |
4120 | grow_amount = stack_entry->aux.avail_ssize; | |
984263bc | 4121 | } |
4b566556 | 4122 | if (is_procstack && (vm->vm_ssize + grow_amount > |
984263bc | 4123 | p->p_rlimit[RLIMIT_STACK].rlim_cur)) { |
4b566556 | 4124 | grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize; |
984263bc MD |
4125 | } |
4126 | ||
4127 | /* If we would blow our VMEM resource limit, no go */ | |
be77b5f9 | 4128 | if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) { |
984263bc MD |
4129 | rv = KERN_NO_SPACE; |
4130 | goto done; | |
4131 | } | |
4132 | ||
4133 | if (use_read_lock && vm_map_lock_upgrade(map)) { | |
aacb506b | 4134 | /* lost lock */ |
984263bc MD |
4135 | use_read_lock = 0; |
4136 | goto Retry; | |
4137 | } | |
4138 | use_read_lock = 0; | |
4139 | ||
4140 | /* Get the preliminary new entry start value */ | |
67e7cb85 | 4141 | addr = stack_entry->ba.start - grow_amount; |
984263bc MD |
4142 | |
4143 | /* If this puts us into the previous entry, cut back our growth | |
4144 | * to the available space. Also, see the note above. | |
4145 | */ | |
4146 | if (addr < end) { | |
67e7cb85 | 4147 | stack_entry->aux.avail_ssize = stack_entry->ba.start - end; |
984263bc MD |
4148 | addr = end; |
4149 | } | |
4150 | ||
64b5a8a5 MD |
4151 | rv = vm_map_insert(map, &count, |
4152 | NULL, NULL, | |
4153 | 0, NULL, | |
4154 | addr, stack_entry->ba.start, | |
1b874851 | 4155 | VM_MAPTYPE_NORMAL, |
3091de50 | 4156 | VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0); |
984263bc MD |
4157 | |
4158 | /* Adjust the available stack space by the amount we grew. */ | |
4159 | if (rv == KERN_SUCCESS) { | |
47ec0953 | 4160 | if (prev_entry) { |
a108bf71 | 4161 | vm_map_clip_end(map, prev_entry, addr, &count); |
47ec0953 MD |
4162 | next = vm_map_rb_tree_RB_NEXT(prev_entry); |
4163 | } else { | |
4164 | next = RB_MIN(vm_map_rb_tree, &map->rb_root); | |
4165 | } | |
67e7cb85 MD |
4166 | if (next->ba.end != stack_entry->ba.start || |
4167 | next->ba.start != addr) { | |
984263bc | 4168 | panic ("Bad stack grow start/end in new stack entry"); |
47ec0953 MD |
4169 | } else { |
4170 | next->aux.avail_ssize = | |
afeabdca | 4171 | stack_entry->aux.avail_ssize - |
67e7cb85 | 4172 | (next->ba.end - next->ba.start); |
4b566556 | 4173 | if (is_procstack) { |
67e7cb85 MD |
4174 | vm->vm_ssize += next->ba.end - |
4175 | next->ba.start; | |
4b566556 | 4176 | } |
984263bc | 4177 | } |
7c553423 | 4178 | |
949c56f8 MD |
4179 | if (map->flags & MAP_WIREFUTURE) { |
4180 | vm_map_user_wiring(map, | |
4181 | next->ba.start, | |
4182 | next->ba.end, | |
4183 | FALSE); | |
4184 | } | |
984263bc MD |
4185 | } |
4186 | ||
4187 | done: | |
4188 | if (use_read_lock) | |
4189 | vm_map_unlock_read(map); | |
4190 | else | |
4191 | vm_map_unlock(map); | |
a108bf71 | 4192 | vm_map_entry_release(count); |
984263bc MD |
4193 | return (rv); |
4194 | } | |
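/*
 * Illustrative sketch, not part of the kernel: a fault path that
 * misses just below a stack entry would conceptually do
 *
 *	if (vm_map_growstack(map, fault_addr) != KERN_SUCCESS)
 *		return (KERN_FAILURE);
 *	... retry the lookup; the stack may now cover fault_addr ...
 *
 * keeping in mind the convention above that KERN_SUCCESS is also
 * returned when fault_addr lies outside any growable stack range.
 */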
4195 | ||
4196 | /* | |
4197 | * Unshare the specified VM space for exec. If other processes are | |
4198 | * mapped to it, then create a new one. The new vmspace is null. | |
46754a20 MD |
4199 | * |
4200 | * No requirements. | |
984263bc | 4201 | */ |
984263bc | 4202 | void |
29802dbb | 4203 | vmspace_exec(struct proc *p, struct vmspace *vmcopy) |
a108bf71 | 4204 | { |
984263bc MD |
4205 | struct vmspace *oldvmspace = p->p_vmspace; |
4206 | struct vmspace *newvmspace; | |
4207 | vm_map_t map = &p->p_vmspace->vm_map; | |
4208 | ||
29802dbb MD |
4209 | /* |
4210 | * If we are execing a resident vmspace we fork it, otherwise | |
7adb15b6 SW |
4211 | * we create a new vmspace. Note that exitingcnt is not |
4212 | * copied to the new vmspace. | |
29802dbb | 4213 | */ |
b12defdc | 4214 | lwkt_gettoken(&oldvmspace->vm_map.token); |
29802dbb | 4215 | if (vmcopy) { |
4aa6d05c | 4216 | newvmspace = vmspace_fork(vmcopy, NULL, NULL); |
b12defdc | 4217 | lwkt_gettoken(&newvmspace->vm_map.token); |
29802dbb | 4218 | } else { |
47ec0953 | 4219 | newvmspace = vmspace_alloc(vm_map_min(map), vm_map_max(map)); |
b12defdc | 4220 | lwkt_gettoken(&newvmspace->vm_map.token); |
46754a20 MD |
4221 | bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy, |
4222 | (caddr_t)&oldvmspace->vm_endcopy - | |
4223 | (caddr_t)&oldvmspace->vm_startcopy); | |
29802dbb MD |
4224 | } |
4225 | ||
984263bc | 4226 | /* |
e3161323 MD |
4227 | * Finish initializing the vmspace before assigning it |
4228 | * to the process. The vmspace will become the current vmspace | |
4229 | * if p == curproc. | |
984263bc | 4230 | */ |
984263bc | 4231 | pmap_pinit2(vmspace_pmap(newvmspace)); |
e3161323 | 4232 | pmap_replacevm(p, newvmspace, 0); |
b12defdc MD |
4233 | lwkt_reltoken(&newvmspace->vm_map.token); |
4234 | lwkt_reltoken(&oldvmspace->vm_map.token); | |
93f86408 | 4235 | vmspace_rel(oldvmspace); |
984263bc MD |
4236 | } |
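/*
 * (Editor's note: hypothetical usage sketch, not part of this file;
 * assumes a struct proc *p from an exec-like path.)
 */
#if 0
	vmspace_exec(p, NULL);	/* p is switched to a fresh, empty vmspace */
#endif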
4237 | ||
4238 | /* | |
4239 | * Unshare the specified VM space for forcing COW. This | |
4240 | * is called by rfork, for the (RFMEM|RFPROC) == 0 case. | |
4241 | */ | |
984263bc | 4242 | void |
a108bf71 MD |
4243 | vmspace_unshare(struct proc *p) |
4244 | { | |
984263bc MD |
4245 | struct vmspace *oldvmspace = p->p_vmspace; |
4246 | struct vmspace *newvmspace; | |
4247 | ||
b12defdc | 4248 | lwkt_gettoken(&oldvmspace->vm_map.token); |
93f86408 | 4249 | if (vmspace_getrefs(oldvmspace) == 1) { |
b12defdc | 4250 | lwkt_reltoken(&oldvmspace->vm_map.token); |
984263bc | 4251 | return; |
b12defdc | 4252 | } |
4aa6d05c | 4253 | newvmspace = vmspace_fork(oldvmspace, NULL, NULL); |
b12defdc | 4254 | lwkt_gettoken(&newvmspace->vm_map.token); |
984263bc | 4255 | pmap_pinit2(vmspace_pmap(newvmspace)); |
e3161323 | 4256 | pmap_replacevm(p, newvmspace, 0); |
b12defdc MD |
4257 | lwkt_reltoken(&newvmspace->vm_map.token); |
4258 | lwkt_reltoken(&oldvmspace->vm_map.token); | |
93f86408 | 4259 | vmspace_rel(oldvmspace); |
984263bc | 4260 | } |
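/*
 * (Editor's note: hypothetical usage sketch mirroring the comment
 * above; 'flags' and 'p' stand in for rfork() argument/state.)
 */
#if 0
	if ((flags & (RFMEM | RFPROC)) == 0)
		vmspace_unshare(p);	/* force a private COW copy if shared */
#endif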
984263bc | 4261 | |
911e30e2 AH |
4262 | /* |
4263 | * vm_map_hint: return the beginning of the best area suitable for | |
4264 | * creating a new mapping with "prot" protection. | |
4265 | * | |
4266 | * No requirements. | |
4267 | */ | |
4268 | vm_offset_t | |
4269 | vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot) | |
4270 | { | |
4271 | struct vmspace *vms = p->p_vmspace; | |
4b566556 MD |
4272 | struct rlimit limit; |
4273 | rlim_t dsiz; | |
4274 | ||
4275 | /* | |
4276 | * Acquire the datasize limit for the mmap() operation,
4277 | * falling back to maxdsiz if the limit cannot be obtained.
4278 | */ | |
4279 | if (kern_getrlimit(RLIMIT_DATA, &limit)) | |
4280 | limit.rlim_cur = maxdsiz; | |
4281 | dsiz = limit.rlim_cur; | |
911e30e2 | 4282 | |
d9c783bb | 4283 | if (!randomize_mmap || addr != 0) { |
911e30e2 AH |
4284 | /* |
4285 | * Set a reasonable start point for the hint if it was | |
4286 | * not specified or if it falls within the heap space. | |
4287 | * Hinted mmap()s do not allocate out of the heap space. | |
4288 | */ | |
4289 | if (addr == 0 || | |
4290 | (addr >= round_page((vm_offset_t)vms->vm_taddr) && | |
4b566556 MD |
4291 | addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) { |
4292 | addr = round_page((vm_offset_t)vms->vm_daddr + dsiz); | |
911e30e2 AH |
4293 | } |
4294 | ||
4295 | return addr; | |
4296 | } | |
911e30e2 | 4297 | |
4b566556 MD |
4298 | /* |
4299 | * randomize_mmap && addr == 0. For now randomize the | |
4300 | * address within a dsiz range beyond the data limit. | |
4301 | */ | |
4302 | addr = (vm_offset_t)vms->vm_daddr + dsiz; | |
4303 | if (dsiz) | |
4304 | addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz; | |
911e30e2 AH |
4305 | return (round_page(addr)); |
4306 | } | |
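/*
 * (Editor's worked example, illustrative numbers only.)  With vm_daddr
 * at 0x10000000 and a data limit dsiz of 0x20000000 (512MB), the
 * randomized hint is drawn from [0x30000000, 0x50000000):
 *
 *	addr = 0x10000000 + 0x20000000
 *	     + (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % 0x20000000;
 *
 * and is then rounded up to a page boundary by round_page().
 */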
4307 | ||
984263bc | 4308 | /* |
46754a20 MD |
4309 | * Finds the VM object, offset, and protection for a given virtual address |
4310 | * in the specified map, assuming a page fault of the type specified. | |
984263bc | 4311 | * |
46754a20 MD |
4312 | * Leaves the map in question locked for read; return values are guaranteed |
4313 | * until a vm_map_lookup_done call is performed. Note that the map argument | |
4314 | * is in/out; the returned map must be used in the call to vm_map_lookup_done. | |
984263bc | 4315 | * |
46754a20 MD |
4316 | * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
4317 | * that call fast.
984263bc | 4318 | * |
46754a20 MD |
4319 | * If a lookup is requested with "write protection" specified, the map may |
4320 | * be changed to perform virtual copying operations, although the data | |
4321 | * referenced will remain the same. | |
984263bc | 4322 | * |
46754a20 | 4323 | * No requirements. |
984263bc MD |
4324 | */ |
4325 | int | |
4326 | vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ | |
4327 | vm_offset_t vaddr, | |
4328 | vm_prot_t fault_typea, | |
4329 | vm_map_entry_t *out_entry, /* OUT */ | |
44293a80 | 4330 | struct vm_map_backing **bap, /* OUT */ |
984263bc | 4331 | vm_pindex_t *pindex, /* OUT */ |
01251219 | 4332 | vm_pindex_t *pcount, /* OUT */ |
984263bc | 4333 | vm_prot_t *out_prot, /* OUT */ |
7a45978d | 4334 | int *wflags) /* OUT */ |
984263bc MD |
4335 | { |
4336 | vm_map_entry_t entry; | |
4337 | vm_map_t map = *var_map; | |
4338 | vm_prot_t prot; | |
4339 | vm_prot_t fault_type = fault_typea; | |
4340 | int use_read_lock = 1; | |
4341 | int rv = KERN_SUCCESS; | |
ce5d7a1c | 4342 | int count; |
adbd6814 | 4343 | thread_t td = curthread; |
984263bc | 4344 | |
adbd6814 MD |
4345 | /* |
4346 | * vm_map_entry_reserve() implements an important mitigation | |
4347 | * against mmap() spam running the kernel out of vm_map_entry
4348 | * structures, but it can also cause an infinite call recursion. | |
4349 | * Use td_nest_count to prevent an infinite recursion (allows | |
4350 | * the vm_map code to dig into the pcpu vm_map_entry reserve). | |
4351 | */ | |
ce5d7a1c | 4352 | count = 0; |
adbd6814 MD |
4353 | if (td->td_nest_count == 0) { |
4354 | ++td->td_nest_count; | |
ce5d7a1c | 4355 | count = vm_map_entry_reserve(MAP_RESERVE_COUNT); |
adbd6814 MD |
4356 | --td->td_nest_count; |
4357 | } | |
984263bc MD |
4358 | RetryLookup: |
4359 | if (use_read_lock) | |
4360 | vm_map_lock_read(map); | |
4361 | else | |
4362 | vm_map_lock(map); | |
4363 | ||
4364 | /* | |
e6b81333 MD |
4365 | * Always do a full lookup. The hint doesn't get us much anymore |
4366 | * now that the map is RB'd. | |
984263bc | 4367 | */ |
aacb506b | 4368 | cpu_ccfence(); |
47ec0953 | 4369 | *out_entry = NULL; |
9de48ead | 4370 | *bap = NULL; |
984263bc | 4371 | |
e6b81333 | 4372 | { |
984263bc MD |
4373 | vm_map_entry_t tmp_entry; |
4374 | ||
984263bc MD |
4375 | if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) { |
4376 | rv = KERN_INVALID_ADDRESS; | |
4377 | goto done; | |
4378 | } | |
984263bc MD |
4379 | entry = tmp_entry; |
4380 | *out_entry = entry; | |
4381 | } | |
4382 | ||
4383 | /* | |
4384 | * Handle submaps. | |
4385 | */ | |
1b874851 | 4386 | if (entry->maptype == VM_MAPTYPE_SUBMAP) { |
984263bc MD |
4387 | vm_map_t old_map = map; |
4388 | ||
9de48ead | 4389 | *var_map = map = entry->ba.sub_map; |
984263bc MD |
4390 | if (use_read_lock) |
4391 | vm_map_unlock_read(old_map); | |
4392 | else | |
4393 | vm_map_unlock(old_map); | |
4394 | use_read_lock = 1; | |
4395 | goto RetryLookup; | |
4396 | } | |
4397 | ||
4398 | /* | |
4399 | * Check whether this task is allowed to have this page. | |
62cc5940 MD |
4400 | * Note the special case for MAP_ENTRY_COW pages with an override. |
4401 | * This is to implement a forced COW for debuggers. | |
984263bc | 4402 | */ |
984263bc MD |
4403 | if (fault_type & VM_PROT_OVERRIDE_WRITE) |
4404 | prot = entry->max_protection; | |
4405 | else | |
4406 | prot = entry->protection; | |
4407 | ||
4408 | fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); | |
4409 | if ((fault_type & prot) != fault_type) { | |
4410 | rv = KERN_PROTECTION_FAILURE; | |
4411 | goto done; | |
4412 | } | |
4413 | ||
4414 | if ((entry->eflags & MAP_ENTRY_USER_WIRED) && | |
4415 | (entry->eflags & MAP_ENTRY_COW) && | |
4416 | (fault_type & VM_PROT_WRITE) && | |
4417 | (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) { | |
4418 | rv = KERN_PROTECTION_FAILURE; | |
4419 | goto done; | |
4420 | } | |
4421 | ||
4422 | /* | |
c936cb6f MD |
4423 | * Flag regular pages that are supposed to be wired. Remove prior |
4424 | * semantics that disallowed protection changes for such pages. | |
4425 | * | |
4426 | * The prior semantics are not used by modern systems. Applications | |
4427 | * do not assume an inability to change protection modes and may | |
4428 | * operate incorrectly if we try to prevent protection changes. | |
4429 | * | |
4430 | * Modern applications are aware that even for locked memory, | |
4431 | * changing protection modes, modifying MAP_PRIVATE mappings, | |
4432 | * or fork() may still cause page faults on the locked memory. | |
984263bc | 4433 | */ |
7a45978d MD |
4434 | *wflags = 0; |
4435 | if (entry->wired_count) { | |
4436 | *wflags |= FW_WIRED; | |
c936cb6f | 4437 | #if 0 |
984263bc | 4438 | prot = fault_type = entry->protection; |
c936cb6f | 4439 | #endif |
7a45978d | 4440 | } |
984263bc | 4441 | |
a86ce0cd MD |
4442 | if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace && |
4443 | pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) { | |
4444 | if ((prot & VM_PROT_WRITE) == 0) | |
4445 | fault_type |= VM_PROT_WRITE; | |
4446 | } | |
4447 | ||
0adbcbd6 | 4448 | /* |
4d4f84f5 | 4449 | * Only NORMAL maps are object-based. UKSMAPs are not. |
0adbcbd6 | 4450 | */ |
4d4f84f5 | 4451 | if (entry->maptype != VM_MAPTYPE_NORMAL) { |
9de48ead | 4452 | *bap = NULL; |
0adbcbd6 MD |
4453 | goto skip; |
4454 | } | |
4455 | ||
568e6804 MD |
4456 | /* |
4457 | * If the entry was copy-on-write, we either copy it now or demote the permissions:
4458 | */ | |
984263bc MD |
4459 | if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { |
4460 | /* | |
4461 | * If we want to write the page, we may as well handle that | |
4462 | * now since we've got the map locked. | |
4463 | * | |
4464 | * If we don't need to write the page, we just demote the | |
4465 | * permissions allowed. | |
4466 | */ | |
984263bc | 4467 | if (fault_type & VM_PROT_WRITE) { |
efad0641 CR |
4468 | /* |
4469 | * Not allowed if TDF_NOFAULT is set as the shadowing | |
4470 | * operation can deadlock against the faulting | |
4471 | * function due to the copy-on-write. | |
4472 | */ | |
4473 | if (curthread->td_flags & TDF_NOFAULT) { | |
4474 | rv = KERN_FAILURE_NOFAULT; | |
4475 | goto done; | |
4476 | } | |
4477 | ||
984263bc | 4478 | /* |
9de48ead MD |
4479 | * Make a new vm_map_backing + object, and place it |
4480 | * in the object chain. Note that no new references | |
4481 | * have appeared -- one just moved from the map to | |
4482 | * the new object. | |
984263bc | 4483 | */ |
984263bc | 4484 | if (use_read_lock && vm_map_lock_upgrade(map)) { |
aacb506b | 4485 | /* lost lock */ |
984263bc MD |
4486 | use_read_lock = 0; |
4487 | goto RetryLookup; | |
4488 | } | |
4489 | use_read_lock = 0; | |
5b329e62 | 4490 | vm_map_entry_shadow(entry); |
7a45978d | 4491 | *wflags |= FW_DIDCOW; |
984263bc MD |
4492 | } else { |
4493 | /* | |
4494 | * We're attempting to read a copy-on-write page -- | |
4495 | * don't allow writes. | |
4496 | */ | |
984263bc MD |
4497 | prot &= ~VM_PROT_WRITE; |
4498 | } | |
4499 | } | |
4500 | ||
4501 | /* | |
ce5d7a1c MD |
4502 | * Create an object if necessary. This code also handles |
4503 | * partitioning large entries to improve vm_fault performance. | |
984263bc | 4504 | */ |
9de48ead | 4505 | if (entry->ba.object == NULL && !map->system_map) { |
984263bc | 4506 | if (use_read_lock && vm_map_lock_upgrade(map)) { |
aacb506b | 4507 | /* lost lock */ |
984263bc MD |
4508 | use_read_lock = 0; |
4509 | goto RetryLookup; | |
4510 | } | |
4511 | use_read_lock = 0; | |
ce5d7a1c MD |
4512 | |
4513 | /* | |
4514 | * Partition large entries, giving each its own VM object, | |
4515 | * to improve concurrent fault performance. This is only | |
4516 | * applicable to userspace. | |
4517 | */ | |
1eeaf6b2 | 4518 | if (map != kernel_map && |
ce5d7a1c | 4519 | entry->maptype == VM_MAPTYPE_NORMAL && |
67e7cb85 MD |
4520 | ((entry->ba.start ^ entry->ba.end) & |
4521 | ~MAP_ENTRY_PARTITION_MASK) && | |
641f3b0a | 4522 | vm_map_partition_enable) { |
ce5d7a1c MD |
4523 | if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { |
4524 | entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; | |
4525 | ++mycpu->gd_cnt.v_intrans_coll; | |
4526 | ++mycpu->gd_cnt.v_intrans_wait; | |
641f3b0a | 4527 | vm_map_transition_wait(map, 0); |
ce5d7a1c MD |
4528 | goto RetryLookup; |
4529 | } | |
4530 | vm_map_entry_partition(map, entry, vaddr, &count); | |
4531 | } | |
53025830 | 4532 | vm_map_entry_allocate_object(entry); |
984263bc MD |
4533 | } |
4534 | ||
4535 | /* | |
4536 | * Return the object/offset from this entry. If the entry was | |
4537 | * copy-on-write or empty, it has been fixed up. | |
4538 | */ | |
9de48ead | 4539 | *bap = &entry->ba; |
984263bc | 4540 | |
0adbcbd6 | 4541 | skip: |
67e7cb85 | 4542 | *pindex = OFF_TO_IDX((vaddr - entry->ba.start) + entry->ba.offset); |
01251219 | 4543 | *pcount = OFF_TO_IDX(entry->ba.end - trunc_page(vaddr)); |
984263bc MD |
4544 | |
4545 | /* | |
4546 | * Return whether this is the only map sharing this data. On | |
4547 | * success we return with a read lock held on the map. On failure | |
4548 | * we return with the map unlocked. | |
4549 | */ | |
4550 | *out_prot = prot; | |
4551 | done: | |
4552 | if (rv == KERN_SUCCESS) { | |
4553 | if (use_read_lock == 0) | |
4554 | vm_map_lock_downgrade(map); | |
4555 | } else if (use_read_lock) { | |
4556 | vm_map_unlock_read(map); | |
4557 | } else { | |
4558 | vm_map_unlock(map); | |
4559 | } | |
adbd6814 | 4560 | if (count > 0) |
ce5d7a1c MD |
4561 | vm_map_entry_release(count); |
4562 | ||
984263bc MD |
4563 | return (rv); |
4564 | } | |
4565 | ||
4566 | /* | |
46754a20 MD |
4567 | * Releases locks acquired by a vm_map_lookup() |
4568 | * (according to the handle returned by that lookup). | |
984263bc | 4569 | * |
46754a20 | 4570 | * No other requirements. |
984263bc | 4571 | */ |
984263bc | 4572 | void |
a108bf71 | 4573 | vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count) |
984263bc MD |
4574 | { |
4575 | /* | |
4576 | * Unlock the main-level map | |
4577 | */ | |
984263bc | 4578 | vm_map_unlock_read(map); |
a108bf71 MD |
4579 | if (count) |
4580 | vm_map_entry_release(count); | |
984263bc MD |
4581 | } |
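/*
 * (Editor's note: hypothetical usage sketch; the primary real caller is
 * the fault code.  'p' and 'vaddr' are assumed.)  A successful
 * vm_map_lookup() returns with the map read-locked, so it must be
 * paired with vm_map_lookup_done(), and the done call must use the map
 * pointer returned through var_map, which may have become a submap:
 */
#if 0
	vm_map_t map = &p->p_vmspace->vm_map;	/* in/out parameter */
	vm_map_entry_t entry;
	struct vm_map_backing *ba;
	vm_pindex_t pindex, pcount;
	vm_prot_t prot;
	int wflags;

	if (vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &ba,
			  &pindex, &pcount, &prot, &wflags) == KERN_SUCCESS) {
		/* ... resolve the fault via ba->object and pindex ... */
		vm_map_lookup_done(map, entry, 0);
	}
#endif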
4582 | ||
ce5d7a1c MD |
4583 | static void |
4584 | vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry, | |
4585 | vm_offset_t vaddr, int *countp) | |
4586 | { | |
4587 | vaddr &= ~MAP_ENTRY_PARTITION_MASK; | |
4588 | vm_map_clip_start(map, entry, vaddr, countp); | |
4589 | vaddr += MAP_ENTRY_PARTITION_SIZE; | |
4590 | vm_map_clip_end(map, entry, vaddr, countp); | |
4591 | } | |
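/*
 * (Editor's note.)  The two clips above carve out the aligned
 * partition containing vaddr.  With MAP_ENTRY_PARTITION_SIZE = S
 * (MAP_ENTRY_PARTITION_MASK = S - 1):
 *
 *	base = vaddr & ~(S - 1);
 *	entry is clipped to [base, base + S)
 *
 * so a huge anonymous entry is split into fixed-size pieces, each of
 * which can take its own VM object and fault concurrently.
 */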
4592 | ||
fc531fbc MD |
4593 | /* |
4594 | * Quick hack, needs some help to make it more SMP friendly. | |
4595 | */ | |
4596 | void | |
4597 | vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock, | |
4598 | vm_offset_t ran_beg, vm_offset_t ran_end) | |
4599 | { | |
4600 | struct vm_map_ilock *scan; | |
4601 | ||
4602 | ilock->ran_beg = ran_beg; | |
4603 | ilock->ran_end = ran_end; | |
4604 | ilock->flags = 0; | |
4605 | ||
4606 | spin_lock(&map->ilock_spin); | |
4607 | restart: | |
4608 | for (scan = map->ilock_base; scan; scan = scan->next) { | |
4609 | if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) { | |
4610 | scan->flags |= ILOCK_WAITING; | |
4611 | ssleep(scan, &map->ilock_spin, 0, "ilock", 0); | |
4612 | goto restart; | |
4613 | } | |
4614 | } | |
4615 | ilock->next = map->ilock_base; | |
4616 | map->ilock_base = ilock; | |
4617 | spin_unlock(&map->ilock_spin); | |
4618 | } | |
4619 | ||
4620 | void | |
4621 | vm_map_deinterlock(vm_map_t map, struct vm_map_ilock *ilock) | |
4622 | { | |
4623 | struct vm_map_ilock *scan; | |
4624 | struct vm_map_ilock **scanp; | |
4625 | ||
4626 | spin_lock(&map->ilock_spin); | |
4627 | scanp = &map->ilock_base; | |
4628 | while ((scan = *scanp) != NULL) { | |
4629 | if (scan == ilock) { | |
4630 | *scanp = ilock->next; | |
4631 | spin_unlock(&map->ilock_spin); | |
4632 | if (ilock->flags & ILOCK_WAITING) | |
4633 | wakeup(ilock); | |
4634 | return; | |
4635 | } | |
4636 | scanp = &scan->next; | |
4637 | } | |
4638 | spin_unlock(&map->ilock_spin); | |
4639 | panic("vm_map_deinterlock: missing ilock!"); | |
4640 | } | |
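/*
 * (Editor's note: hypothetical usage sketch.)  The ilock acts as an
 * exclusive range lock over the half-open range [ran_beg, ran_end);
 * an overlapping interlocker sleeps until the holder deinterlocks:
 */
#if 0
	struct vm_map_ilock ilock;

	vm_map_interlock(map, &ilock, start, end);	/* blocks on overlap */
	/* ... operate on [start, end) ... */
	vm_map_deinterlock(map, &ilock);		/* wakes any waiters */
#endif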
4641 | ||
984263bc MD |
4642 | #include "opt_ddb.h" |
4643 | #ifdef DDB | |
984263bc MD |
4644 | #include <ddb/ddb.h> |
4645 | ||
4646 | /* | |
46754a20 | 4647 | * Debugging only |
984263bc MD |
4648 | */ |
4649 | DB_SHOW_COMMAND(map, vm_map_print) | |
4650 | { | |
4651 | static int nlines; | |
4652 | /* XXX convert args. */ | |
4653 | vm_map_t map = (vm_map_t)addr; | |
4654 | boolean_t full = have_addr; | |
4655 | ||
4656 | vm_map_entry_t entry; | |
4657 | ||
4658 | db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", | |
4659 | (void *)map, | |
4660 | (void *)map->pmap, map->nentries, map->timestamp); | |
4661 | nlines++; | |
4662 | ||
4663 | if (!full && db_indent) | |
4664 | return; | |
4665 | ||
4666 | db_indent += 2; | |
47ec0953 | 4667 | RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) { |
984263bc | 4668 | db_iprintf("map entry %p: start=%p, end=%p\n", |
67e7cb85 MD |
4669 | (void *)entry, |
4670 | (void *)entry->ba.start, (void *)entry->ba.end); | |
984263bc MD |
4671 | nlines++; |
4672 | { | |
4673 | static char *inheritance_name[4] = | |
4674 | {"share", "copy", "none", "donate_copy"}; | |
4675 | ||
4676 | db_iprintf(" prot=%x/%x/%s", | |
4677 | entry->protection, | |
4678 | entry->max_protection, | |
62cc5940 MD |
4679 | inheritance_name[(int)(unsigned char) |
4680 | entry->inheritance]); | |
984263bc MD |
4681 | if (entry->wired_count != 0) |
4682 | db_printf(", wired"); | |
4683 | } | |
0adbcbd6 MD |
4684 | switch(entry->maptype) { |
4685 | case VM_MAPTYPE_SUBMAP: | |
9de48ead | 4686 | /* XXX no %qd in kernel. Truncate entry->ba.offset. */ |
984263bc | 4687 | db_printf(", share=%p, offset=0x%lx\n", |
9de48ead MD |
4688 | (void *)entry->ba.sub_map, |
4689 | (long)entry->ba.offset); | |
984263bc | 4690 | nlines++; |
47ec0953 MD |
4691 | |
4692 | db_indent += 2; | |
9de48ead | 4693 | vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map, |
47ec0953 MD |
4694 | full, 0, NULL); |
4695 | db_indent -= 2; | |
0adbcbd6 MD |
4696 | break; |
4697 | case VM_MAPTYPE_NORMAL: | |
9de48ead | 4698 | /* XXX no %qd in kernel. Truncate entry->ba.offset. */ |
984263bc | 4699 | db_printf(", object=%p, offset=0x%lx", |
9de48ead MD |
4700 | (void *)entry->ba.object, |
4701 | (long)entry->ba.offset); | |
984263bc MD |
4702 | if (entry->eflags & MAP_ENTRY_COW) |
4703 | db_printf(", copy (%s)", | |
4d4f84f5 MD |
4704 | ((entry->eflags & MAP_ENTRY_NEEDS_COPY) ? |
4705 | "needed" : "done")); | |
984263bc MD |
4706 | db_printf("\n"); |
4707 | nlines++; | |
4708 | ||
9de48ead | 4709 | if (entry->ba.object) { |
984263bc MD |
4710 | db_indent += 2; |
4711 | vm_object_print((db_expr_t)(intptr_t) | |
9de48ead | 4712 | entry->ba.object, |
60233e58 | 4713 | full, 0, NULL); |
984263bc MD |
4714 | nlines += 4; |
4715 | db_indent -= 2; | |
4716 | } | |
0adbcbd6 MD |
4717 | break; |
4718 | case VM_MAPTYPE_UKSMAP: | |
4719 | db_printf(", uksmap=%p, offset=0x%lx", | |
9de48ead MD |
4720 | (void *)entry->ba.uksmap, |
4721 | (long)entry->ba.offset); | |
0adbcbd6 MD |
4722 | if (entry->eflags & MAP_ENTRY_COW) |
4723 | db_printf(", copy (%s)", | |
4724 | (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); | |
4725 | db_printf("\n"); | |
4726 | nlines++; | |
4727 | break; | |
4728 | default: | |
4729 | break; | |
984263bc MD |
4730 | } |
4731 | } | |
4732 | db_indent -= 2; | |
4733 | if (db_indent == 0) | |
4734 | nlines = 0; | |
4735 | } | |
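/*
 * (Editor's note.)  With DDB compiled in, the command above is invoked
 * from the debugger prompt as:
 *
 *	db> show map <map-address>
 */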
4736 | ||
46754a20 MD |
4737 | /* |
4738 | * Debugging only | |
4739 | */ | |
984263bc MD |
4740 | DB_SHOW_COMMAND(procvm, procvm) |
4741 | { | |
4742 | struct proc *p; | |
4743 | ||
4744 | if (have_addr) { | |
4745 | p = (struct proc *) addr; | |
4746 | } else { | |
4747 | p = curproc; | |
4748 | } | |
4749 | ||
4750 | db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", | |
4751 | (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, | |
4752 | (void *)vmspace_pmap(p->p_vmspace)); | |
4753 | ||
4754 | vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL); | |
4755 | } | |
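/*
 * (Editor's note.)  Likewise invoked as "db> show procvm
 * [<proc-address>]"; with no address it dumps curproc's vmspace,
 * map, and pmap.
 */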
4756 | ||
4757 | #endif /* DDB */ |