/*
 * Copyright (c) 2003-2022 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * ---
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * ---
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Page fault handling module.
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/vkernel.h>
#include <sys/lock.h>
#include <sys/sysctl.h>

#include <cpu/lwbuf.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <vm/vm_page2.h>

#define VM_FAULT_MAX_QUICK	16

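/*
 * faultstate carries the state of a single fault across the RetryFault
 * retry loop.  mary[] holds the page(s) resolved by the fault (more than
 * one entry only when the lockless bypass path bursts in extra pages),
 * while ba/first_ba track our position in the vm_map_backing chain.
 */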
struct faultstate {
        vm_page_t mary[VM_FAULT_MAX_QUICK];
        vm_map_backing_t ba;
        vm_prot_t prot;
        vm_page_t first_m;
        vm_map_backing_t first_ba;
        vm_prot_t first_prot;
        vm_map_t map;
        vm_map_entry_t entry;
        int lookup_still_valid;	/* 0=inv 1=valid/rel -1=valid/atomic */
        int hardfault;
        int fault_flags;
        int shared;
        int msoftonly;
        int first_shared;
        int wflags;
        int first_ba_held;	/* 0=unlocked 1=locked/rel -1=lock/atomic */
        struct vnode *vp;
};

__read_mostly static int debug_fault = 0;
SYSCTL_INT(_vm, OID_AUTO, debug_fault, CTLFLAG_RW, &debug_fault, 0, "");
__read_mostly static int debug_cluster = 0;
SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
#if 0
static int virtual_copy_enable = 1;
SYSCTL_INT(_vm, OID_AUTO, virtual_copy_enable, CTLFLAG_RW,
           &virtual_copy_enable, 0, "");
#endif
__read_mostly int vm_shared_fault = 1;
TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW,
           &vm_shared_fault, 0, "Allow shared token on vm_object");
__read_mostly static int vm_fault_bypass_count = 1;
TUNABLE_INT("vm.fault_bypass", &vm_fault_bypass_count);
SYSCTL_INT(_vm, OID_AUTO, fault_bypass, CTLFLAG_RW,
           &vm_fault_bypass_count, 0, "Allow fast vm_fault shortcut");

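/*
 * Illustrative tuning from userland (assumes the sysctl/tunable names
 * declared above; the values are examples only):
 *
 *	sysctl vm.shared_fault=1	(allow shared object tokens)
 *	sysctl vm.fault_bypass=8	(lockless shortcut may burst up
 *					 to 8 pages per fault)
 */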
/*
 * Define here for debugging ioctls.  Note that these are globals, so
 * they would cause a ton of cache line bouncing.  Only use for debugging
 * purposes.
 */
/*#define VM_FAULT_QUICK_DEBUG */
#ifdef VM_FAULT_QUICK_DEBUG
static long vm_fault_bypass_success_count = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_bypass_success_count, CTLFLAG_RW,
            &vm_fault_bypass_success_count, 0, "");
static long vm_fault_bypass_failure_count1 = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_bypass_failure_count1, CTLFLAG_RW,
            &vm_fault_bypass_failure_count1, 0, "");
static long vm_fault_bypass_failure_count2 = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_bypass_failure_count2, CTLFLAG_RW,
            &vm_fault_bypass_failure_count2, 0, "");
static long vm_fault_bypass_failure_count3 = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_bypass_failure_count3, CTLFLAG_RW,
            &vm_fault_bypass_failure_count3, 0, "");
static long vm_fault_bypass_failure_count4 = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_bypass_failure_count4, CTLFLAG_RW,
            &vm_fault_bypass_failure_count4, 0, "");
#endif

static int vm_fault_bypass(struct faultstate *fs, vm_pindex_t first_pindex,
                        vm_pindex_t first_count, int *mextcountp,
                        vm_prot_t fault_type);
static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t, int);
static void vm_set_nosync(vm_page_t m, vm_map_entry_t entry);
static void vm_prefault(pmap_t pmap, vm_offset_t addra,
                        vm_map_entry_t entry, int prot, int fault_flags);
static void vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
                        vm_map_entry_t entry, int prot, int fault_flags);

static __inline void
release_page(struct faultstate *fs)
{
        vm_page_deactivate(fs->mary[0]);
        vm_page_wakeup(fs->mary[0]);
        fs->mary[0] = NULL;
}

static __inline void
unlock_map(struct faultstate *fs)
{
        if (fs->ba != fs->first_ba)
                vm_object_drop(fs->ba->object);
        if (fs->first_ba && fs->first_ba_held == 1) {
                vm_object_drop(fs->first_ba->object);
                fs->first_ba_held = 0;
                fs->first_ba = NULL;
        }
        fs->ba = NULL;

        /*
         * NOTE: If lookup_still_valid == -1 the map is assumed to be locked
         *	 and caller expects it to remain locked atomically.
         */
        if (fs->lookup_still_valid == 1 && fs->map) {
                vm_map_lookup_done(fs->map, fs->entry, 0);
                fs->lookup_still_valid = 0;
                fs->entry = NULL;
        }
}

/*
 * Clean up after a successful call to vm_fault_object() so another call
 * to vm_fault_object() can be made.
 */
static void
cleanup_fault(struct faultstate *fs)
{
        /*
         * If we allocated a junk page for a COW operation that did
         * not occur, the page must be freed.
         */
        if (fs->ba != fs->first_ba) {
                KKASSERT(fs->first_shared == 0);

                /*
                 * first_m could be completely valid and we got here
                 * because of a PG_RAM, don't mistakenly free it!
                 */
                if ((fs->first_m->valid & VM_PAGE_BITS_ALL) ==
                    VM_PAGE_BITS_ALL) {
                        vm_page_wakeup(fs->first_m);
                } else {
                        vm_page_free(fs->first_m);
                }
                vm_object_pip_wakeup(fs->ba->object);
                fs->first_m = NULL;

                /*
                 * Reset fs->ba without calling unlock_map(), so we need
                 * a little duplication.
                 */
                vm_object_drop(fs->ba->object);
                fs->ba = fs->first_ba;
        }
}

static void
unlock_things(struct faultstate *fs)
{
        cleanup_fault(fs);
        unlock_map(fs);
        if (fs->vp != NULL) {
                vput(fs->vp);
                fs->vp = NULL;
        }
}

#if 0
/*
 * Virtual copy tests.  Used by the fault code to determine if a
 * page can be moved from an orphan vm_object into its shadow
 * instead of copying its contents.
 */
static __inline int
virtual_copy_test(struct faultstate *fs)
{
        /*
         * Must be holding exclusive locks
         */
        if (fs->first_shared || fs->shared || virtual_copy_enable == 0)
                return 0;

        /*
         * Map, if present, has not changed
         */
        if (fs->map && fs->map_generation != fs->map->timestamp)
                return 0;

        /*
         * No refs, except us
         */
        if (fs->ba->object->ref_count != 1)
                return 0;

        /*
         * No one else can look this object up
         */
        if (fs->ba->object->handle != NULL)
                return 0;

        /*
         * No other ways to look the object up
         */
        if (fs->ba->object->type != OBJT_DEFAULT &&
            fs->ba->object->type != OBJT_SWAP)
                return 0;

        /*
         * We don't chase down the shadow chain
         */
        if (fs->ba != fs->first_ba->backing_ba)
                return 0;

        return 1;
}

static __inline int
virtual_copy_ok(struct faultstate *fs)
{
        if (virtual_copy_test(fs)) {
                /*
                 * Grab the lock and re-test changeable items.
                 */
                if (fs->lookup_still_valid == 0 && fs->map) {
                        if (lockmgr(&fs->map->lock, LK_EXCLUSIVE|LK_NOWAIT))
                                return 0;
                        fs->lookup_still_valid = 1;
                        if (virtual_copy_test(fs)) {
                                fs->map_generation = ++fs->map->timestamp;
                                return 1;
                        }
                        fs->lookup_still_valid = 0;
                        lockmgr(&fs->map->lock, LK_RELEASE);
                }
        }
        return 0;
}
#endif

/*
 * TRYPAGER
 *
 * Determine if the pager for the current object *might* contain the page.
 *
 * We only need to try the pager if this is not a default object (default
 * objects are zero-fill and have no real pager), and if we are not taking
 * a wiring fault or if the FS entry is wired.
 */
#define TRYPAGER(fs)	\
                (fs->ba->object->type != OBJT_DEFAULT &&		\
                 (((fs->fault_flags & VM_FAULT_WIRE_MASK) == 0) ||	\
                  (fs->wflags & FW_WIRED)))

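/*
 * For example, a freshly mmap()'d anonymous region is backed by an
 * OBJT_DEFAULT object: TRYPAGER evaluates false and the fault path
 * simply zero-fills, while vnode- or swap-backed objects go to the
 * pager.
 */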
/*
 * vm_fault:
 *
 * Handle a page fault occurring at the given address, requiring the given
 * permissions, in the map specified.  If successful, the page is inserted
 * into the associated physical map.
 *
 * NOTE: The given address should be truncated to the proper page address.
 *
 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
 * a standard error specifying why the fault is fatal is returned.
 *
 * The map in question must be referenced, and remains so.
 * The caller may hold no locks.
 * No other requirements.
 */
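/*
 * Illustrative call (hypothetical trap-handler style usage; the actual
 * caller-side code lives in the platform trap code, not in this file):
 *
 *	rv = vm_fault(&vm->vm_map, trunc_page(va), VM_PROT_READ,
 *		      VM_FAULT_NORMAL);
 *	if (rv != KERN_SUCCESS)
 *		(deliver SIGSEGV/SIGBUS to the faulting lwp)
 */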
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
{
        vm_pindex_t first_pindex;
        vm_pindex_t first_count;
        struct faultstate fs;
        struct lwp *lp;
#if !defined(NO_SWAPPING)
        struct proc *p;
#endif
        thread_t td;
        int mextcount;
        int growstack;
        int retry = 0;
        int inherit_prot;
        int result;
        int n;

        inherit_prot = fault_type & VM_PROT_NOSYNC;
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        fs.vp = NULL;
        fs.shared = vm_shared_fault;
        fs.first_shared = vm_shared_fault;
        growstack = 1;

        /*
         * vm_map interactions
         */
        td = curthread;
        if ((lp = td->td_lwp) != NULL)
                lp->lwp_flags |= LWP_PAGING;

RetryFault:
        /*
         * vm_fault_bypass() can shortcut us.
         */
        fs.msoftonly = 0;
        fs.first_ba_held = 0;
        mextcount = 1;

        /*
         * Find the vm_map_entry representing the backing store and resolve
         * the top level object and page index.  This may have the side
         * effect of executing a copy-on-write on the map entry,
         * creating a shadow object, or splitting an anonymous entry for
         * performance, but will not COW any actual VM pages.
         *
         * On success fs.map is left read-locked and various other fields
         * are initialized but not otherwise referenced or locked.
         *
         * NOTE! vm_map_lookup will try to upgrade the fault_type to
         *	 VM_FAULT_WRITE if the map entry is a virtual page table
         *	 and also writable, so we can set the 'A' (accessed) bit in
         *	 the virtual page table entry.
         */
        fs.map = map;
        result = vm_map_lookup(&fs.map, vaddr, fault_type,
                               &fs.entry, &fs.first_ba,
                               &first_pindex, &first_count,
                               &fs.first_prot, &fs.wflags);

        /*
         * If the lookup failed or the map protections are incompatible,
         * the fault generally fails.
         *
         * The failure could be due to TDF_NOFAULT if vm_map_lookup()
         * tried to do a COW fault.
         *
         * If the caller is trying to do a user wiring we have more work
         * to do.
         */
        if (result != KERN_SUCCESS) {
                if (result == KERN_FAILURE_NOFAULT) {
                        result = KERN_FAILURE;
                        goto done;
                }
                if (result != KERN_PROTECTION_FAILURE ||
                    (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
                {
                        if (result == KERN_INVALID_ADDRESS && growstack &&
                            map != kernel_map && curproc != NULL) {
                                result = vm_map_growstack(map, vaddr);
                                if (result == KERN_SUCCESS) {
                                        growstack = 0;
                                        ++retry;
                                        goto RetryFault;
                                }
                                result = KERN_FAILURE;
                        }
                        goto done;
                }

                /*
                 * If we are user-wiring a r/w segment, and it is COW, then
                 * we need to do the COW operation.  Note that we don't
                 * currently COW RO sections now, because it is NOT desirable
                 * to COW .text.  We simply keep .text from ever being COW'ed
                 * and take the heat that one cannot debug wired .text sections.
                 *
                 * XXX Try to allow the above by specifying OVERRIDE_WRITE.
                 */
                result = vm_map_lookup(&fs.map, vaddr,
                                       VM_PROT_READ | VM_PROT_WRITE |
                                       VM_PROT_OVERRIDE_WRITE,
                                       &fs.entry, &fs.first_ba,
                                       &first_pindex, &first_count,
                                       &fs.first_prot, &fs.wflags);
                if (result != KERN_SUCCESS) {
                        /* could also be KERN_FAILURE_NOFAULT */
                        result = KERN_FAILURE;
                        goto done;
                }

                /*
                 * If we don't COW now, on a user wire, the user will never
                 * be able to write to the mapping.  If we don't make this
                 * restriction, the bookkeeping would be nearly impossible.
                 *
                 * XXX We have a shared lock, this will have a MP race but
                 * I don't see how it can hurt anything.
                 */
                if ((fs.first_prot & VM_PROT_WRITE) == 0) {
                        atomic_clear_char(&fs.entry->max_protection,
                                          VM_PROT_WRITE);
                }
        }

        /*
         * fs.map is read-locked
         *
         * Misc checks.  Save the map generation number to detect races.
         */
        fs.lookup_still_valid = 1;
        fs.first_m = NULL;
        fs.ba = fs.first_ba;		/* so unlock_things() works */
        fs.prot = fs.first_prot;	/* default (used by uksmap) */

        if (fs.entry->eflags & (MAP_ENTRY_NOFAULT | MAP_ENTRY_KSTACK)) {
                if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
                        panic("vm_fault: fault on nofault entry, addr: %p",
                              (void *)vaddr);
                }
                if ((fs.entry->eflags & MAP_ENTRY_KSTACK) &&
                    vaddr >= fs.entry->ba.start &&
                    vaddr < fs.entry->ba.start + PAGE_SIZE) {
                        panic("vm_fault: fault on stack guard, addr: %p",
                              (void *)vaddr);
                }
        }

        /*
         * A user-kernel shared map has no VM object and bypasses
         * everything.  We execute the uksmap function with a temporary
         * fictitious vm_page.  The address is directly mapped with no
         * management.
         */
        if (fs.entry->maptype == VM_MAPTYPE_UKSMAP) {
                struct vm_page fakem;

                bzero(&fakem, sizeof(fakem));
                fakem.pindex = first_pindex;
                fakem.flags = PG_FICTITIOUS | PG_UNQUEUED;
                fakem.busy_count = PBUSY_LOCKED;
                fakem.valid = VM_PAGE_BITS_ALL;
                fakem.pat_mode = VM_MEMATTR_DEFAULT;
                if (fs.entry->ba.uksmap(&fs.entry->ba, UKSMAPOP_FAULT,
                                        fs.entry->aux.dev, &fakem)) {
                        result = KERN_FAILURE;
                        unlock_things(&fs);
                        goto done2;
                }
                pmap_enter(fs.map->pmap, vaddr, &fakem, fs.prot | inherit_prot,
                           (fs.wflags & FW_WIRED), fs.entry);
                goto done_success;
        }

        /*
         * A system map entry may return a NULL object.  No object means
         * no pager means an unrecoverable kernel fault.
         */
        if (fs.first_ba == NULL) {
                panic("vm_fault: unrecoverable fault at %p in entry %p",
                      (void *)vaddr, fs.entry);
        }

        /*
         * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
         * is set.
         *
         * Unfortunately a deadlock can occur if we are forced to page-in
         * from swap, but diving all the way into the vm_pager_get_page()
         * function to find out is too much.  Just check the object type.
         *
         * The deadlock is a CAM deadlock on a busy VM page when trying
         * to finish an I/O if another process gets stuck in
         * vop_helper_read_shortcut() due to a swap fault.
         */
        if ((td->td_flags & TDF_NOFAULT) &&
            (retry ||
             fs.first_ba->object->type == OBJT_VNODE ||
             fs.first_ba->object->type == OBJT_SWAP ||
             fs.first_ba->backing_ba)) {
                result = KERN_FAILURE;
                unlock_things(&fs);
                goto done2;
        }

        /*
         * If the entry is wired the page protection level is limited to
         * what the vm_map_lookup() allowed us.
         *
         * XXX it is unclear if this code is still needed as vm_map_lookup()
         * no longer prevents protection changes on locked memory.  REMOVE
         * IF WE DETERMINE THAT THIS CODE IS NO LONGER NEEDED.
         */
        if (fs.wflags & FW_WIRED)
                fault_type = fs.first_prot;

        /*
         * We generally want to avoid unnecessary exclusive modes on backing
         * and terminal objects because this can seriously interfere with
         * heavily fork()'d processes (particularly /bin/sh scripts).
         *
         * However, we also want to avoid unnecessary retries due to needed
         * shared->exclusive promotion for common faults.  Exclusive mode is
         * always needed if any page insertion, rename, or free occurs in an
         * object (and also indirectly if any I/O is done).
         *
         * The main issue here is going to be fs.first_shared.  If the
         * first_object has a backing object which isn't shadowed and the
         * process is single-threaded we might as well use an exclusive
         * lock/chain right off the bat.
         */
#if 0
        /* WORK IN PROGRESS, CODE REMOVED */
        if (fs.first_shared && fs.first_object->backing_object &&
            LIST_EMPTY(&fs.first_object->shadow_head) &&
            td->td_proc && td->td_proc->p_nthreads == 1) {
                fs.first_shared = 0;
        }
#endif

        /*
         * VM_FAULT_UNSWAP - swap_pager_unswapped() needs an exclusive object
         * VM_FAULT_DIRTY  - may require swap_pager_unswapped() later, but
         *		     we can try shared first.
         */
        if (fault_flags & VM_FAULT_UNSWAP)
                fs.first_shared = 0;

        /*
         * Try to shortcut the entire mess and run the fault lockless.
         * This will burst in multiple pages via fs->mary[].
         */
        if (vm_fault_bypass_count &&
            vm_fault_bypass(&fs, first_pindex, first_count,
                            &mextcount, fault_type) == KERN_SUCCESS) {
                fault_flags &= ~VM_FAULT_BURST;
                goto success;
        }

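        /*
         * If the bypass could not be used we fall through to the full,
         * object-locked fault path below.  On bypass success the burst
         * flag was cleared above, so the later vm_prefault*() pass is
         * skipped; the bypass path does its own multi-page burst via
         * fs.mary[].
         */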
        /*
         * Exclusive heuristic (alloc page vs page exists)
         */
        if (fs.first_ba->flags & VM_MAP_BACK_EXCL_HEUR)
                fs.first_shared = 0;

        /*
         * Obtain a top-level object lock, shared or exclusive depending
         * on fs.first_shared.  If a shared lock winds up being insufficient
         * we will retry with an exclusive lock.
         *
         * The vnode pager lock is always shared.
         */
        if (fs.first_shared)
                vm_object_hold_shared(fs.first_ba->object);
        else
                vm_object_hold(fs.first_ba->object);
        if (fs.vp == NULL)
                fs.vp = vnode_pager_lock(fs.first_ba);
        fs.first_ba_held = 1;

        /*
         * The page we want is at (first_object, first_pindex).
         *
         * Now we have the actual (object, pindex), fault in the page.  If
         * vm_fault_object() fails it will unlock and deallocate the FS
         * data.  If it succeeds everything remains locked and fs->ba->object
         * will have an additional PIP count if fs->ba != fs->first_ba.
         *
         * vm_fault_object will set fs->prot for the pmap operation.  It is
         * allowed to set VM_PROT_WRITE if fault_type == VM_PROT_READ if the
         * page can be safely written.  However, it will force a read-only
         * mapping for a read fault if the memory is managed by a virtual
         * page table.
         *
         * If the fault code uses the shared object lock shortcut
         * we must not try to burst (we can't allocate VM pages).
         */
        result = vm_fault_object(&fs, first_pindex, fault_type, 1);

        if (debug_fault > 0) {
                --debug_fault;
                kprintf("VM_FAULT result %d addr=%jx type=%02x flags=%02x "
                        "fs.m=%p fs.prot=%02x fs.wflags=%02x fs.entry=%p\n",
                        result, (intmax_t)vaddr, fault_type, fault_flags,
                        fs.mary[0], fs.prot, fs.wflags, fs.entry);
        }

        if (result == KERN_TRY_AGAIN) {
                ++retry;
                goto RetryFault;
        }
        if (result != KERN_SUCCESS) {
                goto done;
        }

success:
        /*
         * On success vm_fault_object() does not unlock or deallocate, and
         * fs.mary[0] will contain a busied page.  It does drop fs->ba if
         * appropriate.
         *
         * Enter the page into the pmap and do pmap-related adjustments.
         *
         * WARNING! Soft-busied pages in fs.mary[] can only be manipulated
         *	    in limited ways.
         */
        KKASSERT(fs.lookup_still_valid != 0);
        vm_page_flag_set(fs.mary[0], PG_REFERENCED);

        for (n = 0; n < mextcount; ++n) {
                pmap_enter(fs.map->pmap, vaddr + (n << PAGE_SHIFT),
                           fs.mary[n], fs.prot | inherit_prot,
                           fs.wflags & FW_WIRED, fs.entry);
        }

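        /*
         * mextcount is 1 on the normal path; it exceeds 1 only when
         * vm_fault_bypass() burst in additional consecutive pages, each
         * of which was entered at the next higher page address by the
         * loop above.
         */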
        /*
         * If the page is not wired down, then put it where the pageout daemon
         * can find it.
         *
         * NOTE: We cannot safely wire, unwire, or adjust queues for a
         *	 soft-busied page.
         */
        for (n = 0; n < mextcount; ++n) {
                if (fs.msoftonly) {
                        KKASSERT(fs.mary[n]->busy_count & PBUSY_MASK);
                        KKASSERT((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0);
                        vm_page_sbusy_drop(fs.mary[n]);
                } else {
                        if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
                                if (fs.wflags & FW_WIRED)
                                        vm_page_wire(fs.mary[n]);
                                else
                                        vm_page_unwire(fs.mary[n], 1);
                        } else {
                                vm_page_activate(fs.mary[n]);
                        }
                        KKASSERT(fs.mary[n]->busy_count & PBUSY_LOCKED);
                        vm_page_wakeup(fs.mary[n]);
                }
        }

        /*
         * Burst in a few more pages if possible.  The fs.map should still
         * be locked.  To avoid interlocking against a vnode->getblk
         * operation we had to be sure to unbusy our primary vm_page above
         * first.
         *
         * A normal burst can continue down backing store, but only execute
         * it if we are holding an exclusive lock, otherwise the exclusive
         * locks the burst code gets might cause excessive SMP collisions.
         *
         * A quick burst can be utilized when there is no backing object
         * (i.e. a shared file mmap).
         */
        if ((fault_flags & VM_FAULT_BURST) &&
            (fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 &&
            (fs.wflags & FW_WIRED) == 0) {
                if (fs.first_shared == 0 && fs.shared == 0) {
                        vm_prefault(fs.map->pmap, vaddr,
                                    fs.entry, fs.prot, fault_flags);
                } else {
                        vm_prefault_quick(fs.map->pmap, vaddr,
                                          fs.entry, fs.prot, fault_flags);
                }
        }

done_success:
        /*
         * Unlock everything, and return
         */
        unlock_things(&fs);

        mycpu->gd_cnt.v_vm_faults++;
        if (td->td_lwp) {
                if (fs.hardfault) {
                        ++td->td_lwp->lwp_ru.ru_majflt;
                } else {
                        ++td->td_lwp->lwp_ru.ru_minflt;
                }
        }

        /*vm_object_deallocate(fs.first_ba->object);*/
        /*fs.m = NULL; */

        result = KERN_SUCCESS;
done:
        if (fs.first_ba && fs.first_ba->object && fs.first_ba_held == 1) {
                vm_object_drop(fs.first_ba->object);
                fs.first_ba_held = 0;
        }
done2:
        if (lp)
                lp->lwp_flags &= ~LWP_PAGING;

#if !defined(NO_SWAPPING)
        /*
         * Check the process RSS limit and force deactivation and
         * (asynchronous) paging if necessary.  This is a complex operation,
         * only do it for direct user-mode faults, for now.
         *
         * To reduce overhead implement approximately a ~16MB hysteresis
         * (the 4096-page slop below is 16MB at 4KB per page).
         */
        p = td->td_proc;
        if ((fault_flags & VM_FAULT_USERMODE) && lp &&
            p->p_limit && map->pmap && vm_pageout_memuse_mode >= 1 &&
            map != kernel_map) {
                vm_pindex_t limit;
                vm_pindex_t size;

                limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
                                        p->p_rlimit[RLIMIT_RSS].rlim_max));
                size = pmap_resident_tlnw_count(map->pmap);
                if (limit >= 0 && size > 4096 && size - 4096 >= limit) {
                        vm_pageout_map_deactivate_pages(map, limit);
                }
        }
#endif

        if (result != KERN_SUCCESS && debug_fault < 0) {
                kprintf("VM_FAULT %d:%d (%s) result %d "
                        "addr=%jx type=%02x flags=%02x "
                        "fs.m=%p fs.prot=%02x fs.wflags=%02x fs.entry=%p\n",
                        (curthread->td_proc ? curthread->td_proc->p_pid : -1),
                        (curthread->td_lwp ? curthread->td_lwp->lwp_tid : -1),
                        curthread->td_comm,
                        result,
                        (intmax_t)vaddr, fault_type, fault_flags,
                        fs.mary[0], fs.prot, fs.wflags, fs.entry);
                while (debug_fault < 0 && (debug_fault & 1))
                        tsleep(&debug_fault, 0, "DEBUG", hz);
        }

        return (result);
}

/*
 * Attempt a lockless vm_fault() shortcut.  The stars have to align for this
 * to work.  But if it does we can get our page only soft-busied and not
 * have to touch the vm_object or vnode locks at all.
 */
static
int
vm_fault_bypass(struct faultstate *fs, vm_pindex_t first_pindex,
                vm_pindex_t first_count, int *mextcountp,
                vm_prot_t fault_type)
{
        vm_page_t m;
        vm_object_t obj;	/* NOT LOCKED */
        int n;
        int nlim;

        /*
         * Don't waste time if the object is only being used by one vm_map.
         */
        obj = fs->first_ba->object;
#if 0
        if (obj->flags & OBJ_ONEMAPPING)
                return KERN_FAILURE;
#endif

        /*
         * This will try to wire/unwire a page, which can't be done with
         * a soft-busied page.
         */
        if (fs->fault_flags & VM_FAULT_WIRE_MASK)
                return KERN_FAILURE;

        /*
         * Ok, try to get the vm_page quickly via the hash table.  The
         * page will be soft-busied on success (NOT hard-busied).
         */
        m = vm_page_hash_get(obj, first_pindex);
        if (m == NULL) {
#ifdef VM_FAULT_QUICK_DEBUG
                ++vm_fault_bypass_failure_count2;
#endif
                return KERN_FAILURE;
        }
        if ((obj->flags & OBJ_DEAD) ||
            m->valid != VM_PAGE_BITS_ALL ||
            m->queue - m->pc != PQ_ACTIVE ||
            (m->flags & PG_SWAPPED)) {
                vm_page_sbusy_drop(m);
#ifdef VM_FAULT_QUICK_DEBUG
                ++vm_fault_bypass_failure_count3;
#endif
                return KERN_FAILURE;
        }

        /*
         * The page is already fully valid, ACTIVE, and is not PG_SWAPPED.
         *
         * Don't map the page writable when emulating the dirty bit, a
         * fault must be taken for proper emulation (vkernel).
         */
        if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
            pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
                if ((fault_type & VM_PROT_WRITE) == 0)
                        fs->prot &= ~VM_PROT_WRITE;
        }

        /*
         * If this is a write fault the object and the page must already
         * be writable.  Since we don't hold an object lock and only a
         * soft-busy on the page, we cannot manipulate the object or
         * the page state (other than the page queue).
         */
        if (fs->prot & VM_PROT_WRITE) {
                if ((obj->flags & (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY)) !=
                    (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY) ||
                    m->dirty != VM_PAGE_BITS_ALL) {
                        vm_page_sbusy_drop(m);
#ifdef VM_FAULT_QUICK_DEBUG
                        ++vm_fault_bypass_failure_count4;
#endif
                        return KERN_FAILURE;
                }
                vm_set_nosync(m, fs->entry);
        }

        /*
         * Set page and potentially burst in more
         *
         * Even though we are only soft-busied we can still move pages
         * around in the normal queue(s).  The soft-busy prevents the
         * page from being removed from the object, etc (normal operation).
         *
         * However, in this fast path it is excessively important to avoid
         * any hard locks, so we use a special passive version of activate.
         */
        fs->msoftonly = 1;
        fs->mary[0] = m;
        vm_page_soft_activate(m);

        if (vm_fault_bypass_count > 1) {
                nlim = vm_fault_bypass_count;
                if (nlim > VM_FAULT_MAX_QUICK)	/* array limit(+1) */
                        nlim = VM_FAULT_MAX_QUICK;
                if (nlim > first_count)		/* user limit */
                        nlim = first_count;

                for (n = 1; n < nlim; ++n) {
                        m = vm_page_hash_get(obj, first_pindex + n);
                        if (m == NULL)
                                break;
                        if (m->valid != VM_PAGE_BITS_ALL ||
                            m->queue - m->pc != PQ_ACTIVE ||
                            (m->flags & PG_SWAPPED)) {
                                vm_page_sbusy_drop(m);
                                break;
                        }
                        if (fs->prot & VM_PROT_WRITE) {
                                if ((obj->flags & (OBJ_WRITEABLE |
                                                   OBJ_MIGHTBEDIRTY)) !=
                                    (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY) ||
                                    m->dirty != VM_PAGE_BITS_ALL) {
                                        vm_page_sbusy_drop(m);
                                        break;
                                }
                        }
                        vm_page_soft_activate(m);
                        fs->mary[n] = m;
                }
                *mextcountp = n;
        }

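        /*
         * Every page left in fs->mary[] is consecutive in the object and
         * remains only soft-busied; the caller drops each soft-busy after
         * its pmap_enter() (see the fs.msoftonly handling in vm_fault()).
         */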
#ifdef VM_FAULT_QUICK_DEBUG
        ++vm_fault_bypass_success_count;
#endif

        return KERN_SUCCESS;
}

/*
 * Fault in the specified virtual address in the current process map,
 * returning a held VM page or NULL.  See vm_fault_page() for more
 * information.
 *
 * No requirements.
 */
vm_page_t
vm_fault_page_quick(vm_offset_t va, vm_prot_t fault_type,
                    int *errorp, int *busyp)
{
        struct lwp *lp = curthread->td_lwp;
        vm_page_t m;

        m = vm_fault_page(&lp->lwp_vmspace->vm_map, va,
                          fault_type, VM_FAULT_NORMAL,
                          errorp, busyp);
        return(m);
}

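/*
 * Illustrative use of the above (hypothetical caller; the cleanup
 * convention follows the busyp contract documented for vm_fault_page()
 * below):
 *
 *	int error, busy;
 *	vm_page_t m;
 *
 *	m = vm_fault_page_quick(uva, VM_PROT_READ, &error, &busy);
 *	if (m) {
 *		(access the page contents)
 *		if (busy)
 *			vm_page_wakeup(m);	(page was returned busied)
 *		else
 *			vm_page_unhold(m);	(page was returned held only)
 *	}
 */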
/*
 * Fault in the specified virtual address in the specified map, doing all
 * necessary manipulation of the object store and all necessary I/O.  Return
 * a held VM page or NULL, and set *errorp.  The related pmap is not
 * updated.
 *
 * If busyp is not NULL then *busyp will be set to TRUE if this routine
 * decides to return a busied page (aka VM_PROT_WRITE), or FALSE if it
 * does not (VM_PROT_WRITE not specified or busyp is NULL).  If busyp is
 * NULL the returned page is only held.
 *
 * If the caller has no intention of writing to the page's contents, busyp
 * can be passed as NULL along with VM_PROT_WRITE to force a COW operation
 * without busying the page.
 *
 * The returned page will also be marked PG_REFERENCED.
 *
 * If the page cannot be faulted writable and VM_PROT_WRITE was specified, an
 * error will be returned.
 *
 * No requirements.
 */
vm_page_t
vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
              int fault_flags, int *errorp, int *busyp)
{
        vm_pindex_t first_pindex;
        vm_pindex_t first_count;
        struct faultstate fs;
        int result;
        int retry;
        int growstack;
        int didcow;
        vm_prot_t orig_fault_type = fault_type;

        retry = 0;
        didcow = 0;
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);

        /*
         * Dive the pmap (concurrency possible).  If we find the
         * appropriate page we can terminate early and quickly.
         *
         * This works great for normal programs but will always return
         * NULL for host lookups of vkernel maps in VMM mode.
         *
         * NOTE: pmap_fault_page_quick() might not busy the page.  If
         *	 VM_PROT_WRITE is set in fault_type and pmap_fault_page_quick()
         *	 returns non-NULL, it will safely dirty the returned vm_page_t
         *	 for us.  We cannot safely dirty it here (it might not be
         *	 busy).
         */
        fs.mary[0] = pmap_fault_page_quick(map->pmap, vaddr, fault_type, busyp);
        if (fs.mary[0]) {
                *errorp = 0;
                return(fs.mary[0]);
        }

        /*
         * Otherwise take a concurrency hit and do a formal page
         * fault.
         */
        fs.vp = NULL;
        fs.shared = vm_shared_fault;
        fs.first_shared = vm_shared_fault;
        fs.msoftonly = 0;
        growstack = 1;

        /*
         * VM_FAULT_UNSWAP - swap_pager_unswapped() needs an exclusive object
         * VM_FAULT_DIRTY  - may require swap_pager_unswapped() later, but
         *		     we can try shared first.
         */
        if (fault_flags & VM_FAULT_UNSWAP) {
                fs.first_shared = 0;
        }

RetryFault:
        /*
         * Find the vm_map_entry representing the backing store and resolve
         * the top level object and page index.  This may have the side
         * effect of executing a copy-on-write on the map entry and/or
         * creating a shadow object, but will not COW any actual VM pages.
         *
         * On success fs.map is left read-locked and various other fields
         * are initialized but not otherwise referenced or locked.
         *
         * NOTE! vm_map_lookup will upgrade the fault_type to VM_FAULT_WRITE
         *	 if the map entry is a virtual page table and also writable,
         *	 so we can set the 'A' (accessed) bit in the virtual page
         *	 table entry.
         */
1096 | fs.map = map; | |
9de48ead | 1097 | fs.first_ba_held = 0; |
4e158347 | 1098 | result = vm_map_lookup(&fs.map, vaddr, fault_type, |
9de48ead | 1099 | &fs.entry, &fs.first_ba, |
01251219 MD |
1100 | &first_pindex, &first_count, |
1101 | &fs.first_prot, &fs.wflags); | |
4e158347 MD |
1102 | |
1103 | if (result != KERN_SUCCESS) { | |
b443039b MD |
1104 | if (result == KERN_FAILURE_NOFAULT) { |
1105 | *errorp = KERN_FAILURE; | |
01251219 | 1106 | fs.mary[0] = NULL; |
b443039b MD |
1107 | goto done; |
1108 | } | |
1109 | if (result != KERN_PROTECTION_FAILURE || | |
1110 | (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE) | |
1111 | { | |
1112 | if (result == KERN_INVALID_ADDRESS && growstack && | |
1eeaf6b2 | 1113 | map != kernel_map && curproc != NULL) { |
95270b7e | 1114 | result = vm_map_growstack(map, vaddr); |
b443039b MD |
1115 | if (result == KERN_SUCCESS) { |
1116 | growstack = 0; | |
1117 | ++retry; | |
1118 | goto RetryFault; | |
1119 | } | |
1120 | result = KERN_FAILURE; | |
1121 | } | |
01251219 | 1122 | fs.mary[0] = NULL; |
b443039b MD |
1123 | *errorp = result; |
1124 | goto done; | |
1125 | } | |
1126 | ||
1127 | /* | |
1128 | * If we are user-wiring a r/w segment, and it is COW, then | |
1129 | * we need to do the COW operation. Note that we don't | |
1130 | * currently COW RO sections now, because it is NOT desirable | |
1131 | * to COW .text. We simply keep .text from ever being COW'ed | |
1132 | * and take the heat that one cannot debug wired .text sections. | |
1133 | */ | |
1134 | result = vm_map_lookup(&fs.map, vaddr, | |
c936cb6f | 1135 | VM_PROT_READ | VM_PROT_WRITE | |
b443039b | 1136 | VM_PROT_OVERRIDE_WRITE, |
9de48ead | 1137 | &fs.entry, &fs.first_ba, |
01251219 MD |
1138 | &first_pindex, &first_count, |
1139 | &fs.first_prot, &fs.wflags); | |
b443039b MD |
1140 | if (result != KERN_SUCCESS) { |
1141 | /* could also be KERN_FAILURE_NOFAULT */ | |
1142 | *errorp = KERN_FAILURE; | |
01251219 | 1143 | fs.mary[0] = NULL; |
b443039b MD |
1144 | goto done; |
1145 | } | |
1146 | ||
1147 | /* | |
1148 | * If we don't COW now, on a user wire, the user will never | |
1149 | * be able to write to the mapping. If we don't make this | |
1150 | * restriction, the bookkeeping would be nearly impossible. | |
1151 | * | |
1152 | * XXX We have a shared lock, this will have a MP race but | |
1153 | * I don't see how it can hurt anything. | |
1154 | */ | |
c936cb6f | 1155 | if ((fs.first_prot & VM_PROT_WRITE) == 0) { |
b443039b MD |
1156 | atomic_clear_char(&fs.entry->max_protection, |
1157 | VM_PROT_WRITE); | |
1158 | } | |
4e158347 MD |
1159 | } |
1160 | ||
1161 | /* | |
1162 | * fs.map is read-locked | |
1163 | * | |
1164 | * Misc checks. Save the map generation number to detect races. | |
1165 | */ | |
1c024bc6 | 1166 | fs.lookup_still_valid = 1; |
862481e5 | 1167 | fs.first_m = NULL; |
9de48ead | 1168 | fs.ba = fs.first_ba; |
4e158347 MD |
1169 | |
1170 | if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { | |
1171 | panic("vm_fault: fault on nofault entry, addr: %lx", | |
1172 | (u_long)vaddr); | |
1173 | } | |
1174 | ||
6a5c487b MD |
1175 | /* |
1176 | * A user-kernel shared map has no VM object and bypasses | |
1177 | * everything. We execute the uksmap function with a temporary | |
1178 | * fictitious vm_page. The address is directly mapped with no | |
1179 | * management. | |
1180 | */ | |
1181 | if (fs.entry->maptype == VM_MAPTYPE_UKSMAP) { | |
1182 | struct vm_page fakem; | |
1183 | ||
1184 | bzero(&fakem, sizeof(fakem)); | |
1185 | fakem.pindex = first_pindex; | |
831a8507 | 1186 | fakem.flags = PG_FICTITIOUS | PG_UNQUEUED; |
bc0aa189 | 1187 | fakem.busy_count = PBUSY_LOCKED; |
6a5c487b MD |
1188 | fakem.valid = VM_PAGE_BITS_ALL; |
1189 | fakem.pat_mode = VM_MEMATTR_DEFAULT; | |
64b5a8a5 MD |
1190 | if (fs.entry->ba.uksmap(&fs.entry->ba, UKSMAPOP_FAULT, |
1191 | fs.entry->aux.dev, &fakem)) { | |
6a5c487b | 1192 | *errorp = KERN_FAILURE; |
01251219 | 1193 | fs.mary[0] = NULL; |
6a5c487b MD |
1194 | unlock_things(&fs); |
1195 | goto done2; | |
1196 | } | |
01251219 MD |
1197 | fs.mary[0] = PHYS_TO_VM_PAGE(fakem.phys_addr); |
1198 | vm_page_hold(fs.mary[0]); | |
dc039ae0 MD |
1199 | if (busyp) |
1200 | *busyp = 0; /* don't need to busy R or W */ | |
6a5c487b MD |
1201 | unlock_things(&fs); |
1202 | *errorp = 0; | |
1203 | goto done; | |
1204 | } | |
1205 | ||
1206 | ||
4e158347 MD |
1207 | /* |
1208 | * A system map entry may return a NULL object. No object means | |
1209 | * no pager means an unrecoverable kernel fault. | |
1210 | */ | |
9de48ead | 1211 | if (fs.first_ba == NULL) { |
4e158347 MD |
1212 | panic("vm_fault: unrecoverable fault at %p in entry %p", |
1213 | (void *)vaddr, fs.entry); | |
1214 | } | |
1215 | ||
862481e5 MD |
1216 | /* |
1217 | * Fail here if not a trivial anonymous page fault and TDF_NOFAULT | |
1218 | * is set. | |
10c39de2 MD |
1219 | * |
1220 | * Unfortunately a deadlock can occur if we are forced to page-in | |
1221 | * from swap, but diving all the way into the vm_pager_get_page() | |
1222 | * function to find out is too much. Just check the object type. | |
862481e5 MD |
1223 | */ |
1224 | if ((curthread->td_flags & TDF_NOFAULT) && | |
43320d68 | 1225 | (retry || |
9de48ead MD |
1226 | fs.first_ba->object->type == OBJT_VNODE || |
1227 | fs.first_ba->object->type == OBJT_SWAP || | |
1228 | fs.first_ba->backing_ba)) { | |
d846739a | 1229 | *errorp = KERN_FAILURE; |
862481e5 | 1230 | unlock_things(&fs); |
01251219 | 1231 | fs.mary[0] = NULL; |
862481e5 MD |
1232 | goto done2; |
1233 | } | |
1234 | ||
501747bf | 1235 | /* |
c936cb6f MD |
1236 | * If the entry is wired the page protection level is limited to |
1237 | * what the vm_map_lookup() allowed us. | |
1238 | * | |
1239 | * XXX it is unclear if this code is still needed as vm_map_lookup() | |
1240 | * no longer prevents protection changes on locked memory. REMOVE | |
1241 | * IF WE DETERMINE THAT THIS CODE IS NO LONGER NEEDED. | |
501747bf | 1242 | */ |
7a45978d | 1243 | if (fs.wflags & FW_WIRED) |
501747bf MD |
1244 | fault_type = fs.first_prot; |
1245 | ||
4e158347 MD |
1246 | /* |
1247 | * Make a reference to this object to prevent its disposal while we | |
1248 | * are messing with it. Once we have the reference, the map is free | |
1249 | * to be diddled. Since objects reference their shadows (and copies), | |
1250 | * they will stay around as well. | |
1251 | * | |
b12defdc MD |
1252 | * The reference should also prevent an unexpected collapse of the |
1253 | * parent that might move pages from the current object into the | |
1254 | * parent unexpectedly, resulting in corruption. | |
1255 | * | |
4e158347 MD |
1256 | * Bump the paging-in-progress count to prevent size changes (e.g. |
1257 | * truncation operations) during I/O. This must be done after | |
1258 | * obtaining the vnode lock in order to avoid possible deadlocks. | |
1259 | */ | |
9de48ead MD |
1260 | if (fs.first_ba->flags & VM_MAP_BACK_EXCL_HEUR) |
1261 | fs.first_shared = 0; | |
1262 | ||
501747bf | 1263 | if (fs.first_shared) |
9de48ead | 1264 | vm_object_hold_shared(fs.first_ba->object); |
501747bf | 1265 | else |
9de48ead MD |
1266 | vm_object_hold(fs.first_ba->object); |
1267 | fs.first_ba_held = 1; | |
501747bf | 1268 | if (fs.vp == NULL) |
9de48ead | 1269 | fs.vp = vnode_pager_lock(fs.first_ba); /* shared */ |
4e158347 MD |
1270 | |
1271 | /* | |
4d4f84f5 | 1272 | * The page we want is at (first_object, first_pindex). |
4e158347 | 1273 | * |
4e158347 MD |
1274 | * Now we have the actual (object, pindex), fault in the page. If |
1275 | * vm_fault_object() fails it will unlock and deallocate the FS | |
9de48ead MD |
1276 | * data. If it succeeds everything remains locked and fs->ba->object |
1277 | * will have an additinal PIP count if fs->ba != fs->first_ba. | |
4e158347 | 1278 | */ |
01251219 | 1279 | fs.mary[0] = NULL; |
43320d68 | 1280 | result = vm_fault_object(&fs, first_pindex, fault_type, 1); |
4e158347 | 1281 | |
        if (result == KERN_TRY_AGAIN) {
                KKASSERT(fs.first_ba_held == 0);
                ++retry;
                didcow |= fs.wflags & FW_DIDCOW;
                goto RetryFault;
        }
        if (result != KERN_SUCCESS) {
                *errorp = result;
                fs.mary[0] = NULL;
                goto done;
        }

        if ((orig_fault_type & VM_PROT_WRITE) &&
            (fs.prot & VM_PROT_WRITE) == 0) {
                *errorp = KERN_PROTECTION_FAILURE;
                unlock_things(&fs);
                fs.mary[0] = NULL;
                goto done;
        }

        /*
         * Generally speaking we don't want to update the pmap because
         * this routine can be called many times for situations that do
         * not require updating the pmap, not to mention the page might
         * already be in the pmap.
         *
         * However, if our vm_map_lookup() results in a COW, we need to
         * at least remove the pte from the pmap to guarantee proper
         * visibility of modifications made to the process.  For example,
         * modifications made by vkernel uiocopy/related routines and
         * modifications made by ptrace().
         */
        vm_page_flag_set(fs.mary[0], PG_REFERENCED);
#if 0
        pmap_enter(fs.map->pmap, vaddr, fs.mary[0], fs.prot,
                   fs.wflags & FW_WIRED, NULL);
        mycpu->gd_cnt.v_vm_faults++;
        if (curthread->td_lwp)
                ++curthread->td_lwp->lwp_ru.ru_minflt;
#endif
        if ((fs.wflags | didcow) & FW_DIDCOW) {
                pmap_remove(fs.map->pmap,
                            vaddr & ~PAGE_MASK,
                            (vaddr & ~PAGE_MASK) + PAGE_SIZE);
        }

        /*
         * On success vm_fault_object() does not unlock or deallocate, and
         * fs.mary[0] will contain a busied page.  So we must unlock here
         * after having messed with the pmap.
         */
        unlock_things(&fs);

        /*
         * Return a held page.  We are not doing any pmap manipulation so do
         * not set PG_MAPPED.  However, adjust the page flags according to
         * the fault type because the caller may not use a managed pmapping
         * (so we don't want to lose the fact that the page will be dirtied
         * if a write fault was specified).
         */
        if (fault_type & VM_PROT_WRITE)
                vm_page_dirty(fs.mary[0]);
        vm_page_activate(fs.mary[0]);

        if (curthread->td_lwp) {
                if (fs.hardfault) {
                        curthread->td_lwp->lwp_ru.ru_majflt++;
                } else {
                        curthread->td_lwp->lwp_ru.ru_minflt++;
                }
        }

        /*
         * Unlock everything, and return the held or busied page.
         */
        if (busyp) {
                if (fault_type & VM_PROT_WRITE) {
                        vm_page_dirty(fs.mary[0]);
                        *busyp = 1;
                } else {
                        *busyp = 0;
                        vm_page_hold(fs.mary[0]);
                        vm_page_wakeup(fs.mary[0]);
                }
        } else {
                vm_page_hold(fs.mary[0]);
                vm_page_wakeup(fs.mary[0]);
        }
        /*vm_object_deallocate(fs.first_ba->object);*/
        *errorp = 0;

done:
        KKASSERT(fs.first_ba_held == 0);
done2:
        return(fs.mary[0]);
}
1378 | ||
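/*
 * Illustrative sketch, not part of the original file: one way a caller
 * might consume the held-vs-busied contract of the routine above.  The
 * helper name and the argument order of vm_fault_page() are assumptions
 * inferred from the surrounding code, so treat this as a sketch rather
 * than the canonical API.
 */
#if 0
static int
example_fault_read_page(vm_map_t map, vm_offset_t va)
{
	vm_page_t m;
	int error;
	int busied;

	m = vm_fault_page(map, trunc_page(va), VM_PROT_READ,
			  VM_FAULT_NORMAL, &error, &busied);
	if (m == NULL)
		return (error);

	/* ... access the page contents here ... */

	if (busied)
		vm_page_wakeup(m);	/* page returned busied (write) */
	else
		vm_page_unhold(m);	/* page returned held only (read) */
	return (0);
}
#endif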
aa542ad5 | 1379 | /* |
17cde63e MD |
1380 | * Fault in the specified (object,offset), dirty the returned page as |
1381 | * needed. If the requested fault_type cannot be satisfied, NULL is | |
1382 | * returned along with an error. | |
9ad0147b MD |
1383 | * |
1384 | * A held (but not busied) page is returned. | |
1385 | * | |
501747bf MD |
1386 | * The passed in object must be held as specified by the shared |
1387 | * argument. | |
aa542ad5 MD |
1388 | */ |
1389 | vm_page_t | |
1390 | vm_fault_object_page(vm_object_t object, vm_ooffset_t offset, | |
ce94514e | 1391 | vm_prot_t fault_type, int fault_flags, |
501747bf | 1392 | int *sharedp, int *errorp) |
aa542ad5 MD |
1393 | { |
1394 | int result; | |
1395 | vm_pindex_t first_pindex; | |
01251219 | 1396 | vm_pindex_t first_count; |
aa542ad5 MD |
1397 | struct faultstate fs; |
1398 | struct vm_map_entry entry; | |
1399 | ||
5b329e62 MD |
1400 | /* |
1401 | * Since we aren't actually faulting the page into a | |
1402 | * pmap we can just fake the entry.ba. | |
1403 | */ | |
b12defdc | 1404 | ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); |
aa542ad5 | 1405 | bzero(&entry, sizeof(entry)); |
aa542ad5 MD |
1406 | entry.maptype = VM_MAPTYPE_NORMAL; |
1407 | entry.protection = entry.max_protection = fault_type; | |
9de48ead MD |
1408 | entry.ba.backing_ba = NULL; |
1409 | entry.ba.object = object; | |
1410 | entry.ba.offset = 0; | |
aa542ad5 | 1411 | |
aa542ad5 MD |
1412 | fs.hardfault = 0; |
1413 | fs.fault_flags = fault_flags; | |
1414 | fs.map = NULL; | |
501747bf MD |
1415 | fs.shared = vm_shared_fault; |
1416 | fs.first_shared = *sharedp; | |
70f3bb08 | 1417 | fs.msoftonly = 0; |
501747bf | 1418 | fs.vp = NULL; |
1c024bc6 | 1419 | fs.first_ba_held = -1; /* object held across call, prevent drop */ |
aa542ad5 MD |
1420 | KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0); |
1421 | ||
61dac052 | 1422 | /* |
ceb0e493 MD |
1423 | * VM_FAULT_UNSWAP - swap_pager_unswapped() needs an exclusive object |
1424 | * VM_FAULT_DIRTY - may require swap_pager_unswapped() later, but | |
1425 | * we can try shared first. | |
61dac052 | 1426 | */ |
ceb0e493 | 1427 | if (fs.first_shared && (fault_flags & VM_FAULT_UNSWAP)) { |
61dac052 MD |
1428 | fs.first_shared = 0; |
1429 | vm_object_upgrade(object); | |
1430 | } | |
1431 | ||
1432 | /* | |
1433 | * Retry loop as needed (typically for shared->exclusive transitions) | |
1434 | */ | |
aa542ad5 | 1435 | RetryFault: |
501747bf | 1436 | *sharedp = fs.first_shared; |
aa542ad5 | 1437 | first_pindex = OFF_TO_IDX(offset); |
01251219 | 1438 | first_count = 1; |
9de48ead MD |
1439 | fs.first_ba = &entry.ba; |
1440 | fs.ba = fs.first_ba; | |
aa542ad5 MD |
1441 | fs.entry = &entry; |
1442 | fs.first_prot = fault_type; | |
7a45978d | 1443 | fs.wflags = 0; |
aa542ad5 MD |
1444 | |
1445 | /* | |
1446 | * Make a reference to this object to prevent its disposal while we | |
1447 | * are messing with it. Once we have the reference, the map is free | |
1448 | * to be diddled. Since objects reference their shadows (and copies), | |
1449 | * they will stay around as well. | |
1450 | * | |
b12defdc MD |
1451 | * The reference should also prevent an unexpected collapse of the |
1452 | * parent that might move pages from the current object into the | |
1453 | * parent unexpectedly, resulting in corruption. | |
1454 | * | |
aa542ad5 MD |
1455 | * Bump the paging-in-progress count to prevent size changes (e.g. |
1456 | * truncation operations) during I/O. This must be done after | |
1457 | * obtaining the vnode lock in order to avoid possible deadlocks. | |
1458 | */ | |
501747bf | 1459 | if (fs.vp == NULL) |
9de48ead | 1460 | fs.vp = vnode_pager_lock(fs.first_ba); |
aa542ad5 | 1461 | |
1c024bc6 | 1462 | fs.lookup_still_valid = 1; |
aa542ad5 | 1463 | fs.first_m = NULL; |
aa542ad5 | 1464 | |
aa542ad5 MD |
1465 | /* |
1466 | * Now we have the actual (object, pindex), fault in the page. If | |
1467 | * vm_fault_object() fails it will unlock and deallocate the FS | |
9de48ead MD |
1468 | * data. If it succeeds everything remains locked and fs->ba->object |
1469 | * will have an additional PIP count if fs->ba != fs->first_ba. | |
501747bf | 1470 | * |
9de48ead | 1471 | * On KERN_TRY_AGAIN vm_fault_object() leaves fs.first_ba intact. |
501747bf | 1472 | * We may have to upgrade its lock to handle the requested fault. |
aa542ad5 | 1473 | */ |
43320d68 | 1474 | result = vm_fault_object(&fs, first_pindex, fault_type, 0); |
aa542ad5 | 1475 | |
501747bf MD |
1476 | if (result == KERN_TRY_AGAIN) { |
1477 | if (fs.first_shared == 0 && *sharedp) | |
1478 | vm_object_upgrade(object); | |
aa542ad5 | 1479 | goto RetryFault; |
501747bf | 1480 | } |
aa542ad5 MD |
1481 | if (result != KERN_SUCCESS) { |
1482 | *errorp = result; | |
1483 | return(NULL); | |
1484 | } | |
1485 | ||
17cde63e MD |
1486 | if ((fault_type & VM_PROT_WRITE) && (fs.prot & VM_PROT_WRITE) == 0) { |
1487 | *errorp = KERN_PROTECTION_FAILURE; | |
9de48ead | 1488 | unlock_things(&fs); |
17cde63e MD |
1489 | return(NULL); |
1490 | } | |
1491 | ||
aa542ad5 | 1492 | /* |
d2d8515b MD |
1493 | * On success vm_fault_object() does not unlock or deallocate, so we |
1494 | * do it here. Note that the returned fs.m will be busied. | |
aa542ad5 MD |
1495 | */ |
1496 | unlock_things(&fs); | |
1497 | ||
1498 | /* | |
1499 | * Return a held page. We are not doing any pmap manipulation so do | |
1500 | * not set PG_MAPPED. However, adjust the page flags according to | |
1501 | * the fault type because the caller may not use a managed pmapping | |
1502 | * (so we don't want to lose the fact that the page will be dirtied | |
1503 | * if a write fault was specified). | |
1504 | */ | |
01251219 MD |
1505 | vm_page_hold(fs.mary[0]); |
1506 | vm_page_activate(fs.mary[0]); | |
54341a3b | 1507 | if ((fault_type & VM_PROT_WRITE) || (fault_flags & VM_FAULT_DIRTY)) |
01251219 | 1508 | vm_page_dirty(fs.mary[0]); |
9f3543c6 | 1509 | if (fault_flags & VM_FAULT_UNSWAP) |
01251219 | 1510 | swap_pager_unswapped(fs.mary[0]); |
9f3543c6 | 1511 | |
aa542ad5 MD |
1512 | /* |
1513 | * Indicate that the page was accessed. | |
1514 | */ | |
01251219 | 1515 | vm_page_flag_set(fs.mary[0], PG_REFERENCED); |
aa542ad5 | 1516 | |
aa542ad5 MD |
1517 | if (curthread->td_lwp) { |
1518 | if (fs.hardfault) { | |
aa542ad5 MD |
1519 | curthread->td_lwp->lwp_ru.ru_majflt++; |
1520 | } else { | |
1521 | curthread->td_lwp->lwp_ru.ru_minflt++; | |
1522 | } | |
1523 | } | |
1524 | ||
1525 | /* | |
1526 | * Unlock everything, and return the held page. | |
1527 | */ | |
01251219 | 1528 | vm_page_wakeup(fs.mary[0]); |
9de48ead | 1529 | /*vm_object_deallocate(fs.first_ba->object);*/ |
aa542ad5 MD |
1530 | |
1531 | *errorp = 0; | |
01251219 | 1532 | return(fs.mary[0]); |
aa542ad5 MD |
1533 | } |
1534 | ||
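/*
 * Illustrative sketch, not part of the original file: calling
 * vm_fault_object_page() under the locking contract documented above.
 * The object is held shared first; the routine updates *sharedp (and
 * upgrades the lock itself) when the fault needs exclusivity.  The
 * helper name and the page usage are hypothetical.
 */
#if 0
static int
example_object_read_page(vm_object_t obj, vm_ooffset_t off)
{
	vm_page_t m;
	int shared = 1;		/* try the cheaper shared hold first */
	int error;

	vm_object_hold_shared(obj);
	m = vm_fault_object_page(obj, off, VM_PROT_READ, 0,
				 &shared, &error);
	if (m == NULL) {
		vm_object_drop(obj);
		return (error);
	}

	/* ... use the held (not busied) page here ... */

	vm_page_unhold(m);
	vm_object_drop(obj);
	return (0);
}
#endif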
568e6804 | 1535 | /* |
9ad0147b MD |
1536 | * This is the core of the vm_fault code. |
1537 | * | |
9de48ead MD |
1538 | * Do all operations required to fault-in (fs.first_ba->object, pindex). |
1539 | * Run through the backing store as necessary and do required COW or virtual | |
568e6804 MD |
1540 | * copy operations. The caller has already fully resolved the vm_map_entry |
1541 | * and, if appropriate, has created a copy-on-write layer. All we need to | |
1542 | * do is iterate the object chain. | |
1543 | * | |
1544 | * On failure (fs) is unlocked and deallocated and the caller may return or | |
75f59a66 | 1545 | * retry depending on the failure code. On success (fs) is NOT unlocked or |
01251219 | 1546 | * deallocated, fs.mary[0] will contain a resolved, busied page, and fs.ba's |
9de48ead MD |
1547 | * object will have an additional PIP count if it is not equal to |
1548 | * fs.first_ba. | |
9ad0147b | 1549 | * |
501747bf MD |
1550 | * If locks based on fs->first_shared or fs->shared are insufficient, |
1551 | * clear the appropriate field(s) and return RETRY. COWs require that | |
1552 | * first_shared be 0, while page allocations (or frees) require that | |
1553 | * shared be 0. Renames require that both be 0. | |
1554 | * | |
ceb0e493 MD |
1555 | * NOTE! fs->[first_]shared might be set with VM_FAULT_DIRTY also set. |
1556 | * we will have to retry with it exclusive if the vm_page is | |
1557 | * PG_SWAPPED. | |
1558 | * | |
9de48ead | 1559 | * fs->first_ba->object must be held on call. |
568e6804 MD |
1560 | */ |
1561 | static | |
1562 | int | |
43320d68 MD |
1563 | vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex, |
1564 | vm_prot_t fault_type, int allow_nofault) | |
568e6804 | 1565 | { |
44293a80 | 1566 | vm_map_backing_t next_ba; |
72579d2e | 1567 | vm_pindex_t pindex; |
b12defdc | 1568 | int error; |
568e6804 | 1569 | |
9de48ead | 1570 | ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_ba->object)); |
72579d2e | 1571 | fs->prot = fs->first_prot; |
72579d2e | 1572 | pindex = first_pindex; |
9de48ead | 1573 | KKASSERT(fs->ba == fs->first_ba); |
72579d2e | 1574 | |
9de48ead | 1575 | vm_object_pip_add(fs->first_ba->object, 1); |
b12defdc | 1576 | |
4e7c41c5 | 1577 | /* |
5947157e MD |
1578 | * If a read fault occurs we try to upgrade the page protection |
1579 | * and make it also writable if possible. There are three cases | |
1580 | * where we cannot make the page mapping writable: | |
4e7c41c5 MD |
1581 | * |
1582 | * (1) The mapping is read-only or the VM object is read-only, | |
0035dca9 | 1583 | * fs->prot above will simply not have VM_PROT_WRITE set. |
4e7c41c5 | 1584 | * |
4d4f84f5 | 1585 | * (2) If the VM page is read-only or copy-on-write, upgrading would |
4e7c41c5 | 1586 | * just result in an unnecessary COW fault. |
0035dca9 | 1587 | * |
4d4f84f5 | 1588 | * (3) If the pmap specifically requests A/M bit emulation, downgrade |
c50e690b | 1589 | * here. |
4e7c41c5 | 1590 | */ |
a86ce0cd MD |
1591 | if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace && |
1592 | pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) { | |
1593 | if ((fault_type & VM_PROT_WRITE) == 0) | |
1594 | fs->prot &= ~VM_PROT_WRITE; | |
1595 | } | |
1596 | ||
9de48ead | 1597 | /* vm_object_hold(fs->ba->object); implied b/c ba == first_ba */ |
9ad0147b | 1598 | |
568e6804 MD |
1599 | for (;;) { |
1600 | /* | |
1601 | * If the object is dead, we stop here | |
1602 | */ | |
9de48ead MD |
1603 | if (fs->ba->object->flags & OBJ_DEAD) { |
1604 | vm_object_pip_wakeup(fs->first_ba->object); | |
1605 | unlock_things(fs); | |
984263bc MD |
1606 | return (KERN_PROTECTION_FAILURE); |
1607 | } | |
1608 | ||
1609 | /* | |
b12defdc MD |
1610 | * See if the page is resident. Wait/Retry if the page is |
1611 | * busy (lots of stuff may have changed so we can't continue | |
1612 | * in that case). | |
1613 | * | |
1614 | * We can theoretically allow the soft-busy case on a read | |
1615 | * fault if the page is marked valid, but since such | |
1616 | * pages are typically already pmap'd, putting that | |
1617 | * special case in might be more effort than it is | |
1618 | * worth. We cannot under any circumstances mess | |
1619 | * around with a vm_page_t->busy page except, perhaps, | |
1620 | * to pmap it. | |
984263bc | 1621 | */ |
01251219 MD |
1622 | fs->mary[0] = vm_page_lookup_busy_try(fs->ba->object, pindex, |
1623 | TRUE, &error); | |
b12defdc | 1624 | if (error) { |
9de48ead | 1625 | vm_object_pip_wakeup(fs->first_ba->object); |
b12defdc | 1626 | unlock_things(fs); |
01251219 | 1627 | vm_page_sleep_busy(fs->mary[0], TRUE, "vmpfw"); |
b12defdc | 1628 | mycpu->gd_cnt.v_intrans++; |
01251219 | 1629 | fs->mary[0] = NULL; |
b12defdc MD |
1630 | return (KERN_TRY_AGAIN); |
1631 | } | |
01251219 | 1632 | if (fs->mary[0]) { |
984263bc | 1633 | /* |
b12defdc | 1634 | * The page is busied for us. |
984263bc | 1635 | * |
568e6804 MD |
1636 | * If reactivating a page from PQ_CACHE we may have |
1637 | * to rate-limit. | |
1638 | */ | |
01251219 MD |
1639 | int queue = fs->mary[0]->queue; |
1640 | vm_page_unqueue_nowakeup(fs->mary[0]); | |
984263bc | 1641 | |
01251219 | 1642 | if ((queue - fs->mary[0]->pc) == PQ_CACHE && |
e91e64c7 | 1643 | vm_paging_severe()) { |
01251219 MD |
1644 | vm_page_activate(fs->mary[0]); |
1645 | vm_page_wakeup(fs->mary[0]); | |
1646 | fs->mary[0] = NULL; | |
9de48ead MD |
1647 | vm_object_pip_wakeup(fs->first_ba->object); |
1648 | unlock_things(fs); | |
43320d68 MD |
1649 | if (allow_nofault == 0 || |
1650 | (curthread->td_flags & TDF_NOFAULT) == 0) { | |
2c9e2984 MD |
1651 | thread_t td; |
1652 | ||
43320d68 | 1653 | vm_wait_pfault(); |
2c9e2984 MD |
1654 | td = curthread; |
1655 | if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL)) | |
1656 | return (KERN_PROTECTION_FAILURE); | |
43320d68 | 1657 | } |
568e6804 | 1658 | return (KERN_TRY_AGAIN); |
984263bc MD |
1659 | } |
1660 | ||
1661 | /* | |
b12defdc MD |
1662 | * If it still isn't completely valid (readable), |
1663 | * or if a read-ahead-mark is set on the VM page, | |
1664 | * jump to readrest, else we found the page and | |
1665 | * can return. | |
06ecca5a MD |
1666 | * |
1667 | * We can release the spl once we have marked the | |
1668 | * page busy. | |
984263bc | 1669 | */ |
712b6620 | 1670 | if (fs->mary[0]->object != kernel_object) { |
01251219 | 1671 | if ((fs->mary[0]->valid & VM_PAGE_BITS_ALL) != |
cf1bb2a8 MD |
1672 | VM_PAGE_BITS_ALL) { |
1673 | goto readrest; | |
1674 | } | |
01251219 | 1675 | if (fs->mary[0]->flags & PG_RAM) { |
cf1bb2a8 MD |
1676 | if (debug_cluster) |
1677 | kprintf("R"); | |
01251219 | 1678 | vm_page_flag_clear(fs->mary[0], PG_RAM); |
cf1bb2a8 MD |
1679 | goto readrest; |
1680 | } | |
984263bc | 1681 | } |
64b5a8a5 MD |
1682 | atomic_clear_int(&fs->first_ba->flags, |
1683 | VM_MAP_BACK_EXCL_HEUR); | |
568e6804 | 1684 | break; /* break to PAGE HAS BEEN FOUND */ |
984263bc MD |
1685 | } |
1686 | ||
1687 | /* | |
1688 | * Page is not resident, If this is the search termination | |
1689 | * or the pager might contain the page, allocate a new page. | |
1690 | */ | |
9de48ead MD |
1691 | if (TRYPAGER(fs) || fs->ba == fs->first_ba) { |
1692 | /* | |
1693 | * If this is a SWAP object we can use the shared | |
1694 | * lock to check existence of a swap block. If | |
1695 | * there isn't one we can skip to the next object. | |
1696 | * | |
1697 | * However, if this is the first object we allocate | |
1698 | * a page now just in case we need to copy to it | |
1699 | * later. | |
1700 | */ | |
1701 | if (fs->ba != fs->first_ba && | |
1702 | fs->ba->object->type == OBJT_SWAP) { | |
1703 | if (swap_pager_haspage_locked(fs->ba->object, | |
1704 | pindex) == 0) { | |
1705 | goto next; | |
1706 | } | |
1707 | } | |
1708 | ||
501747bf MD |
1709 | /* |
1710 | * Allocating, must be exclusive. | |
1711 | */ | |
64b5a8a5 MD |
1712 | atomic_set_int(&fs->first_ba->flags, |
1713 | VM_MAP_BACK_EXCL_HEUR); | |
9de48ead | 1714 | if (fs->ba == fs->first_ba && fs->first_shared) { |
501747bf | 1715 | fs->first_shared = 0; |
9de48ead MD |
1716 | vm_object_pip_wakeup(fs->first_ba->object); |
1717 | unlock_things(fs); | |
501747bf MD |
1718 | return (KERN_TRY_AGAIN); |
1719 | } | |
9de48ead | 1720 | if (fs->ba != fs->first_ba && fs->shared) { |
501747bf MD |
1721 | fs->first_shared = 0; |
1722 | fs->shared = 0; | |
9de48ead MD |
1723 | vm_object_pip_wakeup(fs->first_ba->object); |
1724 | unlock_things(fs); | |
501747bf MD |
1725 | return (KERN_TRY_AGAIN); |
1726 | } | |
1727 | ||
568e6804 MD |
1728 | /* |
1729 | * If the page is beyond the object size we fail | |
1730 | */ | |
9de48ead MD |
1731 | if (pindex >= fs->ba->object->size) { |
1732 | vm_object_pip_wakeup(fs->first_ba->object); | |
1733 | unlock_things(fs); | |
984263bc MD |
1734 | return (KERN_PROTECTION_FAILURE); |
1735 | } | |
1736 | ||
1737 | /* | |
1738 | * Allocate a new page for this object/offset pair. | |
d2d8515b MD |
1739 | * |
1740 | * It is possible for the allocation to race, so | |
1741 | * handle the case. | |
5ebb17ad MD |
1742 | * |
1743 | * Does not apply to OBJT_MGTDEVICE (e.g. gpu / drm | |
1744 | * subsystem). For OBJT_MGTDEVICE the pages are not | |
1745 | * indexed in the VM object at all but instead directly | |
1746 | * entered into the pmap. | |
984263bc | 1747 | */ |
01251219 | 1748 | fs->mary[0] = NULL; |
5ebb17ad MD |
1749 | if (fs->ba->object->type == OBJT_MGTDEVICE) |
1750 | goto readrest; | |
1751 | ||
e91e64c7 | 1752 | if (!vm_paging_severe()) { |
01251219 MD |
1753 | fs->mary[0] = vm_page_alloc(fs->ba->object, |
1754 | pindex, | |
9de48ead | 1755 | ((fs->vp || fs->ba->backing_ba) ? |
d2d8515b MD |
1756 | VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL : |
1757 | VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL | | |
54341a3b | 1758 | VM_ALLOC_USE_GD | VM_ALLOC_ZERO)); |
984263bc | 1759 | } |
01251219 | 1760 | if (fs->mary[0] == NULL) { |
9de48ead MD |
1761 | vm_object_pip_wakeup(fs->first_ba->object); |
1762 | unlock_things(fs); | |
43320d68 MD |
1763 | if (allow_nofault == 0 || |
1764 | (curthread->td_flags & TDF_NOFAULT) == 0) { | |
2c9e2984 MD |
1765 | thread_t td; |
1766 | ||
43320d68 | 1767 | vm_wait_pfault(); |
2c9e2984 MD |
1768 | td = curthread; |
1769 | if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL)) | |
1770 | return (KERN_PROTECTION_FAILURE); | |
43320d68 | 1771 | } |
568e6804 | 1772 | return (KERN_TRY_AGAIN); |
984263bc | 1773 | } |
b12defdc MD |
1774 | |
1775 | /* | |
1776 | * Fall through to readrest. We have a new page which | |
1777 | * will have to be paged in (since m->valid will be 0). | |
1778 | */ | |
984263bc MD |
1779 | } |
1780 | ||
1781 | readrest: | |
1782 | /* | |
1b9d3514 | 1783 | * We have found an invalid or partially valid page, a |
1c9602b3 MD |
1784 | * page with a read-ahead mark which might be partially or |
1785 | * fully valid (and maybe dirty too), or we have allocated | |
1786 | * a new page. | |
984263bc MD |
1787 | * |
1788 | * Attempt to fault-in the page if there is a chance that the | |
1789 | * pager has it, and potentially fault in additional pages | |
1790 | * at the same time. | |
06ecca5a | 1791 | * |
01251219 MD |
1792 | * If TRYPAGER is true then fs.mary[0] will be non-NULL and |
1793 | * busied for us. | |
984263bc | 1794 | */ |
568e6804 | 1795 | if (TRYPAGER(fs)) { |
568e6804 | 1796 | u_char behavior = vm_map_entry_behavior(fs->entry); |
9de48ead MD |
1797 | vm_object_t object; |
1798 | vm_page_t first_m; | |
1799 | int seqaccess; | |
9de48ead | 1800 | int rv; |
984263bc | 1801 | |
1b9d3514 MD |
1802 | if (behavior == MAP_ENTRY_BEHAV_RANDOM) |
1803 | seqaccess = 0; | |
1804 | else | |
1805 | seqaccess = -1; | |
984263bc | 1806 | |
1b9d3514 | 1807 | /* |
501747bf MD |
1808 | * Doing I/O may synchronously insert additional |
1809 | * pages so we can't be shared at this point either. | |
1b9d3514 | 1810 | * |
01251219 MD |
1811 | * NOTE: We can't free fs->mary[0] here in the |
1812 | * allocated case (fs->ba != fs->first_ba) as | |
1813 | * this would require an exclusively locked | |
501747bf | 1814 | * VM object. |
1b9d3514 | 1815 | */ |
9de48ead | 1816 | if (fs->ba == fs->first_ba && fs->first_shared) { |
5ebb17ad MD |
1817 | if (fs->mary[0]) { |
1818 | vm_page_deactivate(fs->mary[0]); | |
1819 | vm_page_wakeup(fs->mary[0]); | |
1820 | fs->mary[0] = NULL; | |
1821 | } | |
501747bf | 1822 | fs->first_shared = 0; |
9de48ead MD |
1823 | vm_object_pip_wakeup(fs->first_ba->object); |
1824 | unlock_things(fs); | |
501747bf MD |
1825 | return (KERN_TRY_AGAIN); |
1826 | } | |
9de48ead | 1827 | if (fs->ba != fs->first_ba && fs->shared) { |
5ebb17ad MD |
1828 | if (fs->mary[0]) { |
1829 | vm_page_deactivate(fs->mary[0]); | |
1830 | vm_page_wakeup(fs->mary[0]); | |
1831 | fs->mary[0] = NULL; | |
1832 | } | |
501747bf MD |
1833 | fs->first_shared = 0; |
1834 | fs->shared = 0; | |
9de48ead MD |
1835 | vm_object_pip_wakeup(fs->first_ba->object); |
1836 | unlock_things(fs); | |
501747bf | 1837 | return (KERN_TRY_AGAIN); |
984263bc MD |
1838 | } |
1839 | ||
9de48ead | 1840 | object = fs->ba->object; |
9de48ead | 1841 | first_m = NULL; |
1c024bc6 | 1842 | |
9de48ead | 1843 | /* object is held, no more access to entry or ba's */ |
984263bc MD |
1844 | |
1845 | /* | |
9de48ead MD |
1846 | * Acquire the page data. We still hold object |
1847 | * and the page has been BUSY'd. | |
1b9d3514 | 1848 | * |
9de48ead MD |
1849 | * We own the page, but we must re-issue the lookup |
1850 | * because the pager may have replaced it (for example, | |
1851 | * in order to enter a fictitious page into the | |
1852 | * object). In this situation the pager will have | |
1853 | * cleaned up the old page and left the new one | |
1854 | * busy for us. | |
1c9602b3 MD |
1855 | * |
1856 | * If we got here through a PG_RAM read-ahead | |
1857 | * mark the page may be partially dirty and thus | |
1858 | * not freeable. Don't bother checking to see | |
1859 | * if the pager has the page because we can't free | |
1860 | * it anyway. We have to depend on the get_page | |
1861 | * operation filling in any gaps whether there is | |
1862 | * backing store or not. | |
9de48ead | 1863 | * |
01251219 | 1864 | * We must dispose of the page (fs->mary[0]) and also |
9de48ead MD |
1865 | * possibly first_m (the fronting layer). If |
1866 | * this is a write fault leave the page intact | |
01251219 | 1867 | * because we will probably have to copy fs->mary[0] |
9de48ead MD |
1868 | * to fs->first_m on the retry. If this is a |
1869 | * read fault we probably won't need the page. | |
5ebb17ad MD |
1870 | * |
1871 | * For OBJT_MGTDEVICE (and eventually all types), | |
1872 | * fs->mary[0] is not pre-allocated and may be set | |
1873 | * to a vm_page (busied for us) without being inserted | |
1874 | * into the object. In this case we want to return | |
1875 | * the vm_page directly so the caller can issue the | |
1876 | * pmap_enter(). | |
984263bc | 1877 | */ |
5ebb17ad MD |
1878 | rv = vm_pager_get_page(object, pindex, |
1879 | &fs->mary[0], seqaccess); | |
984263bc MD |
1880 | |
1881 | if (rv == VM_PAGER_OK) { | |
9de48ead | 1882 | ++fs->hardfault; |
5ebb17ad MD |
1883 | if (object->type == OBJT_MGTDEVICE) { |
1884 | break; | |
1885 | } | |
1886 | ||
01251219 MD |
1887 | fs->mary[0] = vm_page_lookup(object, pindex); |
1888 | if (fs->mary[0]) { | |
1889 | vm_page_activate(fs->mary[0]); | |
1890 | vm_page_wakeup(fs->mary[0]); | |
1891 | fs->mary[0] = NULL; | |
9de48ead MD |
1892 | } |
1893 | ||
01251219 | 1894 | if (fs->mary[0]) { |
5ebb17ad | 1895 | /* NOT REACHED */ |
9de48ead MD |
1896 | /* have page */ |
1897 | break; | |
1898 | } | |
1899 | vm_object_pip_wakeup(fs->first_ba->object); | |
1900 | unlock_things(fs); | |
1901 | return (KERN_TRY_AGAIN); | |
9de48ead MD |
1902 | } |
1903 | ||
1904 | /* | |
1905 | * If the pager doesn't have the page, continue on | |
1906 | * to the next object. Retain the vm_page if this | |
1907 | * is the first object, we may need to copy into | |
1908 | * it later. | |
1909 | */ | |
1910 | if (rv == VM_PAGER_FAIL) { | |
1911 | if (fs->ba != fs->first_ba) { | |
5ebb17ad MD |
1912 | if (fs->mary[0]) { |
1913 | vm_page_free(fs->mary[0]); | |
1914 | fs->mary[0] = NULL; | |
1915 | } | |
9de48ead MD |
1916 | } |
1917 | goto next; | |
984263bc | 1918 | } |
568e6804 | 1919 | |
984263bc MD |
1920 | /* |
1921 | * Remove the bogus page (which does not exist at this | |
9de48ead | 1922 | * object/offset). |
984263bc MD |
1923 | * |
1924 | * Also wake up any other process that may want to bring | |
1925 | * in this page. | |
1926 | * | |
1927 | * If this is the top-level object, we must leave the | |
1928 | * busy page to prevent another process from rushing | |
1929 | * past us, and inserting the page in that object at | |
1930 | * the same time that we are. | |
1931 | */ | |
a0bc8638 | 1932 | if (rv == VM_PAGER_ERROR) { |
b12defdc MD |
1933 | if (curproc) { |
1934 | kprintf("vm_fault: pager read error, " | |
1935 | "pid %d (%s)\n", | |
1936 | curproc->p_pid, | |
1937 | curproc->p_comm); | |
1938 | } else { | |
1939 | kprintf("vm_fault: pager read error, " | |
1940 | "thread %p (%s)\n", | |
1941 | curthread, | |
37d2f283 | 1942 | curthread->td_comm); |
b12defdc | 1943 | } |
a0bc8638 | 1944 | } |
1b9d3514 | 1945 | |
9de48ead MD |
1946 | /* |
1947 | * I/O error or data outside pager's range. | |
1948 | */ | |
01251219 MD |
1949 | if (fs->mary[0]) { |
1950 | vnode_pager_freepage(fs->mary[0]); | |
1951 | fs->mary[0] = NULL; | |
9de48ead MD |
1952 | } |
1953 | if (first_m) { | |
1954 | vm_page_free(first_m); | |
1955 | first_m = NULL; /* safety */ | |
1956 | } | |
1957 | vm_object_pip_wakeup(object); | |
1958 | unlock_things(fs); | |
1c024bc6 | 1959 | |
9de48ead MD |
1960 | switch(rv) { |
1961 | case VM_PAGER_ERROR: | |
1962 | return (KERN_FAILURE); | |
1963 | case VM_PAGER_BAD: | |
1964 | return (KERN_PROTECTION_FAILURE); | |
1965 | default: | |
1966 | return (KERN_PROTECTION_FAILURE); | |
1967 | } | |
1968 | ||
1969 | #if 0 | |
984263bc MD |
1970 | /* |
1971 | * Data outside the range of the pager or an I/O error | |
a55afca2 MD |
1972 | * |
1973 | * The page may have been wired during the pagein, | |
1974 | * e.g. by the buffer cache, and cannot simply be | |
1b9d3514 | 1975 | * freed. Call vnode_pager_freepage() to deal with it. |
501747bf | 1976 | * |
9de48ead MD |
1977 | * The object is not held shared so we can safely |
1978 | * free the page. | |
984263bc | 1979 | */ |
9de48ead | 1980 | if (fs->ba != fs->first_ba) { |
e24b12cd | 1981 | |
501747bf MD |
1982 | /* |
1983 | * XXX - we cannot just fall out at this | |
1984 | * point, m has been freed and is invalid! | |
1985 | */ | |
1986 | } | |
9de48ead | 1987 | |
984263bc MD |
1988 | /* |
1989 | * XXX - the check for kernel_map is a kludge to work | |
1990 | * around having the machine panic on a kernel space | |
1991 | * fault w/ I/O error. | |
1992 | */ | |
1eeaf6b2 | 1993 | if (((fs->map != kernel_map) && |
1b9d3514 | 1994 | (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { |
501747bf | 1995 | if (fs->m) { |
9de48ead MD |
1996 | /* from just above */ |
1997 | KKASSERT(fs->first_shared == 0); | |
1998 | vnode_pager_freepage(fs->m); | |
501747bf MD |
1999 | fs->m = NULL; |
2000 | } | |
568e6804 | 2001 | /* NOT REACHED */ |
984263bc | 2002 | } |
9de48ead | 2003 | #endif |
984263bc MD |
2004 | } |
2005 | ||
9de48ead | 2006 | next: |
984263bc | 2007 | /* |
568e6804 | 2008 | * We get here if the object has a default pager (or unwiring) |
984263bc | 2009 | * or the pager doesn't have the page. |
501747bf MD |
2010 | * |
2011 | * fs->first_m will be used for the COW unless we find a | |
2012 | * deeper page to be mapped read-only, in which case the | |
2013 | * unlock*(fs) will free first_m. | |
984263bc | 2014 | */ |
9de48ead | 2015 | if (fs->ba == fs->first_ba) |
01251219 | 2016 | fs->first_m = fs->mary[0]; |
984263bc MD |
2017 | |
2018 | /* | |
b12defdc MD |
2019 | * Move on to the next object. The chain lock should prevent |
2020 | * the backing_object from getting ripped out from under us. | |
ce94514e | 2021 | * |
501747bf MD |
2022 | * The object lock for the next object is governed by |
2023 | * fs->shared. | |
984263bc | 2024 | */ |
67e7cb85 | 2025 | next_ba = fs->ba->backing_ba; |
9de48ead | 2026 | if (next_ba == NULL) { |
984263bc MD |
2027 | /* |
2028 | * If there's no object left, fill the page in the top | |
2029 | * object with zeros. | |
2030 | */ | |
9de48ead MD |
2031 | if (fs->ba != fs->first_ba) { |
2032 | vm_object_pip_wakeup(fs->ba->object); | |
2033 | vm_object_drop(fs->ba->object); | |
2034 | fs->ba = fs->first_ba; | |
72579d2e | 2035 | pindex = first_pindex; |
01251219 | 2036 | fs->mary[0] = fs->first_m; |
984263bc | 2037 | } |
568e6804 | 2038 | fs->first_m = NULL; |
984263bc MD |
2039 | |
2040 | /* | |
afd2da4d | 2041 | * Zero the page and mark it valid. |
984263bc | 2042 | */ |
01251219 | 2043 | vm_page_zero_fill(fs->mary[0]); |
12e4aaff | 2044 | mycpu->gd_cnt.v_zfod++; |
01251219 | 2045 | fs->mary[0]->valid = VM_PAGE_BITS_ALL; |
984263bc | 2046 | break; /* break to PAGE HAS BEEN FOUND */ |
984263bc | 2047 | } |
67e7cb85 MD |
2048 | |
2049 | if (fs->shared) | |
2050 | vm_object_hold_shared(next_ba->object); | |
2051 | else | |
2052 | vm_object_hold(next_ba->object); | |
2053 | KKASSERT(next_ba == fs->ba->backing_ba); | |
2054 | pindex -= OFF_TO_IDX(fs->ba->offset); | |
2055 | pindex += OFF_TO_IDX(next_ba->offset); | |
2056 | ||
9de48ead MD |
2057 | if (fs->ba != fs->first_ba) { |
2058 | vm_object_pip_wakeup(fs->ba->object); | |
2059 | vm_object_lock_swap(); /* flip ba/next_ba */ | |
2060 | vm_object_drop(fs->ba->object); | |
9ad0147b | 2061 | } |
9de48ead MD |
2062 | fs->ba = next_ba; |
2063 | vm_object_pip_add(next_ba->object, 1); | |
984263bc MD |
2064 | } |
2065 | ||
984263bc MD |
2066 | /* |
2067 | * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock | |
2068 | * is held.] | |
1b9d3514 | 2069 | * |
b12defdc | 2070 | * object still held. |
641f3b0a | 2071 | * vm_map may not be locked (determined by fs->lookup_still_valid) |
9ad0147b | 2072 | * |
501747bf MD |
2073 | * The local shared variable may differ from fs->shared. |
2074 | * | |
984263bc MD |
2075 | * If the page is being written, but isn't already owned by the |
2076 | * top-level object, we have to copy it into a new page owned by the | |
2077 | * top-level object. | |
2078 | */ | |
01251219 | 2079 | KASSERT((fs->mary[0]->busy_count & PBUSY_LOCKED) != 0, |
1b9d3514 MD |
2080 | ("vm_fault: not busy after main loop")); |
2081 | ||
9de48ead | 2082 | if (fs->ba != fs->first_ba) { |
984263bc MD |
2083 | /* |
2084 | * We only really need to copy if we want to write it. | |
2085 | */ | |
984263bc | 2086 | if (fault_type & VM_PROT_WRITE) { |
9de48ead MD |
2087 | #if 0 |
2088 | /* CODE REFACTOR IN PROGRESS, REMOVE OPTIMIZATION */ | |
984263bc MD |
2089 | /* |
2090 | * This allows pages to be virtually copied from a | |
2091 | * backing_object into the first_object, where the | |
2092 | * backing object has no other refs to it, and cannot | |
2093 | * gain any more refs. Instead of a bcopy, we just | |
2094 | * move the page from the backing object to the | |
2095 | * first object. Note that we must mark the page | |
2096 | * dirty in the first object so that it will go out | |
2097 | * to swap when needed. | |
2098 | */ | |
641f3b0a | 2099 | if (virtual_copy_ok(fs)) { |
984263bc | 2100 | /* |
b12defdc MD |
2101 | * (first_m) and (m) are both busied. We have to |
2102 | * move (m) into (first_m)'s object/pindex | |
2103 | * in an atomic fashion, then free (first_m). | |
2104 | * | |
2105 | * first_object is held so the second remove | |
2106 | * followed by the rename should wind | |
2107 | * up being atomic. vm_page_free() might | |
2108 | * block so we don't do it until after the | |
2109 | * rename. | |
984263bc | 2110 | */ |
568e6804 | 2111 | vm_page_protect(fs->first_m, VM_PROT_NONE); |
b12defdc | 2112 | vm_page_remove(fs->first_m); |
01251219 | 2113 | vm_page_rename(fs->mary[0], |
9de48ead | 2114 | fs->first_ba->object, |
b12defdc | 2115 | first_pindex); |
568e6804 | 2116 | vm_page_free(fs->first_m); |
01251219 MD |
2117 | fs->first_m = fs->mary[0]; |
2118 | fs->mary[0] = NULL; | |
12e4aaff | 2119 | mycpu->gd_cnt.v_cow_optim++; |
9de48ead MD |
2120 | } else |
2121 | #endif | |
2122 | { | |
984263bc MD |
2123 | /* |
2124 | * Oh, well, lets copy it. | |
385c96e4 | 2125 | * |
e32fb2aa MD |
2126 | * We used to unmap the original page here |
2127 | * because vm_fault_page() didn't and this | |
2128 | * would cause havoc for the umtx*() code | |
2129 | * and the procfs code. | |
bc0aa189 | 2130 | * |
e32fb2aa MD |
2131 | * This is no longer necessary. The |
2132 | * vm_fault_page() routine will now unmap the | |
2133 | * page after a COW, and the umtx code will | |
2134 | * recover on its own. | |
984263bc | 2135 | */ |
567a6398 | 2136 | /* |
01251219 MD |
2137 | * NOTE: Since fs->mary[0] is a backing page, |
2138 | * it is read-only, so there isn't any | |
567a6398 MD |
2139 | * copy race vs writers. |
2140 | */ | |
501747bf | 2141 | KKASSERT(fs->first_shared == 0); |
01251219 | 2142 | vm_page_copy(fs->mary[0], fs->first_m); |
e32fb2aa | 2143 | /* pmap_remove_specific( |
bc0aa189 | 2144 | &curthread->td_lwp->lwp_vmspace->vm_pmap, |
01251219 | 2145 | fs->mary[0]); */ |
984263bc MD |
2146 | } |
2147 | ||
501747bf MD |
2148 | /* |
2149 | * We no longer need the old page or object. | |
2150 | */ | |
01251219 | 2151 | if (fs->mary[0]) |
568e6804 | 2152 | release_page(fs); |
984263bc | 2153 | |
b12defdc | 2154 | /* |
9de48ead | 2155 | * fs->ba != fs->first_ba due to above conditional |
b12defdc | 2156 | */ |
9de48ead MD |
2157 | vm_object_pip_wakeup(fs->ba->object); |
2158 | vm_object_drop(fs->ba->object); | |
2159 | fs->ba = fs->first_ba; | |
984263bc MD |
2160 | |
2161 | /* | |
2162 | * Only use the new page below... | |
2163 | */ | |
12e4aaff | 2164 | mycpu->gd_cnt.v_cow_faults++; |
01251219 | 2165 | fs->mary[0] = fs->first_m; |
72579d2e | 2166 | pindex = first_pindex; |
984263bc | 2167 | } else { |
568e6804 MD |
2168 | /* |
2169 | * If it wasn't a write fault avoid having to copy | |
9de48ead MD |
2170 | * the page by mapping it read-only from backing |
2171 | * store. The process is not allowed to modify | |
2172 | * backing pages. | |
568e6804 MD |
2173 | */ |
2174 | fs->prot &= ~VM_PROT_WRITE; | |
984263bc MD |
2175 | } |
2176 | } | |
2177 | ||
2178 | /* | |
6e9c0867 MD |
2179 | * Relock the map if necessary, then check the generation count. |
2180 | * relock_map() will update fs->timestamp to account for the | |
2181 | * relocking if necessary. | |
2182 | * | |
2183 | * If the count has changed after relocking then all sorts of | |
2184 | * crap may have happened and we have to retry. | |
625a2937 MD |
2185 | * |
2186 | * NOTE: The relock_map() can fail due to a deadlock against | |
2187 | * the vm_page we are holding BUSY. | |
984263bc | 2188 | */ |
1c024bc6 | 2189 | KKASSERT(fs->lookup_still_valid != 0); |
9de48ead | 2190 | #if 0 |
1c024bc6 | 2191 | if (fs->lookup_still_valid == 0 && fs->map) { |
625a2937 MD |
2192 | if (relock_map(fs) || |
2193 | fs->map->timestamp != fs->map_generation) { | |
6e9c0867 | 2194 | release_page(fs); |
9de48ead MD |
2195 | vm_object_pip_wakeup(fs->first_ba->object); |
2196 | unlock_things(fs); | |
6e9c0867 MD |
2197 | return (KERN_TRY_AGAIN); |
2198 | } | |
568e6804 | 2199 | } |
9de48ead | 2200 | #endif |
568e6804 | 2201 | |
984263bc | 2202 | /* |
17cde63e MD |
2203 | * If the fault is a write, we know that this page is being |
2204 | * written NOW so dirty it explicitly to save on pmap_is_modified() | |
2205 | * calls later. | |
2206 | * | |
2207 | * If this is a NOSYNC mmap we do not want to set PG_NOSYNC | |
2208 | * if the page is already dirty to prevent data written with | |
2209 | * the expectation of being synced from not being synced. | |
2210 | * Likewise if this entry does not request NOSYNC then make | |
2211 | * sure the page isn't marked NOSYNC. Applications sharing | |
2212 | * data should use the same flags to avoid ping ponging. | |
2213 | * | |
2214 | * Also tell the backing pager, if any, that it should remove | |
2215 | * any swap backing since the page is now dirty. | |
984263bc | 2216 | */ |
01251219 | 2217 | vm_page_activate(fs->mary[0]); |
568e6804 | 2218 | if (fs->prot & VM_PROT_WRITE) { |
5ebb17ad | 2219 | vm_object_set_writeable_dirty(fs->first_ba->object); |
01251219 | 2220 | vm_set_nosync(fs->mary[0], fs->entry); |
568e6804 | 2221 | if (fs->fault_flags & VM_FAULT_DIRTY) { |
01251219 MD |
2222 | vm_page_dirty(fs->mary[0]); |
2223 | if (fs->mary[0]->flags & PG_SWAPPED) { | |
ceb0e493 MD |
2224 | /* |
2225 | * If the page is swapped out we have to call | |
2226 | * swap_pager_unswapped() which requires an | |
2227 | * exclusive object lock. If we are shared, | |
2228 | * we must clear the shared flag and retry. | |
2229 | */ | |
9de48ead | 2230 | if ((fs->ba == fs->first_ba && |
ceb0e493 | 2231 | fs->first_shared) || |
9de48ead | 2232 | (fs->ba != fs->first_ba && fs->shared)) { |
01251219 MD |
2233 | vm_page_wakeup(fs->mary[0]); |
2234 | fs->mary[0] = NULL; | |
9de48ead | 2235 | if (fs->ba == fs->first_ba) |
ceb0e493 MD |
2236 | fs->first_shared = 0; |
2237 | else | |
2238 | fs->shared = 0; | |
9de48ead MD |
2239 | vm_object_pip_wakeup( |
2240 | fs->first_ba->object); | |
2241 | unlock_things(fs); | |
ceb0e493 MD |
2242 | return (KERN_TRY_AGAIN); |
2243 | } | |
01251219 | 2244 | swap_pager_unswapped(fs->mary[0]); |
ceb0e493 | 2245 | } |
984263bc MD |
2246 | } |
2247 | } | |
2248 | ||
9de48ead MD |
2249 | /* |
2250 | * We found our page at backing layer ba. Leave the layer state | |
2251 | * intact. | |
2252 | */ | |
2253 | ||
2254 | vm_object_pip_wakeup(fs->first_ba->object); | |
2255 | #if 0 | |
2256 | if (fs->ba != fs->first_ba) | |
2257 | vm_object_drop(fs->ba->object); | |
2258 | #endif | |
9ad0147b | 2259 | |
984263bc | 2260 | /* |
75f59a66 | 2261 | * Page had better still be busy. We are still locked up and |
9de48ead MD |
2262 | * fs->ba->object will have another PIP reference for the case |
2263 | * where fs->ba != fs->first_ba. | |
984263bc | 2264 | */ |
01251219 MD |
2265 | KASSERT(fs->mary[0]->busy_count & PBUSY_LOCKED, |
2266 | ("vm_fault: page %p not busy!", fs->mary[0])); | |
984263bc | 2267 | |
984263bc MD |
2268 | /* |
2269 | * Sanity check: page must be completely valid or it is not fit to | |
2270 | * map into user space. vm_pager_get_pages() ensures this. | |
2271 | */ | |
01251219 MD |
2272 | if (fs->mary[0]->valid != VM_PAGE_BITS_ALL) { |
2273 | vm_page_zero_invalid(fs->mary[0], TRUE); | |
2274 | kprintf("Warning: page %p partially invalid on fault\n", | |
2275 | fs->mary[0]); | |
984263bc MD |
2276 | } |
2277 | ||
984263bc | 2278 | return (KERN_SUCCESS); |
984263bc MD |
2279 | } |
2280 | ||
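/*
 * Worked example, not part of the original file, of the pindex
 * translation performed in the backing-chain walk above:
 *
 *	pindex -= OFF_TO_IDX(fs->ba->offset);
 *	pindex += OFF_TO_IDX(next_ba->offset);
 *
 * With hypothetical offsets of 3 pages for ba->offset and 1 page for
 * next_ba->offset, a fault at pindex 10 in the current layer resolves
 * to pindex 10 - 3 + 1 = 8 in the deeper backing object.
 */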
2281 | /* | |
f2d22ebf MD |
2282 | * Wire down a range of virtual addresses in a map. The entry in question |
2283 | * should be marked in-transition and the map must be locked. We must | |
2284 | * release the map temporarily while faulting-in the page to avoid a | |
2285 | * deadlock. Note that the entry may be clipped while we are blocked but | |
2286 | * will never be freed. | |
9ad0147b | 2287 | * |
1c024bc6 | 2288 | * map must be locked on entry. |
984263bc MD |
2289 | */ |
2290 | int | |
06c66eb2 MD |
2291 | vm_fault_wire(vm_map_t map, vm_map_entry_t entry, |
2292 | boolean_t user_wire, int kmflags) | |
984263bc | 2293 | { |
f2d22ebf MD |
2294 | boolean_t fictitious; |
2295 | vm_offset_t start; | |
2296 | vm_offset_t end; | |
5f910b2f RG |
2297 | vm_offset_t va; |
2298 | pmap_t pmap; | |
984263bc | 2299 | int rv; |
06c66eb2 MD |
2300 | int wire_prot; |
2301 | int fault_flags; | |
76f1911e | 2302 | vm_page_t m; |
984263bc | 2303 | |
06c66eb2 MD |
2304 | if (user_wire) { |
2305 | wire_prot = VM_PROT_READ; | |
2306 | fault_flags = VM_FAULT_USER_WIRE; | |
2307 | } else { | |
2308 | wire_prot = VM_PROT_READ | VM_PROT_WRITE; | |
2309 | fault_flags = VM_FAULT_CHANGE_WIRING; | |
2310 | } | |
2311 | if (kmflags & KM_NOTLBSYNC) | |
2312 | wire_prot |= VM_PROT_NOSYNC; | |
2313 | ||
984263bc | 2314 | pmap = vm_map_pmap(map); |
67e7cb85 MD |
2315 | start = entry->ba.start; |
2316 | end = entry->ba.end; | |
73101af2 | 2317 | |
0adbcbd6 MD |
2318 | switch(entry->maptype) { |
2319 | case VM_MAPTYPE_NORMAL: | |
9de48ead MD |
2320 | fictitious = entry->ba.object && |
2321 | ((entry->ba.object->type == OBJT_DEVICE) || | |
2322 | (entry->ba.object->type == OBJT_MGTDEVICE)); | |
0adbcbd6 MD |
2323 | break; |
2324 | case VM_MAPTYPE_UKSMAP: | |
2325 | fictitious = TRUE; | |
2326 | break; | |
2327 | default: | |
2328 | fictitious = FALSE; | |
2329 | break; | |
2330 | } | |
2331 | ||
e40cfbd7 MD |
2332 | if (entry->eflags & MAP_ENTRY_KSTACK) |
2333 | start += PAGE_SIZE; | |
f2d22ebf | 2334 | map->timestamp++; |
6e9c0867 | 2335 | vm_map_unlock(map); |
984263bc | 2336 | |
984263bc MD |
2337 | /* |
2338 | * We simulate a fault to get the page and enter it in the physical | |
2339 | * map. | |
2340 | */ | |
2341 | for (va = start; va < end; va += PAGE_SIZE) { | |
06c66eb2 | 2342 | rv = vm_fault(map, va, wire_prot, fault_flags); |
984263bc | 2343 | if (rv) { |
f2d22ebf MD |
2344 | while (va > start) { |
2345 | va -= PAGE_SIZE; | |
76f1911e MD |
2346 | m = pmap_unwire(pmap, va); |
2347 | if (m && !fictitious) { | |
b12defdc MD |
2348 | vm_page_busy_wait(m, FALSE, "vmwrpg"); |
2349 | vm_page_unwire(m, 1); | |
2350 | vm_page_wakeup(m); | |
2351 | } | |
f2d22ebf | 2352 | } |
b12defdc | 2353 | goto done; |
984263bc MD |
2354 | } |
2355 | } | |
b12defdc MD |
2356 | rv = KERN_SUCCESS; |
2357 | done: | |
f2d22ebf | 2358 | vm_map_lock(map); |
73101af2 | 2359 | |
b12defdc | 2360 | return (rv); |
984263bc MD |
2361 | } |
2362 | ||
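/*
 * Illustrative call sequence, not part of the original file, for the
 * wiring contract above: the map is locked and the entry has already
 * been marked in-transition by the caller; vm_fault_wire() drops and
 * retakes the map lock internally and unwires any partially wired
 * pages itself on failure.  Error handling is elided.
 */
#if 0
	int rv;

	vm_map_lock(map);
	/* entry looked up and marked in-transition by the caller */
	rv = vm_fault_wire(map, entry, FALSE, 0);
	vm_map_unlock(map);
#endif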
984263bc | 2363 | /* |
f2d22ebf MD |
2364 | * Unwire a range of virtual addresses in a map. The map should be |
2365 | * locked. | |
984263bc MD |
2366 | */ |
2367 | void | |
f2d22ebf | 2368 | vm_fault_unwire(vm_map_t map, vm_map_entry_t entry) |
984263bc | 2369 | { |
f2d22ebf MD |
2370 | boolean_t fictitious; |
2371 | vm_offset_t start; | |
2372 | vm_offset_t end; | |
6ef943a3 | 2373 | vm_offset_t va; |
5f910b2f | 2374 | pmap_t pmap; |
76f1911e | 2375 | vm_page_t m; |
984263bc MD |
2376 | |
2377 | pmap = vm_map_pmap(map); | |
67e7cb85 MD |
2378 | start = entry->ba.start; |
2379 | end = entry->ba.end; | |
9de48ead MD |
2380 | fictitious = entry->ba.object && |
2381 | ((entry->ba.object->type == OBJT_DEVICE) || | |
2382 | (entry->ba.object->type == OBJT_MGTDEVICE)); | |
e40cfbd7 MD |
2383 | if (entry->eflags & MAP_ENTRY_KSTACK) |
2384 | start += PAGE_SIZE; | |
984263bc MD |
2385 | |
2386 | /* | |
2387 | * Since the pages are wired down, we must be able to get their | |
2388 | * mappings from the physical map system. | |
2389 | */ | |
984263bc | 2390 | for (va = start; va < end; va += PAGE_SIZE) { |
76f1911e MD |
2391 | m = pmap_unwire(pmap, va); |
2392 | if (m && !fictitious) { | |
2393 | vm_page_busy_wait(m, FALSE, "vmwrpg"); | |
2394 | vm_page_unwire(m, 1); | |
2395 | vm_page_wakeup(m); | |
984263bc MD |
2396 | } |
2397 | } | |
984263bc MD |
2398 | } |
2399 | ||
1c024bc6 MD |
2400 | /* |
2401 | * Simulate write faults to bring all data into the head object, return | |
2402 | * KERN_SUCCESS on success (which should always be the case unless the | |
2403 | * system runs out of memory). | |
2404 | * | |
2405 | * The caller will handle destroying the backing_ba's. | |
2406 | */ | |
2407 | int | |
2408 | vm_fault_collapse(vm_map_t map, vm_map_entry_t entry) | |
2409 | { | |
2410 | struct faultstate fs; | |
2411 | vm_ooffset_t scan; | |
2412 | vm_pindex_t pindex; | |
2413 | vm_object_t object; | |
2414 | int rv; | |
2415 | int all_shadowed; | |
2416 | ||
2417 | bzero(&fs, sizeof(fs)); | |
2418 | object = entry->ba.object; | |
2419 | ||
2420 | fs.first_prot = entry->max_protection | /* optional VM_PROT_EXECUTE */ | |
2421 | VM_PROT_READ | VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE; | |
2422 | fs.fault_flags = VM_FAULT_NORMAL; | |
2423 | fs.map = map; | |
2424 | fs.entry = entry; | |
2425 | fs.lookup_still_valid = -1; /* leave map atomically locked */ | |
2426 | fs.first_ba = &entry->ba; | |
2427 | fs.first_ba_held = -1; /* leave object held */ | |
2428 | ||
2429 | /* fs.hardfault */ | |
2430 | ||
2431 | vm_object_hold(object); | |
2432 | rv = KERN_SUCCESS; | |
2433 | ||
67e7cb85 | 2434 | scan = entry->ba.start; |
1c024bc6 MD |
2435 | all_shadowed = 1; |
2436 | ||
67e7cb85 MD |
2437 | while (scan < entry->ba.end) { |
2438 | pindex = OFF_TO_IDX(entry->ba.offset + (scan - entry->ba.start)); | |
1c024bc6 MD |
2439 | |
2440 | if (vm_page_lookup(object, pindex)) { | |
2441 | scan += PAGE_SIZE; | |
2442 | continue; | |
2443 | } | |
2444 | ||
2445 | all_shadowed = 0; | |
2446 | fs.ba = fs.first_ba; | |
2447 | fs.prot = fs.first_prot; | |
2448 | ||
2449 | rv = vm_fault_object(&fs, pindex, fs.first_prot, 1); | |
2450 | if (rv == KERN_TRY_AGAIN) | |
2451 | continue; | |
2452 | if (rv != KERN_SUCCESS) | |
2453 | break; | |
01251219 MD |
2454 | vm_page_flag_set(fs.mary[0], PG_REFERENCED); |
2455 | vm_page_activate(fs.mary[0]); | |
2456 | vm_page_wakeup(fs.mary[0]); | |
1c024bc6 MD |
2457 | scan += PAGE_SIZE; |
2458 | } | |
2459 | KKASSERT(entry->ba.object == object); | |
2460 | vm_object_drop(object); | |
2461 | ||
2462 | /* | |
2463 | * If the fronting object did not have every page we have to clear | |
2464 | * the pmap range due to the pages being changed so we can fault-in | |
2465 | * the proper pages. | |
2466 | */ | |
2467 | if (all_shadowed == 0) | |
67e7cb85 | 2468 | pmap_remove(map->pmap, entry->ba.start, entry->ba.end); |
1c024bc6 MD |
2469 | |
2470 | return rv; | |
2471 | } | |
2472 | ||
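/*
 * Hypothetical caller sequence, not part of the original file: per the
 * comment above, vm_fault_collapse() only populates the head object,
 * so disposing of the now-redundant backing_ba chain remains the
 * caller's job.
 */
#if 0
	if (vm_fault_collapse(map, entry) == KERN_SUCCESS) {
		/* destroy the entry->ba.backing_ba chain here */
	}
#endif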
984263bc | 2473 | /* |
44293a80 MD |
2474 | * Copy all of the pages from one map entry to another. If the source |
2475 | * is wired down we just use vm_page_lookup(). If not we use | |
2476 | * vm_fault_object(). | |
9ad0147b MD |
2477 | * |
2478 | * The source and destination maps must be locked for write. | |
b12defdc | 2479 | * The source and destination maps token must be held |
984263bc | 2480 | * |
9ad0147b | 2481 | * No other requirements. |
921c891e MD |
2482 | * |
2483 | * XXX do segment optimization | |
984263bc | 2484 | */ |
984263bc | 2485 | void |
57e43348 | 2486 | vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, |
9ad0147b | 2487 | vm_map_entry_t dst_entry, vm_map_entry_t src_entry) |
984263bc MD |
2488 | { |
2489 | vm_object_t dst_object; | |
2490 | vm_object_t src_object; | |
2491 | vm_ooffset_t dst_offset; | |
2492 | vm_ooffset_t src_offset; | |
2493 | vm_prot_t prot; | |
2494 | vm_offset_t vaddr; | |
2495 | vm_page_t dst_m; | |
2496 | vm_page_t src_m; | |
2497 | ||
9de48ead MD |
2498 | src_object = src_entry->ba.object; |
2499 | src_offset = src_entry->ba.offset; | |
984263bc MD |
2500 | |
2501 | /* | |
2502 | * Create the top-level object for the destination entry. (Doesn't | |
2503 | * actually shadow anything - we copy the pages directly.) | |
2504 | */ | |
53025830 | 2505 | vm_map_entry_allocate_object(dst_entry); |
9de48ead | 2506 | dst_object = dst_entry->ba.object; |
984263bc MD |
2507 | |
2508 | prot = dst_entry->max_protection; | |
2509 | ||
2510 | /* | |
2511 | * Loop through all of the pages in the entry's range, copying each | |
2512 | * one from the source object (it should be there) to the destination | |
2513 | * object. | |
2514 | */ | |
dcdeb25f CT |
2515 | vm_object_hold(src_object); |
2516 | vm_object_hold(dst_object); | |
641f3b0a | 2517 | |
67e7cb85 MD |
2518 | for (vaddr = dst_entry->ba.start, dst_offset = 0; |
2519 | vaddr < dst_entry->ba.end; | |
b443039b | 2520 | vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { |
984263bc MD |
2521 | |
2522 | /* | |
2523 | * Allocate a page in the destination object | |
2524 | */ | |
2525 | do { | |
2526 | dst_m = vm_page_alloc(dst_object, | |
d2d8515b MD |
2527 | OFF_TO_IDX(dst_offset), |
2528 | VM_ALLOC_NORMAL); | |
984263bc | 2529 | if (dst_m == NULL) { |
4ecf7cc9 | 2530 | vm_wait(0); |
984263bc MD |
2531 | } |
2532 | } while (dst_m == NULL); | |
2533 | ||
2534 | /* | |
2535 | * Find the page in the source object, and copy it in. | |
2536 | * (Because the source is wired down, the page will be in | |
2537 | * memory.) | |
2538 | */ | |
2539 | src_m = vm_page_lookup(src_object, | |
080c00e6 | 2540 | OFF_TO_IDX(dst_offset + src_offset)); |
984263bc MD |
2541 | if (src_m == NULL) |
2542 | panic("vm_fault_copy_wired: page missing"); | |
2543 | ||
2544 | vm_page_copy(src_m, dst_m); | |
2545 | ||
2546 | /* | |
2547 | * Enter it in the pmap... | |
2548 | */ | |
921c891e | 2549 | pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE, dst_entry); |
984263bc MD |
2550 | |
2551 | /* | |
2552 | * Mark it no longer busy, and put it on the active list. | |
2553 | */ | |
2554 | vm_page_activate(dst_m); | |
2555 | vm_page_wakeup(dst_m); | |
2556 | } | |
dcdeb25f CT |
2557 | vm_object_drop(dst_object); |
2558 | vm_object_drop(src_object); | |
984263bc MD |
2559 | } |
2560 | ||
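/*
 * Worked example, not part of the original file, of the index
 * arithmetic in the copy loop above: the destination page at
 * OFF_TO_IDX(dst_offset) is filled from the source page at
 * OFF_TO_IDX(dst_offset + src_offset).  With a hypothetical source
 * offset of 2 pages, destination page 0 copies from source page 2,
 * destination page 1 from source page 3, and so on.
 */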
1b9d3514 | 2561 | #if 0 |
984263bc MD |
2562 | |
2563 | /* | |
2564 | * This routine checks around the requested page for other pages that | |
2565 | * might be able to be faulted in. It brackets the range of viable | |
2566 | * pages to be paged in. | |
2567 | * | |
2568 | * Inputs: | |
2569 | * m, rbehind, rahead | |
2570 | * | |
2571 | * Outputs: | |
2572 | * marray (array of vm_page_t), reqpage (index of requested page) | |
2573 | * | |
2574 | * Return value: | |
2575 | * number of pages in marray | |
2576 | */ | |
2577 | static int | |
57e43348 | 2578 | vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead, |
bc823b32 | 2579 | vm_page_t *marray, int *reqpage) |
984263bc MD |
2580 | { |
2581 | int i,j; | |
2582 | vm_object_t object; | |
2583 | vm_pindex_t pindex, startpindex, endpindex, tpindex; | |
2584 | vm_page_t rtm; | |
2585 | int cbehind, cahead; | |
2586 | ||
2587 | object = m->object; | |
2588 | pindex = m->pindex; | |
2589 | ||
2590 | /* | |
2591 | * we don't fault-ahead for device pager | |
2592 | */ | |
f2c2051e JH |
2593 | if ((object->type == OBJT_DEVICE) || |
2594 | (object->type == OBJT_MGTDEVICE)) { | |
984263bc MD |
2595 | *reqpage = 0; |
2596 | marray[0] = m; | |
2597 | return 1; | |
2598 | } | |
2599 | ||
2600 | /* | |
2601 | * if the requested page is not available, then give up now | |
2602 | */ | |
984263bc | 2603 | if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { |
17cde63e | 2604 | *reqpage = 0; /* not used by caller, fix compiler warn */ |
984263bc MD |
2605 | return 0; |
2606 | } | |
2607 | ||
2608 | if ((cbehind == 0) && (cahead == 0)) { | |
2609 | *reqpage = 0; | |
2610 | marray[0] = m; | |
2611 | return 1; | |
2612 | } | |
2613 | ||
2614 | if (rahead > cahead) { | |
2615 | rahead = cahead; | |
2616 | } | |
2617 | ||
2618 | if (rbehind > cbehind) { | |
2619 | rbehind = cbehind; | |
2620 | } | |
2621 | ||
2622 | /* | |
bc823b32 MD |
2623 | * Do not do any readahead if we have insufficient free memory. |
2624 | * | |
2625 | * XXX the code was broken and disabled before, and has instability | |
2626 | * with this conditional fixed, so shortcut for now. | |
984263bc | 2627 | */ |
bc823b32 | 2628 | if (burst_fault == 0 || vm_page_count_severe()) { |
984263bc MD |
2629 | marray[0] = m; |
2630 | *reqpage = 0; | |
2631 | return 1; | |
2632 | } | |
2633 | ||
2634 | /* | |
2635 | * scan backward for the read behind pages -- in memory | |
06ecca5a MD |
2636 | * |
2637 | * Assume that if the page is not found an interrupt will not | |
2638 | * create it. Theoretically interrupts can only remove (busy) | |
2639 | * pages, not create new associations. | |
984263bc MD |
2640 | */ |
2641 | if (pindex > 0) { | |
2642 | if (rbehind > pindex) { | |
2643 | rbehind = pindex; | |
2644 | startpindex = 0; | |
2645 | } else { | |
2646 | startpindex = pindex - rbehind; | |
2647 | } | |
2648 | ||
b12defdc | 2649 | vm_object_hold(object); |
bc823b32 MD |
2650 | for (tpindex = pindex; tpindex > startpindex; --tpindex) { |
2651 | if (vm_page_lookup(object, tpindex - 1)) | |
984263bc MD |
2652 | break; |
2653 | } | |
2654 | ||
bc823b32 MD |
2655 | i = 0; |
2656 | while (tpindex < pindex) { | |
d2d8515b MD |
2657 | rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM | |
2658 | VM_ALLOC_NULL_OK); | |
984263bc MD |
2659 | if (rtm == NULL) { |
2660 | for (j = 0; j < i; j++) { | |
2661 | vm_page_free(marray[j]); | |
2662 | } | |
b12defdc | 2663 | vm_object_drop(object); |
984263bc MD |
2664 | marray[0] = m; |
2665 | *reqpage = 0; | |
2666 | return 1; | |
2667 | } | |
984263bc | 2668 | marray[i] = rtm; |
bc823b32 MD |
2669 | ++i; |
2670 | ++tpindex; | |
984263bc | 2671 | } |
b12defdc | 2672 | vm_object_drop(object); |
984263bc | 2673 | } else { |
984263bc MD |
2674 | i = 0; |
2675 | } | |
2676 | ||
bc823b32 MD |
2677 | /* |
2678 | * Assign requested page | |
2679 | */ | |
984263bc | 2680 | marray[i] = m; |
984263bc | 2681 | *reqpage = i; |
bc823b32 | 2682 | ++i; |
984263bc MD |
2683 | |
2684 | /* | |
bc823b32 | 2685 | * Scan forwards for read-ahead pages |
984263bc | 2686 | */ |
bc823b32 | 2687 | tpindex = pindex + 1; |
984263bc MD |
2688 | endpindex = tpindex + rahead; |
2689 | if (endpindex > object->size) | |
2690 | endpindex = object->size; | |
2691 | ||
b12defdc | 2692 | vm_object_hold(object); |
bc823b32 MD |
2693 | while (tpindex < endpindex) { |
2694 | if (vm_page_lookup(object, tpindex)) | |
984263bc | 2695 | break; |
d2d8515b MD |
2696 | rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM | |
2697 | VM_ALLOC_NULL_OK); | |
bc823b32 | 2698 | if (rtm == NULL) |
984263bc | 2699 | break; |
984263bc | 2700 | marray[i] = rtm; |
bc823b32 MD |
2701 | ++i; |
2702 | ++tpindex; | |
984263bc | 2703 | } |
b12defdc | 2704 | vm_object_drop(object); |
984263bc | 2705 | |
bc823b32 | 2706 | return (i); |
984263bc | 2707 | } |
1b9d3514 MD |
2708 | |
2709 | #endif | |
2710 | ||
2711 | /* | |
2712 | * vm_prefault() provides a quick way of clustering pagefaults into a | |
2713 | * process's address space. It is a "cousin" of pmap_object_init_pt, | |
2714 | * except it runs at page fault time instead of mmap time. | |
2715 | * | |
85946b6c MD |
2716 | * vm.fast_fault Enables pre-faulting zero-fill pages |
2717 | * | |
2718 | * vm.prefault_pages Number of pages (1/2 negative, 1/2 positive) to | |
2719 | * prefault. Scan stops in either direction when | |
2720 | * a page is found to already exist. | |
2721 | * | |
1b9d3514 MD |
2722 | * This code used to be per-platform pmap_prefault(). It is now |
2723 | * machine-independent and enhanced to also pre-fault zero-fill pages | |
2724 | * (see vm.fast_fault) as well as make them writable, which greatly | |
2725 | * reduces the number of page faults programs incur. | |
2726 | * | |
2727 | * Application performance when pre-faulting zero-fill pages is heavily | |
2728 | * dependent on the application. Very tiny applications like /bin/echo | |
2729 | * lose a little performance while applications of any appreciable size | |
2730 | * gain performance. Prefaulting multiple pages also reduces SMP | |
2731 | * congestion and can improve SMP performance significantly. | |
2732 | * | |
2733 | * NOTE! prot may allow writing but this only applies to the top level | |
2734 | * object. If we wind up mapping a page extracted from a backing | |
2735 | * object we have to make sure it is read-only. | |
2736 | * | |
2737 | * NOTE! The caller has already handled any COW operations on the | |
2738 | * vm_map_entry via the normal fault code. Do NOT call this | |
2739 | * shortcut unless the normal fault code has run on this entry. | |
9ad0147b | 2740 | * |
d2d8515b | 2741 | * The related map must be locked. |
9ad0147b | 2742 | * No other requirements. |
1b9d3514 | 2743 | */ |
aac423a7 | 2744 | __read_mostly static int vm_prefault_pages = 8; |
85946b6c MD |
2745 | SYSCTL_INT(_vm, OID_AUTO, prefault_pages, CTLFLAG_RW, &vm_prefault_pages, 0, |
2746 | "Maximum number of pages to pre-fault"); | |
aac423a7 | 2747 | __read_mostly static int vm_fast_fault = 1; |
85946b6c MD |
2748 | SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0, |
2749 | "Burst fault zero-fill regions"); | |
1b9d3514 | 2750 | |
2421aac7 MD |
2751 | /* |
2752 | * Set PG_NOSYNC if the map entry indicates so, but only if the page | |
2753 | * is not already dirty by other means. This will prevent passive | |
2754 | * filesystem syncing as well as 'sync' from writing out the page. | |
2755 | */ | |
2756 | static void | |
2757 | vm_set_nosync(vm_page_t m, vm_map_entry_t entry) | |
2758 | { | |
2759 | if (entry->eflags & MAP_ENTRY_NOSYNC) { | |
2760 | if (m->dirty == 0) | |
2761 | vm_page_flag_set(m, PG_NOSYNC); | |
2762 | } else { | |
2763 | vm_page_flag_clear(m, PG_NOSYNC); | |
2764 | } | |
2765 | } | |
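/*
 * (Illustrative note, not exhaustive: MAP_ENTRY_NOSYNC is normally set
 *  on a map entry via madvise(MADV_NOSYNC); see the vm_map code for the
 *  authoritative semantics.)
 */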
2766 | ||
1b9d3514 | 2767 | static void |
54341a3b MD |
2768 | vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot, |
2769 | int fault_flags) | |
1b9d3514 | 2770 | { |
44293a80 | 2771 | vm_map_backing_t ba; /* first ba */ |
1b9d3514 MD |
2772 | struct lwp *lp; |
2773 | vm_page_t m; | |
1b9d3514 MD |
2774 | vm_offset_t addr; |
2775 | vm_pindex_t index; | |
2776 | vm_pindex_t pindex; | |
2777 | vm_object_t object; | |
2778 | int pprot; | |
2779 | int i; | |
85946b6c MD |
2780 | int noneg; |
2781 | int nopos; | |
2782 | int maxpages; | |
2783 | ||
2784 | /* | |
2785 | * Get stable max count value, disabled if set to 0 | |
2786 | */ | |
2787 | maxpages = vm_prefault_pages; | |
2788 | cpu_ccfence(); | |
2789 | if (maxpages <= 0) | |
2790 | return; | |
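	/*
	 * (cpu_ccfence() is a compiler fence; it keeps the compiler from
	 *  re-reading vm_prefault_pages after the test above, so maxpages
	 *  remains stable for the rest of this function.)
	 */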
1b9d3514 MD |
2791 | |
2792 | /* | |
2793 | * We do not currently prefault mappings that use virtual page | |
2794 | * tables. We do not prefault foreign pmaps. | |
2795 | */ | |
0adbcbd6 | 2796 | if (entry->maptype != VM_MAPTYPE_NORMAL) |
1b9d3514 MD |
2797 | return; |
2798 | lp = curthread->td_lwp; | |
2799 | if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) | |
2800 | return; | |
2801 | ||
85946b6c MD |
2802 | /* |
2803 | * Limit pre-fault count to 1024 pages. | |
2804 | */ | |
2805 | if (maxpages > 1024) | |
2806 | maxpages = 1024; | |
1b9d3514 | 2807 | |
9de48ead MD |
2808 | ba = &entry->ba; |
2809 | object = entry->ba.object; | |
b12defdc | 2810 | KKASSERT(object != NULL); |
ceb0e493 MD |
2811 | |
2812 | /* | |
2813 | * NOTE: VM_FAULT_DIRTY allowed later so must hold object exclusively | |
2814 | * now (or do something more complex XXX). | |
2815 | */ | |
54341a3b | 2816 | vm_object_hold(object); |
b12defdc | 2817 | |
85946b6c MD |
2818 | noneg = 0; |
2819 | nopos = 0; | |
2820 | for (i = 0; i < maxpages; ++i) { | |
1b9d3514 | 2821 | vm_object_t lobject; |
a31129d8 | 2822 | vm_object_t nobject; |
44293a80 MD |
2823 | vm_map_backing_t last_ba; /* last ba */ |
2824 | vm_map_backing_t next_ba; /* next ba */
3bb7eedb | 2825 | int allocated = 0; |
b12defdc | 2826 | int error; |
1b9d3514 | 2827 | |
d2d8515b MD |
2828 | /* |
2829 | * This can eat a lot of time on a heavily contended | |
2830 | * machine so yield on the tick if needed. | |
2831 | */ | |
2832 | if ((i & 7) == 7) | |
2833 | lwkt_yield(); | |
2834 | ||
85946b6c MD |
2835 | /* |
2836 | * Calculate the page to pre-fault, stopping the scan in | |
2837 | * each direction separately if the limit is reached. | |
2838 | */ | |
2839 | if (i & 1) { | |
2840 | if (noneg) | |
2841 | continue; | |
2842 | addr = addra - ((i + 1) >> 1) * PAGE_SIZE; | |
2843 | } else { | |
2844 | if (nopos) | |
2845 | continue; | |
2846 | addr = addra + ((i + 2) >> 1) * PAGE_SIZE; | |
2847 | } | |
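		/*
		 * Resulting scan order around addra (illustrative):
		 *
		 *	i:     0   1   2   3   4   5  ...
		 *	page: +1  -1  +2  -2  +3  -3  ... (relative to addra)
		 */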
67e7cb85 | 2848 | if (addr < entry->ba.start) { |
85946b6c MD |
2849 | noneg = 1; |
2850 | if (noneg && nopos) | |
2851 | break; | |
2852 | continue; | |
2853 | } | |
67e7cb85 | 2854 | if (addr >= entry->ba.end) { |
85946b6c MD |
2855 | nopos = 1; |
2856 | if (noneg && nopos) | |
2857 | break; | |
1b9d3514 | 2858 | continue; |
85946b6c | 2859 | } |
1b9d3514 | 2860 | |
85946b6c MD |
2861 | /* |
2862 | * Skip pages already mapped, and stop scanning in that | |
2863 | * direction. When the scan terminates in both directions | |
2864 | * we are done. | |
2865 | */ | |
2866 | if (pmap_prefault_ok(pmap, addr) == 0) { | |
2867 | if (i & 1) | |
2868 | noneg = 1; | |
2869 | else | |
2870 | nopos = 1; | |
2871 | if (noneg && nopos) | |
2872 | break; | |
1b9d3514 | 2873 | continue; |
85946b6c | 2874 | } |
1b9d3514 MD |
2875 | |
2876 | /* | |
9de48ead | 2877 | * Follow the backing layers to obtain the page to be mapped |
1b9d3514 MD |
2878 | * into the pmap. |
2879 | * | |
2880 | * If we reach the terminal object without finding a page | |
2881 | * and we determine it would be advantageous, then allocate | |
2882 | * a zero-fill page for the base object. The base object | |
2883 | * is guaranteed to be OBJT_DEFAULT for this case. | |
3bb7eedb MD |
2884 | * |
2885 | * To avoid having to check the pager via *haspage*()
2886 | * we stop if any non-default object is encountered, e.g.
2887 | * a vnode or swap object would stop the loop. | |
1b9d3514 | 2888 | */ |
67e7cb85 | 2889 | index = ((addr - entry->ba.start) + entry->ba.offset) >> |
9de48ead MD |
2890 | PAGE_SHIFT; |
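		/*
		 * e.g. (illustrative): with addr one page above ba.start
		 * and ba.offset equal to 2 * PAGE_SIZE, index resolves to
		 * (PAGE_SIZE + 2 * PAGE_SIZE) >> PAGE_SHIFT == 3.
		 */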
2891 | last_ba = ba; | |
1b9d3514 MD |
2892 | lobject = object; |
2893 | pindex = index; | |
2894 | pprot = prot; | |
2895 | ||
b12defdc | 2896 | /*vm_object_hold(lobject); implied */ |
a31129d8 | 2897 | |
b12defdc MD |
2898 | while ((m = vm_page_lookup_busy_try(lobject, pindex, |
2899 | TRUE, &error)) == NULL) { | |
1b9d3514 MD |
2900 | if (lobject->type != OBJT_DEFAULT) |
2901 | break; | |
9de48ead | 2902 | if ((next_ba = last_ba->backing_ba) == NULL) { |
1b9d3514 MD |
2903 | if (vm_fast_fault == 0) |
2904 | break; | |
85946b6c | 2905 | if ((prot & VM_PROT_WRITE) == 0 || |
e91e64c7 | 2906 | vm_paging_min()) { |
1b9d3514 MD |
2907 | break; |
2908 | } | |
a31129d8 | 2909 | |
b12defdc MD |
2910 | /* |
2911 | * NOTE: Allocated from base object | |
2912 | */ | |
1b9d3514 | 2913 | m = vm_page_alloc(object, index, |
d2d8515b MD |
2914 | VM_ALLOC_NORMAL | |
2915 | VM_ALLOC_ZERO | | |
54341a3b | 2916 | VM_ALLOC_USE_GD | |
d2d8515b MD |
2917 | VM_ALLOC_NULL_OK); |
2918 | if (m == NULL) | |
2919 | break; | |
3bb7eedb | 2920 | allocated = 1; |
1b9d3514 MD |
2921 | pprot = prot; |
2922 | /* lobject = object .. not needed */ | |
2923 | break; | |
2924 | } | |
9de48ead | 2925 | if (next_ba->offset & PAGE_MASK) |
1b9d3514 | 2926 | break; |
9de48ead | 2927 | nobject = next_ba->object; |
b12defdc | 2928 | vm_object_hold(nobject); |
67e7cb85 | 2929 | pindex -= last_ba->offset >> PAGE_SHIFT; |
9de48ead MD |
2930 | pindex += next_ba->offset >> PAGE_SHIFT; |
2931 | if (last_ba != ba) { | |
b12defdc MD |
2932 | vm_object_lock_swap(); |
2933 | vm_object_drop(lobject); | |
a31129d8 | 2934 | } |
b12defdc | 2935 | lobject = nobject; |
9de48ead | 2936 | last_ba = next_ba; |
1b9d3514 MD |
2937 | pprot &= ~VM_PROT_WRITE; |
2938 | } | |
a31129d8 | 2939 | |
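		/*
		 * (Note on the walk above: each next backing object is
		 *  held before the previous one is dropped, so a hold is
		 *  maintained across the whole chain traversal, with
		 *  vm_object_lock_swap() adjusting the hold order before
		 *  the drop.  Write permission is stripped whenever we
		 *  descend to a backing object, matching the NOTE in the
		 *  function header.)
		 */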
1b9d3514 | 2940 | /* |
b12defdc MD |
2941 | * NOTE: A non-NULL (m) will be associated with lobject if |
2942 | * it was found there, otherwise it is a freshly allocated
2943 | * zero-fill page associated with the base object.
1b9d3514 | 2944 | * |
b12defdc | 2945 | * Give up if no page is available.
1b9d3514 | 2946 | */ |
b12defdc | 2947 | if (m == NULL) { |
9de48ead | 2948 | if (last_ba != ba) |
b12defdc | 2949 | vm_object_drop(lobject); |
1b9d3514 | 2950 | break; |
b12defdc | 2951 | } |
1b9d3514 | 2952 | |
54341a3b MD |
2953 | /* |
2954 | * The object must be marked dirty if we are mapping a | |
5ebb17ad MD |
2955 | * writable page. Note that (m) does not have to be |
2956 | * entered into the object, so use lobject or object | |
2957 | * as appropriate instead of m->object. | |
2958 | * | |
2959 | * Do this before we potentially drop the object. | |
54341a3b | 2960 | */ |
5ebb17ad MD |
2961 | if (pprot & VM_PROT_WRITE) { |
2962 | vm_object_set_writeable_dirty( | |
2963 | (allocated ? object : lobject)); | |
2964 | } | |
54341a3b | 2965 | |
1b9d3514 MD |
2966 | /* |
2967 | * Do not conditionalize on PG_RAM. If pages are present in | |
2968 | * the VM system we assume optimal caching. If caching is | |
2969 | * not optimal the I/O gravy train will be restarted when we | |
2970 | * hit an unavailable page. We do not want to try to restart | |
2971 | * the gravy train now because we really don't know how much | |
2972 | * of the object has been cached. The cost for restarting | |
2973 | * the gravy train should be low (since accesses will likely | |
2974 | * be I/O bound anyway). | |
1b9d3514 | 2975 | */ |
9de48ead | 2976 | if (last_ba != ba) |
b12defdc | 2977 | vm_object_drop(lobject); |
b12defdc | 2978 | |
1b9d3514 | 2979 | /* |
3bb7eedb MD |
2980 | * Enter the page into the pmap if appropriate. If we had |
2981 | * allocated the page we have to place it on a queue. If not | |
2982 | * we just have to make sure it isn't on the cache queue | |
2983 | * (pages on the cache queue are not allowed to be mapped). | |
5ebb17ad MD |
2984 | * |
2985 | * When allocated is TRUE, m corresponds to object, | |
2986 | * not lobject. | |
1b9d3514 | 2987 | */ |
3bb7eedb | 2988 | if (allocated) { |
54341a3b MD |
2989 | /* |
2990 | * Page must be zeroed.
2991 | */ | |
afd2da4d | 2992 | vm_page_zero_fill(m); |
54341a3b MD |
2993 | mycpu->gd_cnt.v_zfod++; |
2994 | m->valid = VM_PAGE_BITS_ALL; | |
2995 | ||
2996 | /* | |
2997 | * Handle dirty page case | |
2998 | */ | |
2421aac7 MD |
2999 | if (pprot & VM_PROT_WRITE) |
3000 | vm_set_nosync(m, entry); | |
921c891e | 3001 | pmap_enter(pmap, addr, m, pprot, 0, entry); |
01251219 MD |
3002 | #if 0 |
3003 | /* REMOVE ME, a burst counts as one fault */ | |
54341a3b MD |
3004 | mycpu->gd_cnt.v_vm_faults++; |
3005 | if (curthread->td_lwp) | |
3006 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
01251219 | 3007 | #endif |
3bb7eedb | 3008 | vm_page_deactivate(m); |
54341a3b | 3009 | if (pprot & VM_PROT_WRITE) { |
5ebb17ad | 3010 | /*vm_object_set_writeable_dirty(object);*/ |
54341a3b MD |
3011 | vm_set_nosync(m, entry); |
3012 | if (fault_flags & VM_FAULT_DIRTY) { | |
3013 | vm_page_dirty(m); | |
3014 | /*XXX*/ | |
3015 | swap_pager_unswapped(m); | |
3016 | } | |
3017 | } | |
3bb7eedb | 3018 | vm_page_wakeup(m); |
b12defdc MD |
3019 | } else if (error) { |
3020 | /* couldn't busy page, no wakeup */ | |
a31129d8 MD |
3021 | } else if ( |
3022 | ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && | |
b12defdc | 3023 | (m->flags & PG_FICTITIOUS) == 0) { |
a31129d8 MD |
3024 | /* |
3025 | * A fully valid page not undergoing soft I/O can | |
3026 | * be immediately entered into the pmap. | |
5ebb17ad MD |
3027 | * |
3028 | * When allocated is false, m corresponds to lobject. | |
a31129d8 | 3029 | */ |
b12defdc | 3030 | if ((m->queue - m->pc) == PQ_CACHE) |
1b9d3514 | 3031 | vm_page_deactivate(m); |
54341a3b | 3032 | if (pprot & VM_PROT_WRITE) { |
5ebb17ad | 3033 | /*vm_object_set_writeable_dirty(lobject);*/ |
54341a3b MD |
3034 | vm_set_nosync(m, entry); |
3035 | if (fault_flags & VM_FAULT_DIRTY) { | |
3036 | vm_page_dirty(m); | |
3037 | /*XXX*/ | |
3038 | swap_pager_unswapped(m); | |
3039 | } | |
3040 | } | |
2421aac7 MD |
3041 | if (pprot & VM_PROT_WRITE) |
3042 | vm_set_nosync(m, entry); | |
921c891e | 3043 | pmap_enter(pmap, addr, m, pprot, 0, entry); |
01251219 MD |
3044 | #if 0 |
3045 | /* REMOVE ME, a burst counts as one fault */ | |
54341a3b MD |
3046 | mycpu->gd_cnt.v_vm_faults++; |
3047 | if (curthread->td_lwp) | |
3048 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
01251219 | 3049 | #endif |
1b9d3514 | 3050 | vm_page_wakeup(m); |
b12defdc MD |
3051 | } else { |
3052 | vm_page_wakeup(m); | |
1b9d3514 MD |
3053 | } |
3054 | } | |
a31129d8 | 3055 | vm_object_drop(object); |
1b9d3514 | 3056 | } |
54341a3b | 3057 | |
501747bf MD |
3058 | /* |
3059 | * Object can be held shared | |
3060 | */ | |
54341a3b MD |
3061 | static void |
3062 | vm_prefault_quick(pmap_t pmap, vm_offset_t addra, | |
3063 | vm_map_entry_t entry, int prot, int fault_flags) | |
3064 | { | |
3065 | struct lwp *lp; | |
3066 | vm_page_t m; | |
3067 | vm_offset_t addr; | |
3068 | vm_pindex_t pindex; | |
3069 | vm_object_t object; | |
3070 | int i; | |
3071 | int noneg; | |
3072 | int nopos; | |
3073 | int maxpages; | |
3074 | ||
3075 | /* | |
3076 | * Get stable max count value, disabled if set to 0 | |
3077 | */ | |
3078 | maxpages = vm_prefault_pages; | |
3079 | cpu_ccfence(); | |
3080 | if (maxpages <= 0) | |
3081 | return; | |
3082 | ||
3083 | /* | |
3084 | * We do not currently prefault mappings that use virtual page | |
3085 | * tables. We do not prefault foreign pmaps. | |
3086 | */ | |
0adbcbd6 | 3087 | if (entry->maptype != VM_MAPTYPE_NORMAL) |
54341a3b MD |
3088 | return; |
3089 | lp = curthread->td_lwp; | |
3090 | if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) | |
3091 | return; | |
9de48ead MD |
3092 | object = entry->ba.object; |
3093 | if (entry->ba.backing_ba != NULL) | |
501747bf MD |
3094 | return; |
3095 | ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); | |
54341a3b MD |
3096 | |
3097 | /* | |
3098 | * Limit pre-fault count to 1024 pages. | |
3099 | */ | |
3100 | if (maxpages > 1024) | |
3101 | maxpages = 1024; | |
3102 | ||
54341a3b MD |
3103 | noneg = 0; |
3104 | nopos = 0; | |
3105 | for (i = 0; i < maxpages; ++i) { | |
3106 | int error; | |
3107 | ||
3108 | /* | |
3109 | * Calculate the page to pre-fault, stopping the scan in | |
3110 | * each direction separately if the limit is reached. | |
3111 | */ | |
3112 | if (i & 1) { | |
3113 | if (noneg) | |
3114 | continue; | |
3115 | addr = addra - ((i + 1) >> 1) * PAGE_SIZE; | |
3116 | } else { | |
3117 | if (nopos) | |
3118 | continue; | |
3119 | addr = addra + ((i + 2) >> 1) * PAGE_SIZE; | |
3120 | } | |
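		/*
		 * (Same alternating +1, -1, +2, -2, ... page scan around
		 *  addra as in vm_prefault() above.)
		 */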
67e7cb85 | 3121 | if (addr < entry->ba.start) { |
54341a3b MD |
3122 | noneg = 1; |
3123 | if (noneg && nopos) | |
3124 | break; | |
3125 | continue; | |
3126 | } | |
67e7cb85 | 3127 | if (addr >= entry->ba.end) { |
54341a3b MD |
3128 | nopos = 1; |
3129 | if (noneg && nopos) | |
3130 | break; | |
3131 | continue; | |
3132 | } | |
3133 | ||
cfffe7b1 MD |
3134 | /* |
3135 | * Follow the VM object chain to obtain the page to be mapped | |
3136 | * into the pmap. This version of the prefault code only | |
3137 | * works with terminal objects. | |
3138 | * | |
3139 | * The page must already exist. If we encounter a problem | |
3140 | * we stop here. | |
3141 | * | |
3142 | * WARNING! We cannot call swap_pager_unswapped() or insert | |
3143 | * a new vm_page with a shared token. | |
3144 | */ | |
67e7cb85 | 3145 | pindex = ((addr - entry->ba.start) + entry->ba.offset) >> |
9de48ead | 3146 | PAGE_SHIFT; |
cfffe7b1 | 3147 | |
54341a3b MD |
3148 | /* |
3149 | * Skip pages already mapped, and stop scanning in that | |
3150 | * direction. When the scan terminates in both directions | |
3151 | * we are done. | |
3152 | */ | |
3153 | if (pmap_prefault_ok(pmap, addr) == 0) { | |
3154 | if (i & 1) | |
3155 | noneg = 1; | |
3156 | else | |
3157 | nopos = 1; | |
3158 | if (noneg && nopos) | |
3159 | break; | |
3160 | continue; | |
3161 | } | |
3162 | ||
bc0aa189 MD |
3163 | /* |
3164 | * Shortcut the read-only mapping case using the far more | |
3165 | * efficient vm_page_lookup_sbusy_try() function. This | |
3166 | * allows us to acquire the page soft-busied only, which
3167 | * is especially nice for concurrent execs of the same | |
3168 | * program. | |
3169 | * | |
3170 | * The lookup function also validates page suitability | |
3171 | * (all valid bits set, and not fictitious). | |
bb1339f8 MD |
3172 | * |
3173 | * If the page is in PQ_CACHE we have to fall-through | |
3174 | * and hard-busy it so we can move it out of PQ_CACHE. | |
bc0aa189 | 3175 | */ |
7a45978d | 3176 | if ((prot & VM_PROT_WRITE) == 0) { |
eae4df88 MD |
3177 | m = vm_page_lookup_sbusy_try(object, pindex, |
3178 | 0, PAGE_SIZE); | |
bc0aa189 MD |
3179 | if (m == NULL) |
3180 | break; | |
bb1339f8 MD |
3181 | if ((m->queue - m->pc) != PQ_CACHE) { |
3182 | pmap_enter(pmap, addr, m, prot, 0, entry); | |
01251219 MD |
3183 | #if 0 |
3184 | /* REMOVE ME, a burst counts as one fault */ | |
bb1339f8 MD |
3185 | mycpu->gd_cnt.v_vm_faults++; |
3186 | if (curthread->td_lwp) | |
3187 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
01251219 | 3188 | #endif |
bb1339f8 MD |
3189 | vm_page_sbusy_drop(m); |
3190 | continue; | |
3191 | } | |
bc0aa189 | 3192 | vm_page_sbusy_drop(m); |
bc0aa189 MD |
3193 | } |
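		/*
		 * (PQ_CACHE fall-through: the soft-busy was dropped above;
		 *  the page is looked up again hard-busied below so it can
		 *  be moved off the cache queue before being entered into
		 *  the pmap.)
		 */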
3194 | ||
3195 | /* | |
3196 | * Fall back to the normal vm_page lookup code. This code
3197 | * hard-busies the page. Not only that, but the page
3198 | * can remain in that state for a significant period of
3199 | * time due to pmap_enter()'s overhead.
3200 | */ | |
3201 | m = vm_page_lookup_busy_try(object, pindex, TRUE, &error); | |
3202 | if (m == NULL || error) | |
3203 | break; | |
3204 | ||
54341a3b | 3205 | /* |
cfffe7b1 MD |
3206 | * Stop if the page cannot be trivially entered into the |
3207 | * pmap. | |
54341a3b | 3208 | */ |
cfffe7b1 MD |
3209 | if (((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) || |
3210 | (m->flags & PG_FICTITIOUS) || | |
3211 | ((m->flags & PG_SWAPPED) && | |
3212 | (prot & VM_PROT_WRITE) && | |
3213 | (fault_flags & VM_FAULT_DIRTY))) { | |
3214 | vm_page_wakeup(m); | |
3215 | break; | |
3216 | } | |
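		/*
		 * (The three rejects above, in order: a partially-valid
		 *  page would require pager I/O; PG_FICTITIOUS pages are
		 *  typically unmanaged device pages; and dirtying a
		 *  PG_SWAPPED page would require swap_pager_unswapped(),
		 *  which cannot be called with only a shared object token
		 *  per the WARNING earlier in this function.)
		 */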
54341a3b | 3217 | |
cfffe7b1 MD |
3218 | /* |
3219 | * Enter the page into the pmap. The object might be held | |
3220 | * shared, so we can't do any (serious) modifying operation
3221 | * on it. | |
3222 | */ | |
3223 | if ((m->queue - m->pc) == PQ_CACHE) | |
3224 | vm_page_deactivate(m); | |
3225 | if (prot & VM_PROT_WRITE) { | |
3226 | vm_object_set_writeable_dirty(m->object); | |
3227 | vm_set_nosync(m, entry); | |
3228 | if (fault_flags & VM_FAULT_DIRTY) { | |
3229 | vm_page_dirty(m); | |
3230 | /* can't happen due to conditional above */
3231 | /* swap_pager_unswapped(m); */ | |
54341a3b | 3232 | } |
54341a3b | 3233 | } |
cfffe7b1 | 3234 | pmap_enter(pmap, addr, m, prot, 0, entry); |
01251219 MD |
3235 | #if 0 |
3236 | /* REMOVE ME, a burst counts as one fault */ | |
cfffe7b1 MD |
3237 | mycpu->gd_cnt.v_vm_faults++; |
3238 | if (curthread->td_lwp) | |
3239 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
01251219 | 3240 | #endif |
54341a3b MD |
3241 | vm_page_wakeup(m); |
3242 | } | |
3243 | } |