| Commit | Line | Data |
|---|---|---|
| 984263bc MD |
1 | /* |
| 2 | * Copyright (c) 1991, 1993 | |
| 3 | * The Regents of the University of California. All rights reserved. | |
| 4 | * Copyright (c) 1994 John S. Dyson | |
| 5 | * All rights reserved. | |
| 6 | * Copyright (c) 1994 David Greenman | |
| 7 | * All rights reserved. | |
| 8 | * | |
| 9 | * | |
| 10 | * This code is derived from software contributed to Berkeley by | |
| 11 | * The Mach Operating System project at Carnegie-Mellon University. | |
| 12 | * | |
| 13 | * Redistribution and use in source and binary forms, with or without | |
| 14 | * modification, are permitted provided that the following conditions | |
| 15 | * are met: | |
| 16 | * 1. Redistributions of source code must retain the above copyright | |
| 17 | * notice, this list of conditions and the following disclaimer. | |
| 18 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 19 | * notice, this list of conditions and the following disclaimer in the | |
| 20 | * documentation and/or other materials provided with the distribution. | |
| 21 | * 3. All advertising materials mentioning features or use of this software | |
| 22 | * must display the following acknowledgement: | |
| 23 | * This product includes software developed by the University of | |
| 24 | * California, Berkeley and its contributors. | |
| 25 | * 4. Neither the name of the University nor the names of its contributors | |
| 26 | * may be used to endorse or promote products derived from this software | |
| 27 | * without specific prior written permission. | |
| 28 | * | |
| 29 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 30 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 31 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 32 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 33 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 34 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 35 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 36 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 37 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 38 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 39 | * SUCH DAMAGE. | |
| 40 | * | |
| 41 | * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 | |
| 42 | * | |
| 43 | * | |
| 44 | * Copyright (c) 1987, 1990 Carnegie-Mellon University. | |
| 45 | * All rights reserved. | |
| 46 | * | |
| 47 | * Authors: Avadis Tevanian, Jr., Michael Wayne Young | |
| 48 | * | |
| 49 | * Permission to use, copy, modify and distribute this software and | |
| 50 | * its documentation is hereby granted, provided that both the copyright | |
| 51 | * notice and this permission notice appear in all copies of the | |
| 52 | * software, derivative works or modified versions, and any portions | |
| 53 | * thereof, and that both notices appear in supporting documentation. | |
| 54 | * | |
| 55 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" | |
| 56 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND | |
| 57 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. | |
| 58 | * | |
| 59 | * Carnegie Mellon requests users of this software to return to | |
| 60 | * | |
| 61 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU | |
| 62 | * School of Computer Science | |
| 63 | * Carnegie Mellon University | |
| 64 | * Pittsburgh PA 15213-3890 | |
| 65 | * | |
| 66 | * any improvements or extensions that they make and grant Carnegie the | |
| 67 | * rights to redistribute these changes. | |
| 68 | * | |
| 69 | * $FreeBSD: src/sys/vm/vm_fault.c,v 1.108.2.8 2002/02/26 05:49:27 silby Exp $ | |
| 4ecf7cc9 | 70 | * $DragonFly: src/sys/vm/vm_fault.c,v 1.47 2008/07/01 02:02:56 dillon Exp $ |
| 984263bc MD |
71 | */ |
| 72 | ||
| 73 | /* | |
| 74 | * Page fault handling module. | |
| 75 | */ | |
| 76 | ||
| 77 | #include <sys/param.h> | |
| 78 | #include <sys/systm.h> | |
| 46311ac2 | 79 | #include <sys/kernel.h> |
| 984263bc MD |
80 | #include <sys/proc.h> |
| 81 | #include <sys/vnode.h> | |
| 82 | #include <sys/resourcevar.h> | |
| 83 | #include <sys/vmmeter.h> | |
| 75f59a66 | 84 | #include <sys/vkernel.h> |
| 75f59a66 | 85 | #include <sys/lock.h> |
| bc823b32 | 86 | #include <sys/sysctl.h> |
| 984263bc | 87 | |
| 5c5185ae SG |
88 | #include <cpu/lwbuf.h> |
| 89 | ||
| 984263bc MD |
90 | #include <vm/vm.h> |
| 91 | #include <vm/vm_param.h> | |
| 984263bc MD |
92 | #include <vm/pmap.h> |
| 93 | #include <vm/vm_map.h> | |
| 94 | #include <vm/vm_object.h> | |
| 95 | #include <vm/vm_page.h> | |
| 96 | #include <vm/vm_pageout.h> | |
| 97 | #include <vm/vm_kern.h> | |
| 98 | #include <vm/vm_pager.h> | |
| 99 | #include <vm/vnode_pager.h> | |
| 100 | #include <vm/vm_extern.h> | |
| 654a39f0 MD |
101 | |
| 102 | #include <sys/thread2.h> | |
| 12e4aaff | 103 | #include <vm/vm_page2.h> |
| 984263bc | 104 | |
| 984263bc MD |
/*
 * Working state for a single fault.  One instance lives on the stack of
 * each vm_fault*() entry point and is passed by pointer to the helpers
 * in this file.
 */
struct faultstate {
	vm_page_t m;			/* busied page left by a successful vm_fault_object() */
	vm_object_t object;		/* object currently being worked on */
	vm_pindex_t pindex;
	vm_prot_t prot;			/* set by vm_fault_object() for the pmap operation */
	vm_page_t first_m;		/* freed during cleanup when object != first_object */
	vm_object_t first_object;	/* top-level object; referenced and PIP-counted */
	vm_prot_t first_prot;		/* protection reported by vm_map_lookup() */
	vm_map_t map;			/* map being faulted; NULL for object-only faults */
	vm_map_entry_t entry;
	int lookup_still_valid;		/* TRUE until unlock_map() ends the lookup */
	int didlimit;
	int hardfault;			/* non-zero: account as ru_majflt, else ru_minflt */
	int fault_flags;		/* VM_FAULT_* flags supplied by the caller */
	int map_generation;		/* fs.map->timestamp saved to detect races */
	boolean_t wired;
	struct vnode *vp;		/* result of vnode_pager_lock(first_object) */
};
| 123 | ||
| 1b9d3514 MD |
/*
 * Read/write integer tunables registered under the _vm sysctl node as
 * fast_fault and debug_cluster.  Neither variable is referenced in the
 * portion of the file reviewed here, and the sysctl description strings
 * are empty.
 */
static int vm_fast_fault = 1;
SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0, "");
static int debug_cluster = 0;
SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
| bc823b32 | 128 | |
| 72579d2e | 129 | static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t); |
| 4e7c41c5 | 130 | static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int); |
| 1b9d3514 | 131 | #if 0 |
| 568e6804 | 132 | static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *); |
| 1b9d3514 | 133 | #endif |
| 72579d2e | 134 | static int vm_fault_ratelimit(struct vmspace *); |
| 1b9d3514 MD |
135 | static void vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, |
| 136 | int prot); | |
| 568e6804 | 137 | |
| 984263bc MD |
138 | static __inline void |
| 139 | release_page(struct faultstate *fs) | |
| 140 | { | |
| 984263bc | 141 | vm_page_deactivate(fs->m); |
| 17cde63e | 142 | vm_page_wakeup(fs->m); |
| 984263bc MD |
143 | fs->m = NULL; |
| 144 | } | |
| 145 | ||
| 146 | static __inline void | |
| 147 | unlock_map(struct faultstate *fs) | |
| 148 | { | |
| aa542ad5 | 149 | if (fs->lookup_still_valid && fs->map) { |
| a108bf71 | 150 | vm_map_lookup_done(fs->map, fs->entry, 0); |
| 984263bc MD |
151 | fs->lookup_still_valid = FALSE; |
| 152 | } | |
| 153 | } | |
| 154 | ||
| 75f59a66 MD |
155 | /* |
| 156 | * Clean up after a successful call to vm_fault_object() so another call | |
| 157 | * to vm_fault_object() can be made. | |
| 158 | */ | |
| 984263bc | 159 | static void |
| 75f59a66 | 160 | _cleanup_successful_fault(struct faultstate *fs, int relock) |
| 984263bc | 161 | { |
| 984263bc MD |
162 | if (fs->object != fs->first_object) { |
| 163 | vm_page_free(fs->first_m); | |
| 75f59a66 | 164 | vm_object_pip_wakeup(fs->object); |
| 984263bc MD |
165 | fs->first_m = NULL; |
| 166 | } | |
| 75f59a66 MD |
167 | fs->object = fs->first_object; |
| 168 | if (relock && fs->lookup_still_valid == FALSE) { | |
| aa542ad5 MD |
169 | if (fs->map) |
| 170 | vm_map_lock_read(fs->map); | |
| 75f59a66 MD |
171 | fs->lookup_still_valid = TRUE; |
| 172 | } | |
| 173 | } | |
| 174 | ||
| 175 | static void | |
| 176 | _unlock_things(struct faultstate *fs, int dealloc) | |
| 177 | { | |
| 178 | vm_object_pip_wakeup(fs->first_object); | |
| 179 | _cleanup_successful_fault(fs, 0); | |
| 984263bc MD |
180 | if (dealloc) { |
| 181 | vm_object_deallocate(fs->first_object); | |
| bc823b32 | 182 | fs->first_object = NULL; |
| 984263bc MD |
183 | } |
| 184 | unlock_map(fs); | |
| 185 | if (fs->vp != NULL) { | |
| 186 | vput(fs->vp); | |
| 187 | fs->vp = NULL; | |
| 188 | } | |
| 189 | } | |
| 190 | ||
/*
 * Convenience wrappers around the helpers above:
 *
 *	unlock_things()		   - _unlock_things(), keeping the reference
 *				     on first_object.
 *	unlock_and_deallocate()	   - _unlock_things(), also dropping the
 *				     reference on first_object.
 *	cleanup_successful_fault() - _cleanup_successful_fault() with
 *				     relocking requested.
 */
#define unlock_things(fs) _unlock_things(fs, 0)
#define unlock_and_deallocate(fs) _unlock_things(fs, 1)
#define cleanup_successful_fault(fs) _cleanup_successful_fault(fs, 1)
| 984263bc MD |
194 | |
/*
 * TRYPAGER
 *
 *	Determine if the pager for the current object *might* contain the page.
 *
 *	The pager is tried only when the object is not a default object
 *	(default objects are zero-fill and have no real pager) AND either the
 *	fault is not a wiring fault or the FS entry is wired.
 *
 *	(fs) is a pointer to a struct faultstate.  The argument is fully
 *	parenthesized so that any pointer expression, e.g. TRYPAGER(&fs) or
 *	TRYPAGER(p + 1), expands correctly.
 */
#define TRYPAGER(fs)	\
	((fs)->object->type != OBJT_DEFAULT &&				\
	 ((((fs)->fault_flags & VM_FAULT_WIRE_MASK) == 0) || (fs)->wired))
| 984263bc MD |
207 | |
/*
 * vm_fault:
 *
 *	Handle a page fault occurring at the given address, requiring the given
 *	permissions, in the map specified.  If successful, the page is inserted
 *	into the associated physical map.
 *
 *	map		- map in which the fault occurred
 *	vaddr		- faulting virtual address
 *	fault_type	- VM_PROT_* access that caused the fault
 *	fault_flags	- VM_FAULT_* flags (wiring mode, VM_FAULT_BURST)
 *
 *	NOTE: The given address should be truncated to the proper page address.
 *
 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
 *	a standard error specifying why the fault is fatal is returned.
 *
 *	The map in question must be referenced, and remains so.
 *	The caller may hold no locks.
 */
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
{
	int result;
	vm_pindex_t first_pindex;
	struct faultstate fs;
	int growstack;

	mycpu->gd_cnt.v_vm_faults++;

	fs.didlimit = 0;
	fs.hardfault = 0;
	fs.fault_flags = fault_flags;
	growstack = 1;		/* allow one stack-growth attempt */

RetryFault:
	/*
	 * Find the vm_map_entry representing the backing store and resolve
	 * the top level object and page index.  This may have the side
	 * effect of executing a copy-on-write on the map entry and/or
	 * creating a shadow object, but will not COW any actual VM pages.
	 *
	 * On success fs.map is left read-locked and various other fields
	 * are initialized but not otherwise referenced or locked.
	 *
	 * NOTE!  vm_map_lookup will try to upgrade the fault_type to
	 * VM_FAULT_WRITE if the map entry is a virtual page table and also
	 * writable, so we can set the 'A'accessed bit in the virtual page
	 * table entry.
	 */
	fs.map = map;
	result = vm_map_lookup(&fs.map, vaddr, fault_type,
			       &fs.entry, &fs.first_object,
			       &first_pindex, &fs.first_prot, &fs.wired);

	/*
	 * If the lookup failed or the map protections are incompatible,
	 * the fault generally fails.  However, if the caller is trying
	 * to do a user wiring we have more work to do.
	 */
	if (result != KERN_SUCCESS) {
		if (result != KERN_PROTECTION_FAILURE ||
		    (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
		{
			/*
			 * An invalid address in a non-kernel map gets a
			 * single attempt at growing the stack before the
			 * lookup is retried.
			 */
			if (result == KERN_INVALID_ADDRESS && growstack &&
			    map != &kernel_map && curproc != NULL) {
				result = vm_map_growstack(curproc, vaddr);
				if (result != KERN_SUCCESS)
					return (KERN_FAILURE);
				growstack = 0;
				goto RetryFault;
			}
			return (result);
		}

		/*
		 * If we are user-wiring a r/w segment, and it is COW, then
		 * we need to do the COW operation.  Note that we don't
		 * currently COW RO sections now, because it is NOT desirable
		 * to COW .text.  We simply keep .text from ever being COW'ed
		 * and take the heat that one cannot debug wired .text sections.
		 */
		result = vm_map_lookup(&fs.map, vaddr,
				       VM_PROT_READ|VM_PROT_WRITE|
				        VM_PROT_OVERRIDE_WRITE,
				       &fs.entry, &fs.first_object,
				       &first_pindex, &fs.first_prot,
				       &fs.wired);
		if (result != KERN_SUCCESS)
			return result;

		/*
		 * If we don't COW now, on a user wire, the user will never
		 * be able to write to the mapping.  If we don't make this
		 * restriction, the bookkeeping would be nearly impossible.
		 */
		if ((fs.entry->protection & VM_PROT_WRITE) == 0)
			fs.entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 * fs.map is read-locked
	 *
	 * Misc checks.  Save the map generation number to detect races.
	 */
	fs.map_generation = fs.map->timestamp;

	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
		panic("vm_fault: fault on nofault entry, addr: %lx",
		      (u_long)vaddr);
	}

	/*
	 * A system map entry may return a NULL object.  No object means
	 * no pager means an unrecoverable kernel fault.
	 */
	if (fs.first_object == NULL) {
		panic("vm_fault: unrecoverable fault at %p in entry %p",
		      (void *)vaddr, fs.entry);
	}

	/*
	 * Make a reference to this object to prevent its disposal while we
	 * are messing with it.  Once we have the reference, the map is free
	 * to be diddled.  Since objects reference their shadows (and copies),
	 * they will stay around as well.
	 *
	 * Bump the paging-in-progress count to prevent size changes (e.g.
	 * truncation operations) during I/O.  This must be done after
	 * obtaining the vnode lock in order to avoid possible deadlocks.
	 */
	vm_object_reference(fs.first_object);
	fs.vp = vnode_pager_lock(fs.first_object);
	vm_object_pip_add(fs.first_object, 1);

	fs.lookup_still_valid = TRUE;
	fs.first_m = NULL;
	fs.object = fs.first_object;	/* so unlock_and_deallocate works */

	/*
	 * If the entry is wired we cannot change the page protection.
	 */
	if (fs.wired)
		fault_type = fs.first_prot;

	/*
	 * The page we want is at (first_object, first_pindex), but if the
	 * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
	 * page table to figure out the actual pindex.
	 *
	 * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
	 * ONLY
	 */
	if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
		result = vm_fault_vpagetable(&fs, &first_pindex,
					     fs.entry->aux.master_pde,
					     fault_type);
		if (result == KERN_TRY_AGAIN)
			goto RetryFault;
		if (result != KERN_SUCCESS)
			return (result);
	}

	/*
	 * Now we have the actual (object, pindex), fault in the page.  If
	 * vm_fault_object() fails it will unlock and deallocate the FS
	 * data.   If it succeeds everything remains locked and fs->object
	 * will have an additional PIP count if it is not equal to
	 * fs->first_object
	 *
	 * vm_fault_object will set fs->prot for the pmap operation.  It is
	 * allowed to set VM_PROT_WRITE if fault_type == VM_PROT_READ if the
	 * page can be safely written.  However, it will force a read-only
	 * mapping for a read fault if the memory is managed by a virtual
	 * page table.
	 */
	result = vm_fault_object(&fs, first_pindex, fault_type);

	if (result == KERN_TRY_AGAIN)
		goto RetryFault;
	if (result != KERN_SUCCESS)
		return (result);

	/*
	 * On success vm_fault_object() does not unlock or deallocate, and fs.m
	 * will contain a busied page.
	 *
	 * Enter the page into the pmap and do pmap-related adjustments.
	 */
	pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);

	/*
	 * Burst in a few more pages if possible.  The fs.map should still
	 * be locked.  Prefaulting is skipped for wiring faults and for
	 * wired entries.
	 */
	if (fault_flags & VM_FAULT_BURST) {
		if ((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 &&
		    fs.wired == 0) {
			vm_prefault(fs.map->pmap, vaddr, fs.entry, fs.prot);
		}
	}
	unlock_things(&fs);

	vm_page_flag_clear(fs.m, PG_ZERO);
	vm_page_flag_set(fs.m, PG_REFERENCED);

	/*
	 * If the page is not wired down, then put it where the pageout daemon
	 * can find it.
	 */
	if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
		if (fs.wired)
			vm_page_wire(fs.m);
		else
			vm_page_unwire(fs.m, 1);
	} else {
		vm_page_activate(fs.m);
	}

	/*
	 * Account the fault against the current lwp, if there is one.
	 */
	if (curthread->td_lwp) {
		if (fs.hardfault) {
			curthread->td_lwp->lwp_ru.ru_majflt++;
		} else {
			curthread->td_lwp->lwp_ru.ru_minflt++;
		}
	}

	/*
	 * Unlock everything, and return
	 */
	vm_page_wakeup(fs.m);
	vm_object_deallocate(fs.first_object);

	return (KERN_SUCCESS);
}
| 438 | ||
| 439 | /* | |
| 5a0e2a66 MD |
440 | * Fault in the specified virtual address in the current process map, |
| 441 | * returning a held VM page or NULL. See vm_fault_page() for more | |
| 442 | * information. | |
| 443 | */ | |
| 444 | vm_page_t | |
| 445 | vm_fault_page_quick(vm_offset_t va, vm_prot_t fault_type, int *errorp) | |
| 446 | { | |
| 287ebb09 | 447 | struct lwp *lp = curthread->td_lwp; |
| 5a0e2a66 MD |
448 | vm_page_t m; |
| 449 | ||
| 287ebb09 | 450 | m = vm_fault_page(&lp->lwp_vmspace->vm_map, va, |
| 5a0e2a66 MD |
451 | fault_type, VM_FAULT_NORMAL, errorp); |
| 452 | return(m); | |
| 453 | } | |
| 454 | ||
| 455 | /* | |
| 456 | * Fault in the specified virtual address in the specified map, doing all | |
| 4e158347 MD |
457 | * necessary manipulation of the object store and all necessary I/O. Return |
| 458 | * a held VM page or NULL, and set *errorp. The related pmap is not | |
| 459 | * updated. | |
| 460 | * | |
| 5a0e2a66 MD |
461 | * The returned page will be properly dirtied if VM_PROT_WRITE was specified, |
| 462 | * and marked PG_REFERENCED as well. | |
| 17cde63e MD |
463 | * |
| 464 | * If the page cannot be faulted writable and VM_PROT_WRITE was specified, an | |
| 465 | * error will be returned. | |
| 4e158347 MD |
466 | */ |
| 467 | vm_page_t | |
| 468 | vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, | |
| 469 | int fault_flags, int *errorp) | |
| 470 | { | |
| 4e158347 MD |
471 | vm_pindex_t first_pindex; |
| 472 | struct faultstate fs; | |
| 17cde63e MD |
473 | int result; |
| 474 | vm_prot_t orig_fault_type = fault_type; | |
| 4e158347 MD |
475 | |
| 476 | mycpu->gd_cnt.v_vm_faults++; | |
| 477 | ||
| 478 | fs.didlimit = 0; | |
| 479 | fs.hardfault = 0; | |
| 480 | fs.fault_flags = fault_flags; | |
| 481 | KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0); | |
| 482 | ||
| 483 | RetryFault: | |
| 484 | /* | |
| 485 | * Find the vm_map_entry representing the backing store and resolve | |
| 486 | * the top level object and page index. This may have the side | |
| 487 | * effect of executing a copy-on-write on the map entry and/or | |
| 488 | * creating a shadow object, but will not COW any actual VM pages. | |
| 489 | * | |
| 490 | * On success fs.map is left read-locked and various other fields | |
| 491 | * are initialized but not otherwise referenced or locked. | |
| 492 | * | |
| 493 | * NOTE! vm_map_lookup will upgrade the fault_type to VM_FAULT_WRITE | |
| 494 | * if the map entry is a virtual page table and also writable, | |
| 495 | * so we can set the 'A'accessed bit in the virtual page table entry. | |
| 496 | */ | |
| 497 | fs.map = map; | |
| 498 | result = vm_map_lookup(&fs.map, vaddr, fault_type, | |
| 499 | &fs.entry, &fs.first_object, | |
| 500 | &first_pindex, &fs.first_prot, &fs.wired); | |
| 501 | ||
| 502 | if (result != KERN_SUCCESS) { | |
| 503 | *errorp = result; | |
| 504 | return (NULL); | |
| 505 | } | |
| 506 | ||
| 507 | /* | |
| 508 | * fs.map is read-locked | |
| 509 | * | |
| 510 | * Misc checks. Save the map generation number to detect races. | |
| 511 | */ | |
| 512 | fs.map_generation = fs.map->timestamp; | |
| 513 | ||
| 514 | if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { | |
| 515 | panic("vm_fault: fault on nofault entry, addr: %lx", | |
| 516 | (u_long)vaddr); | |
| 517 | } | |
| 518 | ||
| 519 | /* | |
| 520 | * A system map entry may return a NULL object. No object means | |
| 521 | * no pager means an unrecoverable kernel fault. | |
| 522 | */ | |
| 523 | if (fs.first_object == NULL) { | |
| 524 | panic("vm_fault: unrecoverable fault at %p in entry %p", | |
| 525 | (void *)vaddr, fs.entry); | |
| 526 | } | |
| 527 | ||
| 528 | /* | |
| 529 | * Make a reference to this object to prevent its disposal while we | |
| 530 | * are messing with it. Once we have the reference, the map is free | |
| 531 | * to be diddled. Since objects reference their shadows (and copies), | |
| 532 | * they will stay around as well. | |
| 533 | * | |
| 534 | * Bump the paging-in-progress count to prevent size changes (e.g. | |
| 535 | * truncation operations) during I/O. This must be done after | |
| 536 | * obtaining the vnode lock in order to avoid possible deadlocks. | |
| 537 | */ | |
| 538 | vm_object_reference(fs.first_object); | |
| 539 | fs.vp = vnode_pager_lock(fs.first_object); | |
| 540 | vm_object_pip_add(fs.first_object, 1); | |
| 541 | ||
| 542 | fs.lookup_still_valid = TRUE; | |
| 543 | fs.first_m = NULL; | |
| 544 | fs.object = fs.first_object; /* so unlock_and_deallocate works */ | |
| 545 | ||
| 546 | /* | |
| 547 | * If the entry is wired we cannot change the page protection. | |
| 548 | */ | |
| 549 | if (fs.wired) | |
| 550 | fault_type = fs.first_prot; | |
| 551 | ||
| 552 | /* | |
| 553 | * The page we want is at (first_object, first_pindex), but if the | |
| 554 | * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the | |
| 555 | * page table to figure out the actual pindex. | |
| 556 | * | |
| 557 | * NOTE! DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION | |
| 558 | * ONLY | |
| 559 | */ | |
| 560 | if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) { | |
| 561 | result = vm_fault_vpagetable(&fs, &first_pindex, | |
| 4e7c41c5 MD |
562 | fs.entry->aux.master_pde, |
| 563 | fault_type); | |
| 4e158347 MD |
564 | if (result == KERN_TRY_AGAIN) |
| 565 | goto RetryFault; | |
| 566 | if (result != KERN_SUCCESS) { | |
| 567 | *errorp = result; | |
| 568 | return (NULL); | |
| 569 | } | |
| 570 | } | |
| 571 | ||
| 572 | /* | |
| 573 | * Now we have the actual (object, pindex), fault in the page. If | |
| 574 | * vm_fault_object() fails it will unlock and deallocate the FS | |
| 575 | * data. If it succeeds everything remains locked and fs->object | |
| 576 | * will have an additinal PIP count if it is not equal to | |
| 577 | * fs->first_object | |
| 578 | */ | |
| 579 | result = vm_fault_object(&fs, first_pindex, fault_type); | |
| 580 | ||
| 581 | if (result == KERN_TRY_AGAIN) | |
| 582 | goto RetryFault; | |
| 583 | if (result != KERN_SUCCESS) { | |
| 584 | *errorp = result; | |
| 585 | return(NULL); | |
| 586 | } | |
| 587 | ||
| 17cde63e MD |
588 | if ((orig_fault_type & VM_PROT_WRITE) && |
| 589 | (fs.prot & VM_PROT_WRITE) == 0) { | |
| 590 | *errorp = KERN_PROTECTION_FAILURE; | |
| 591 | unlock_and_deallocate(&fs); | |
| 592 | return(NULL); | |
| 593 | } | |
| 594 | ||
| 4e158347 MD |
595 | /* |
| 596 | * On success vm_fault_object() does not unlock or deallocate, and fs.m | |
| 597 | * will contain a busied page. | |
| 598 | */ | |
| 599 | unlock_things(&fs); | |
| 600 | ||
| 601 | /* | |
| 602 | * Return a held page. We are not doing any pmap manipulation so do | |
| 5a0e2a66 MD |
603 | * not set PG_MAPPED. However, adjust the page flags according to |
| 604 | * the fault type because the caller may not use a managed pmapping | |
| 605 | * (so we don't want to lose the fact that the page will be dirtied | |
| 606 | * if a write fault was specified). | |
| 4e158347 | 607 | */ |
| 5a0e2a66 | 608 | vm_page_hold(fs.m); |
| 4e158347 | 609 | vm_page_flag_clear(fs.m, PG_ZERO); |
| 5a0e2a66 MD |
610 | if (fault_type & VM_PROT_WRITE) |
| 611 | vm_page_dirty(fs.m); | |
| 4e158347 MD |
612 | |
| 613 | /* | |
| 40d6ef3a MD |
614 | * Update the pmap. We really only have to do this if a COW |
| 615 | * occured to replace the read-only page with the new page. For | |
| 616 | * now just do it unconditionally. XXX | |
| 617 | */ | |
| 618 | pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired); | |
| 17cde63e | 619 | vm_page_flag_set(fs.m, PG_REFERENCED); |
| 40d6ef3a MD |
620 | |
| 621 | /* | |
| 4e158347 MD |
622 | * Unbusy the page by activating it. It remains held and will not |
| 623 | * be reclaimed. | |
| 624 | */ | |
| 625 | vm_page_activate(fs.m); | |
| 626 | ||
| 627 | if (curthread->td_lwp) { | |
| 628 | if (fs.hardfault) { | |
| 629 | curthread->td_lwp->lwp_ru.ru_majflt++; | |
| 630 | } else { | |
| 631 | curthread->td_lwp->lwp_ru.ru_minflt++; | |
| 632 | } | |
| 633 | } | |
| 634 | ||
| 635 | /* | |
| 636 | * Unlock everything, and return the held page. | |
| 637 | */ | |
| 638 | vm_page_wakeup(fs.m); | |
| 639 | vm_object_deallocate(fs.first_object); | |
| 640 | ||
| 641 | *errorp = 0; | |
| 642 | return(fs.m); | |
| 643 | } | |
| 644 | ||
| 645 | /* | |
| 17cde63e MD |
646 | * Fault in the specified (object,offset), dirty the returned page as |
| 647 | * needed. If the requested fault_type cannot be done NULL and an | |
| 648 | * error is returned. | |
| aa542ad5 MD |
649 | */ |
| 650 | vm_page_t | |
| 651 | vm_fault_object_page(vm_object_t object, vm_ooffset_t offset, | |
| 652 | vm_prot_t fault_type, int fault_flags, int *errorp) | |
| 653 | { | |
| 654 | int result; | |
| 655 | vm_pindex_t first_pindex; | |
| 656 | struct faultstate fs; | |
| 657 | struct vm_map_entry entry; | |
| 658 | ||
| 659 | bzero(&entry, sizeof(entry)); | |
| 660 | entry.object.vm_object = object; | |
| 661 | entry.maptype = VM_MAPTYPE_NORMAL; | |
| 662 | entry.protection = entry.max_protection = fault_type; | |
| 663 | ||
| 664 | fs.didlimit = 0; | |
| 665 | fs.hardfault = 0; | |
| 666 | fs.fault_flags = fault_flags; | |
| 667 | fs.map = NULL; | |
| 668 | KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0); | |
| 669 | ||
| 670 | RetryFault: | |
| 671 | ||
| 672 | fs.first_object = object; | |
| 673 | first_pindex = OFF_TO_IDX(offset); | |
| 674 | fs.entry = &entry; | |
| 675 | fs.first_prot = fault_type; | |
| 676 | fs.wired = 0; | |
| 677 | /*fs.map_generation = 0; unused */ | |
| 678 | ||
| 679 | /* | |
| 680 | * Make a reference to this object to prevent its disposal while we | |
| 681 | * are messing with it. Once we have the reference, the map is free | |
| 682 | * to be diddled. Since objects reference their shadows (and copies), | |
| 683 | * they will stay around as well. | |
| 684 | * | |
| 685 | * Bump the paging-in-progress count to prevent size changes (e.g. | |
| 686 | * truncation operations) during I/O. This must be done after | |
| 687 | * obtaining the vnode lock in order to avoid possible deadlocks. | |
| 688 | */ | |
| 689 | vm_object_reference(fs.first_object); | |
| 690 | fs.vp = vnode_pager_lock(fs.first_object); | |
| 691 | vm_object_pip_add(fs.first_object, 1); | |
| 692 | ||
| 693 | fs.lookup_still_valid = TRUE; | |
| 694 | fs.first_m = NULL; | |
| 695 | fs.object = fs.first_object; /* so unlock_and_deallocate works */ | |
| 696 | ||
| 697 | #if 0 | |
| 698 | /* XXX future - ability to operate on VM object using vpagetable */ | |
| 699 | if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) { | |
| 700 | result = vm_fault_vpagetable(&fs, &first_pindex, | |
| 701 | fs.entry->aux.master_pde, | |
| 702 | fault_type); | |
| 703 | if (result == KERN_TRY_AGAIN) | |
| 704 | goto RetryFault; | |
| 705 | if (result != KERN_SUCCESS) { | |
| 706 | *errorp = result; | |
| 707 | return (NULL); | |
| 708 | } | |
| 709 | } | |
| 710 | #endif | |
| 711 | ||
| 712 | /* | |
| 713 | * Now we have the actual (object, pindex), fault in the page. If | |
| 714 | * vm_fault_object() fails it will unlock and deallocate the FS | |
| 715 | * data. If it succeeds everything remains locked and fs->object | |
| 716 | * will have an additinal PIP count if it is not equal to | |
| 717 | * fs->first_object | |
| 718 | */ | |
| 719 | result = vm_fault_object(&fs, first_pindex, fault_type); | |
| 720 | ||
| 721 | if (result == KERN_TRY_AGAIN) | |
| 722 | goto RetryFault; | |
| 723 | if (result != KERN_SUCCESS) { | |
| 724 | *errorp = result; | |
| 725 | return(NULL); | |
| 726 | } | |
| 727 | ||
| 17cde63e MD |
728 | if ((fault_type & VM_PROT_WRITE) && (fs.prot & VM_PROT_WRITE) == 0) { |
| 729 | *errorp = KERN_PROTECTION_FAILURE; | |
| 730 | unlock_and_deallocate(&fs); | |
| 731 | return(NULL); | |
| 732 | } | |
| 733 | ||
| aa542ad5 MD |
734 | /* |
| 735 | * On success vm_fault_object() does not unlock or deallocate, and fs.m | |
| 736 | * will contain a busied page. | |
| 737 | */ | |
| 738 | unlock_things(&fs); | |
| 739 | ||
| 740 | /* | |
| 741 | * Return a held page. We are not doing any pmap manipulation so do | |
| 742 | * not set PG_MAPPED. However, adjust the page flags according to | |
| 743 | * the fault type because the caller may not use a managed pmapping | |
| 744 | * (so we don't want to lose the fact that the page will be dirtied | |
| 745 | * if a write fault was specified). | |
| 746 | */ | |
| 747 | vm_page_hold(fs.m); | |
| 748 | vm_page_flag_clear(fs.m, PG_ZERO); | |
| 749 | if (fault_type & VM_PROT_WRITE) | |
| 750 | vm_page_dirty(fs.m); | |
| 751 | ||
| 752 | /* | |
| 753 | * Indicate that the page was accessed. | |
| 754 | */ | |
| 755 | vm_page_flag_set(fs.m, PG_REFERENCED); | |
| 756 | ||
| 757 | /* | |
| 758 | * Unbusy the page by activating it. It remains held and will not | |
| 759 | * be reclaimed. | |
| 760 | */ | |
| 761 | vm_page_activate(fs.m); | |
| 762 | ||
| 763 | if (curthread->td_lwp) { | |
| 764 | if (fs.hardfault) { | |
| 765 | mycpu->gd_cnt.v_vm_faults++; | |
| 766 | curthread->td_lwp->lwp_ru.ru_majflt++; | |
| 767 | } else { | |
| 768 | curthread->td_lwp->lwp_ru.ru_minflt++; | |
| 769 | } | |
| 770 | } | |
| 771 | ||
| 772 | /* | |
| 773 | * Unlock everything, and return the held page. | |
| 774 | */ | |
| 775 | vm_page_wakeup(fs.m); | |
| 776 | vm_object_deallocate(fs.first_object); | |
| 777 | ||
| 778 | *errorp = 0; | |
| 779 | return(fs.m); | |
| 780 | } | |
| 781 | ||
| 782 | /* | |
| 72579d2e | 783 | * Translate the virtual page number (first_pindex) that is relative |
| afeabdca MD |
784 | * to the address space into a logical page number that is relative to the |
| 785 | * backing object. Use the virtual page table pointed to by (vpte). | |
| 786 | * | |
| 787 | * This implements an N-level page table. Any level can terminate the | |
| 788 | * scan by setting VPTE_PS. A linear mapping is accomplished by setting | |
| 789 | * VPTE_PS in the master page directory entry set via mcontrol(MADV_SETMAP). | |
| 790 | */ | |
| 791 | static | |
| 792 | int | |
| 4e7c41c5 MD |
793 | vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex, |
| 794 | vpte_t vpte, int fault_type) | |
| afeabdca | 795 | { |
| 5c5185ae | 796 | struct lwbuf *lwb; |
| afeabdca | 797 | int vshift = 32 - PAGE_SHIFT; /* page index bits remaining */ |
| 72579d2e | 798 | int result = KERN_SUCCESS; |
| 4e7c41c5 | 799 | vpte_t *ptep; |
| afeabdca MD |
800 | |
| 801 | for (;;) { | |
| 4e7c41c5 MD |
802 | /* |
| 803 | * We cannot proceed if the vpte is not valid, not readable | |
| 804 | * for a read fault, or not writable for a write fault. | |
| 805 | */ | |
| afeabdca MD |
806 | if ((vpte & VPTE_V) == 0) { |
| 807 | unlock_and_deallocate(fs); | |
| 808 | return (KERN_FAILURE); | |
| 809 | } | |
| 4e7c41c5 MD |
810 | if ((fault_type & VM_PROT_READ) && (vpte & VPTE_R) == 0) { |
| 811 | unlock_and_deallocate(fs); | |
| 812 | return (KERN_FAILURE); | |
| 813 | } | |
| 814 | if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_W) == 0) { | |
| 815 | unlock_and_deallocate(fs); | |
| 816 | return (KERN_FAILURE); | |
| 817 | } | |
| afeabdca MD |
818 | if ((vpte & VPTE_PS) || vshift == 0) |
| 819 | break; | |
| 820 | KKASSERT(vshift >= VPTE_PAGE_BITS); | |
| 821 | ||
| 822 | /* | |
| 4e7c41c5 MD |
823 | * Get the page table page. Nominally we only read the page |
| 824 | * table, but since we are actively setting VPTE_M and VPTE_A, | |
| 825 | * tell vm_fault_object() that we are writing it. | |
| 826 | * | |
| 827 | * There is currently no real need to optimize this. | |
| afeabdca | 828 | */ |
| 0035dca9 MD |
829 | result = vm_fault_object(fs, vpte >> PAGE_SHIFT, |
| 830 | VM_PROT_READ|VM_PROT_WRITE); | |
| afeabdca MD |
831 | if (result != KERN_SUCCESS) |
| 832 | return (result); | |
| 833 | ||
| 834 | /* | |
| 835 | * Process the returned fs.m and look up the page table | |
| 836 | * entry in the page table page. | |
| 837 | */ | |
| 838 | vshift -= VPTE_PAGE_BITS; | |
| 5c5185ae SG |
839 | lwb = lwbuf_alloc(fs->m); |
| 840 | ptep = ((vpte_t *)lwbuf_kva(lwb) + | |
| 4e7c41c5 MD |
841 | ((*pindex >> vshift) & VPTE_PAGE_MASK)); |
| 842 | vpte = *ptep; | |
| 843 | ||
| 844 | /* | |
| 845 | * Page table write-back. If the vpte is valid for the | |
| 846 | * requested operation, do a write-back to the page table. | |
| 847 | * | |
| 848 | * XXX VPTE_M is not set properly for page directory pages. | |
| 849 | * It doesn't get set in the page directory if the page table | |
| 850 | * is modified during a read access. | |
| 851 | */ | |
| 852 | if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) && | |
| 853 | (vpte & VPTE_W)) { | |
| 0035dca9 | 854 | if ((vpte & (VPTE_M|VPTE_A)) != (VPTE_M|VPTE_A)) { |
| 4e7c41c5 MD |
855 | atomic_set_int(ptep, VPTE_M|VPTE_A); |
| 856 | vm_page_dirty(fs->m); | |
| 857 | } | |
| 858 | } | |
| 859 | if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V) && | |
| 860 | (vpte & VPTE_R)) { | |
| 861 | if ((vpte & VPTE_A) == 0) { | |
| 862 | atomic_set_int(ptep, VPTE_A); | |
| 863 | vm_page_dirty(fs->m); | |
| 864 | } | |
| 865 | } | |
| 5c5185ae | 866 | lwbuf_free(lwb); |
| afeabdca MD |
867 | vm_page_flag_set(fs->m, PG_REFERENCED); |
| 868 | vm_page_activate(fs->m); | |
| 869 | vm_page_wakeup(fs->m); | |
| 870 | cleanup_successful_fault(fs); | |
| 871 | } | |
| afeabdca MD |
872 | /* |
| 873 | * Combine remaining address bits with the vpte. | |
| 874 | */ | |
| 72579d2e MD |
875 | *pindex = (vpte >> PAGE_SHIFT) + |
| 876 | (*pindex & ((1 << vshift) - 1)); | |
| afeabdca MD |
877 | return (KERN_SUCCESS); |
| 878 | } | |
| 879 | ||
| 880 | ||
| 881 | /* | |
| 72579d2e | 882 | * Do all operations required to fault-in (fs.first_object, pindex). Run |
| 568e6804 MD |
883 | * through the shadow chain as necessary and do required COW or virtual |
| 884 | * copy operations. The caller has already fully resolved the vm_map_entry | |
| 885 | * and, if appropriate, has created a copy-on-write layer. All we need to | |
| 886 | * do is iterate the object chain. | |
| 887 | * | |
| 888 | * On failure (fs) is unlocked and deallocated and the caller may return or | |
| 75f59a66 MD |
889 | * retry depending on the failure code. On success (fs) is NOT unlocked or |
| 890 | * deallocated, fs.m will contained a resolved, busied page, and fs.object | |
| 891 | * will have an additional PIP count if it is not equal to fs.first_object. | |
| 568e6804 MD |
892 | */ |
| 893 | static | |
| 894 | int | |
| 72579d2e MD |
895 | vm_fault_object(struct faultstate *fs, |
| 896 | vm_pindex_t first_pindex, vm_prot_t fault_type) | |
| 568e6804 MD |
897 | { |
| 898 | vm_object_t next_object; | |
| 72579d2e | 899 | vm_pindex_t pindex; |
| 568e6804 | 900 | |
| 72579d2e MD |
901 | fs->prot = fs->first_prot; |
| 902 | fs->object = fs->first_object; | |
| 903 | pindex = first_pindex; | |
| 904 | ||
| 4e7c41c5 MD |
905 | /* |
| 906 | * If a read fault occurs we try to make the page writable if | |
| 907 | * possible. There are three cases where we cannot make the | |
| 908 | * page mapping writable: | |
| 909 | * | |
| 910 | * (1) The mapping is read-only or the VM object is read-only, | |
| 0035dca9 | 911 | * fs->prot above will simply not have VM_PROT_WRITE set. |
| 4e7c41c5 MD |
912 | * |
| 913 | * (2) If the mapping is a virtual page table we need to be able | |
| 70fc5283 MD |
914 | * to detect writes so we can set VPTE_M in the virtual page |
| 915 | * table. | |
| 4e7c41c5 MD |
916 | * |
| 917 | * (3) If the VM page is read-only or copy-on-write, upgrading would | |
| 918 | * just result in an unnecessary COW fault. | |
| 0035dca9 MD |
919 | * |
| 920 | * VM_PROT_VPAGED is set if faulting via a virtual page table and | |
| 921 | * causes adjustments to the 'M'odify bit to also turn off write | |
| 922 | * access to force a re-fault. | |
| 4e7c41c5 | 923 | */ |
| 0035dca9 MD |
924 | if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) { |
| 925 | if ((fault_type & VM_PROT_WRITE) == 0) | |
| 926 | fs->prot &= ~VM_PROT_WRITE; | |
| 4e7c41c5 MD |
927 | } |
| 928 | ||
| 568e6804 MD |
929 | for (;;) { |
| 930 | /* | |
| 931 | * If the object is dead, we stop here | |
| 932 | */ | |
| 933 | if (fs->object->flags & OBJ_DEAD) { | |
| 934 | unlock_and_deallocate(fs); | |
| 984263bc MD |
935 | return (KERN_PROTECTION_FAILURE); |
| 936 | } | |
| 937 | ||
| 938 | /* | |
| 06ecca5a MD |
939 | * See if page is resident. spl protection is required |
| 940 | * to avoid an interrupt unbusy/free race against our | |
| 941 | * lookup. We must hold the protection through a page | |
| 942 | * allocation or busy. | |
| 984263bc | 943 | */ |
| 654a39f0 | 944 | crit_enter(); |
| 72579d2e | 945 | fs->m = vm_page_lookup(fs->object, pindex); |
| 568e6804 | 946 | if (fs->m != NULL) { |
| 06ecca5a | 947 | int queue; |
| 984263bc MD |
948 | /* |
| 949 | * Wait/Retry if the page is busy. We have to do this | |
| 950 | * if the page is busy via either PG_BUSY or | |
| 951 | * vm_page_t->busy because the vm_pager may be using | |
| 952 | * vm_page_t->busy for pageouts ( and even pageins if | |
| 953 | * it is the vnode pager ), and we could end up trying | |
| 954 | * to pagein and pageout the same page simultaneously. | |
| 955 | * | |
| 956 | * We can theoretically allow the busy case on a read | |
| 957 | * fault if the page is marked valid, but since such | |
| 958 | * pages are typically already pmap'd, putting that | |
| 959 | * special case in might be more effort then it is | |
| 960 | * worth. We cannot under any circumstances mess | |
| 961 | * around with a vm_page_t->busy page except, perhaps, | |
| 962 | * to pmap it. | |
| 963 | */ | |
| 568e6804 MD |
964 | if ((fs->m->flags & PG_BUSY) || fs->m->busy) { |
| 965 | unlock_things(fs); | |
| 966 | vm_page_sleep_busy(fs->m, TRUE, "vmpfw"); | |
| 12e4aaff | 967 | mycpu->gd_cnt.v_intrans++; |
| 568e6804 | 968 | vm_object_deallocate(fs->first_object); |
| bc823b32 | 969 | fs->first_object = NULL; |
| 654a39f0 | 970 | crit_exit(); |
| 568e6804 | 971 | return (KERN_TRY_AGAIN); |
| 984263bc MD |
972 | } |
| 973 | ||
| 568e6804 MD |
974 | /* |
| 975 | * If reactivating a page from PQ_CACHE we may have | |
| 976 | * to rate-limit. | |
| 977 | */ | |
| 978 | queue = fs->m->queue; | |
| 979 | vm_page_unqueue_nowakeup(fs->m); | |
| 984263bc | 980 | |
| 568e6804 MD |
981 | if ((queue - fs->m->pc) == PQ_CACHE && |
| 982 | vm_page_count_severe()) { | |
| 983 | vm_page_activate(fs->m); | |
| 984 | unlock_and_deallocate(fs); | |
| 659c6a07 | 985 | vm_waitpfault(); |
| 654a39f0 | 986 | crit_exit(); |
| 568e6804 | 987 | return (KERN_TRY_AGAIN); |
| 984263bc MD |
988 | } |
| 989 | ||
| 990 | /* | |
| 991 | * Mark page busy for other processes, and the | |
| 992 | * pagedaemon. If it still isn't completely valid | |
| cf1bb2a8 MD |
993 | * (readable), or if a read-ahead-mark is set on |
| 994 | * the VM page, jump to readrest, else we found the | |
| 568e6804 | 995 | * page and can return. |
| 06ecca5a MD |
996 | * |
| 997 | * We can release the spl once we have marked the | |
| 998 | * page busy. | |
| 984263bc | 999 | */ |
| 568e6804 | 1000 | vm_page_busy(fs->m); |
| 654a39f0 | 1001 | crit_exit(); |
| 06ecca5a | 1002 | |
| cf1bb2a8 MD |
1003 | if (fs->m->object != &kernel_object) { |
| 1004 | if ((fs->m->valid & VM_PAGE_BITS_ALL) != | |
| 1005 | VM_PAGE_BITS_ALL) { | |
| 1006 | goto readrest; | |
| 1007 | } | |
| 1008 | if (fs->m->flags & PG_RAM) { | |
| 1009 | if (debug_cluster) | |
| 1010 | kprintf("R"); | |
| 1011 | vm_page_flag_clear(fs->m, PG_RAM); | |
| 1012 | goto readrest; | |
| 1013 | } | |
| 984263bc | 1014 | } |
| 568e6804 | 1015 | break; /* break to PAGE HAS BEEN FOUND */ |
| 984263bc MD |
1016 | } |
| 1017 | ||
| 1018 | /* | |
| 1019 | * Page is not resident, If this is the search termination | |
| 1020 | * or the pager might contain the page, allocate a new page. | |
| 06ecca5a | 1021 | * |
| 568e6804 | 1022 | * NOTE: We are still in a critical section. |
| 984263bc | 1023 | */ |
| 568e6804 MD |
1024 | if (TRYPAGER(fs) || fs->object == fs->first_object) { |
| 1025 | /* | |
| 1026 | * If the page is beyond the object size we fail | |
| 1027 | */ | |
| 72579d2e | 1028 | if (pindex >= fs->object->size) { |
| 654a39f0 | 1029 | crit_exit(); |
| 568e6804 | 1030 | unlock_and_deallocate(fs); |
| 984263bc MD |
1031 | return (KERN_PROTECTION_FAILURE); |
| 1032 | } | |
| 1033 | ||
| 1034 | /* | |
| 46311ac2 MD |
1035 | * Ratelimit. |
| 1036 | */ | |
| 568e6804 MD |
1037 | if (fs->didlimit == 0 && curproc != NULL) { |
| 1038 | int limticks; | |
| 1039 | ||
| 1040 | limticks = vm_fault_ratelimit(curproc->p_vmspace); | |
| 46311ac2 MD |
1041 | if (limticks) { |
| 1042 | crit_exit(); | |
| 568e6804 | 1043 | unlock_and_deallocate(fs); |
| 46311ac2 | 1044 | tsleep(curproc, 0, "vmrate", limticks); |
| 568e6804 MD |
1045 | fs->didlimit = 1; |
| 1046 | return (KERN_TRY_AGAIN); | |
| 46311ac2 MD |
1047 | } |
| 1048 | } | |
| 1049 | ||
| 1050 | /* | |
| 984263bc MD |
1051 | * Allocate a new page for this object/offset pair. |
| 1052 | */ | |
| 568e6804 | 1053 | fs->m = NULL; |
| 984263bc | 1054 | if (!vm_page_count_severe()) { |
| 72579d2e | 1055 | fs->m = vm_page_alloc(fs->object, pindex, |
| 568e6804 | 1056 | (fs->vp || fs->object->backing_object) ? VM_ALLOC_NORMAL : VM_ALLOC_NORMAL | VM_ALLOC_ZERO); |
| 984263bc | 1057 | } |
| 568e6804 | 1058 | if (fs->m == NULL) { |
| 654a39f0 | 1059 | crit_exit(); |
| 568e6804 | 1060 | unlock_and_deallocate(fs); |
| 659c6a07 | 1061 | vm_waitpfault(); |
| 568e6804 | 1062 | return (KERN_TRY_AGAIN); |
| 984263bc MD |
1063 | } |
| 1064 | } | |
| 654a39f0 | 1065 | crit_exit(); |
| 984263bc MD |
1066 | |
| 1067 | readrest: | |
| 1068 | /* | |
| 1b9d3514 | 1069 | * We have found an invalid or partially valid page, a |
| 1c9602b3 MD |
1070 | * page with a read-ahead mark which might be partially or |
| 1071 | * fully valid (and maybe dirty too), or we have allocated | |
| 1072 | * a new page. | |
| 984263bc MD |
1073 | * |
| 1074 | * Attempt to fault-in the page if there is a chance that the | |
| 1075 | * pager has it, and potentially fault in additional pages | |
| 1076 | * at the same time. | |
| 06ecca5a MD |
1077 | * |
| 1078 | * We are NOT in splvm here and if TRYPAGER is true then | |
| 1079 | * fs.m will be non-NULL and will be PG_BUSY for us. | |
| 984263bc | 1080 | */ |
| 568e6804 | 1081 | if (TRYPAGER(fs)) { |
| 984263bc | 1082 | int rv; |
| 1b9d3514 | 1083 | int seqaccess; |
| 568e6804 | 1084 | u_char behavior = vm_map_entry_behavior(fs->entry); |
| 984263bc | 1085 | |
| 1b9d3514 MD |
1086 | if (behavior == MAP_ENTRY_BEHAV_RANDOM) |
| 1087 | seqaccess = 0; | |
| 1088 | else | |
| 1089 | seqaccess = -1; | |
| 984263bc | 1090 | |
| 1b9d3514 MD |
1091 | /* |
| 1092 | * If sequential access is detected then attempt | |
| 1093 | * to deactivate/cache pages behind the scan to | |
| 1094 | * prevent resource hogging. | |
| 1095 | * | |
| 1096 | * Use of PG_RAM to detect sequential access | |
| 1097 | * also simulates multi-zone sequential access | |
| 1098 | * detection for free. | |
| 1099 | * | |
| 1100 | * NOTE: Partially valid dirty pages cannot be | |
| 1101 | * deactivated without causing NFS picemeal | |
| 1102 | * writes to barf. | |
| 1103 | */ | |
| 568e6804 | 1104 | if ((fs->first_object->type != OBJT_DEVICE) && |
| 984263bc MD |
1105 | (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL || |
| 1106 | (behavior != MAP_ENTRY_BEHAV_RANDOM && | |
| 1b9d3514 | 1107 | (fs->m->flags & PG_RAM))) |
| 984263bc | 1108 | ) { |
| 1b9d3514 MD |
1109 | vm_pindex_t scan_pindex; |
| 1110 | int scan_count = 16; | |
| 1111 | ||
| 1112 | if (first_pindex < 16) { | |
| 1113 | scan_pindex = 0; | |
| 1114 | scan_count = 0; | |
| 1115 | } else { | |
| 1116 | scan_pindex = first_pindex - 16; | |
| 1117 | if (scan_pindex < 16) | |
| 1118 | scan_count = scan_pindex; | |
| 1119 | else | |
| 1120 | scan_count = 16; | |
| 1121 | } | |
| 984263bc | 1122 | |
| 654a39f0 | 1123 | crit_enter(); |
| 1b9d3514 | 1124 | while (scan_count) { |
| 984263bc | 1125 | vm_page_t mt; |
| 568e6804 | 1126 | |
| 1b9d3514 MD |
1127 | mt = vm_page_lookup(fs->first_object, |
| 1128 | scan_pindex); | |
| 1129 | if (mt == NULL || | |
| 1130 | (mt->valid != VM_PAGE_BITS_ALL)) { | |
| 984263bc | 1131 | break; |
| 1b9d3514 | 1132 | } |
| 984263bc | 1133 | if (mt->busy || |
| 1b9d3514 MD |
1134 | (mt->flags & (PG_BUSY | PG_FICTITIOUS | PG_UNMANAGED)) || |
| 1135 | mt->hold_count || | |
| 1136 | mt->wire_count) { | |
| 1137 | goto skip; | |
| 1138 | } | |
| 984263bc MD |
1139 | if (mt->dirty == 0) |
| 1140 | vm_page_test_dirty(mt); | |
| 1141 | if (mt->dirty) { | |
| 17cde63e | 1142 | vm_page_busy(mt); |
| 1b9d3514 MD |
1143 | vm_page_protect(mt, |
| 1144 | VM_PROT_NONE); | |
| 984263bc | 1145 | vm_page_deactivate(mt); |
| 17cde63e | 1146 | vm_page_wakeup(mt); |
| 984263bc MD |
1147 | } else { |
| 1148 | vm_page_cache(mt); | |
| 1149 | } | |
| 1b9d3514 MD |
1150 | skip: |
| 1151 | --scan_count; | |
| 1152 | --scan_pindex; | |
| 984263bc | 1153 | } |
| 654a39f0 | 1154 | crit_exit(); |
| 984263bc | 1155 | |
| 1b9d3514 | 1156 | seqaccess = 1; |
| 984263bc MD |
1157 | } |
| 1158 | ||
| 1159 | /* | |
| 1b9d3514 MD |
1160 | * Avoid deadlocking against the map when doing I/O. |
| 1161 | * fs.object and the page is PG_BUSY'd. | |
| 984263bc | 1162 | */ |
| 1b9d3514 | 1163 | unlock_map(fs); |
| 984263bc MD |
1164 | |
| 1165 | /* | |
| 1b9d3514 MD |
1166 | * Acquire the page data. We still hold a ref on |
| 1167 | * fs.object and the page has been PG_BUSY's. | |
| 1168 | * | |
| 1169 | * The pager may replace the page (for example, in | |
| 1170 | * order to enter a fictitious page into the | |
| 1171 | * object). If it does so it is responsible for | |
| 1172 | * cleaning up the passed page and properly setting | |
| 1173 | * the new page PG_BUSY. | |
| 1c9602b3 MD |
1174 | * |
| 1175 | * If we got here through a PG_RAM read-ahead | |
| 1176 | * mark the page may be partially dirty and thus | |
| 1177 | * not freeable. Don't bother checking to see | |
| 1178 | * if the pager has the page because we can't free | |
| 1179 | * it anyway. We have to depend on the get_page | |
| 1180 | * operation filling in any gaps whether there is | |
| 1181 | * backing store or not. | |
| 984263bc | 1182 | */ |
| 1c9602b3 | 1183 | rv = vm_pager_get_page(fs->object, &fs->m, seqaccess); |
| 984263bc MD |
1184 | |
| 1185 | if (rv == VM_PAGER_OK) { | |
| 1186 | /* | |
| 984263bc MD |
1187 | * Relookup in case pager changed page. Pager |
| 1188 | * is responsible for disposition of old page | |
| 1189 | * if moved. | |
| 06ecca5a MD |
1190 | * |
| 1191 | * XXX other code segments do relookups too. | |
| 1192 | * It's a bad abstraction that needs to be | |
| 1193 | * fixed/removed. | |
| 984263bc | 1194 | */ |
| 72579d2e | 1195 | fs->m = vm_page_lookup(fs->object, pindex); |
| 568e6804 MD |
1196 | if (fs->m == NULL) { |
| 1197 | unlock_and_deallocate(fs); | |
| 1198 | return (KERN_TRY_AGAIN); | |
| 984263bc MD |
1199 | } |
| 1200 | ||
| 568e6804 | 1201 | ++fs->hardfault; |
| 984263bc MD |
1202 | break; /* break to PAGE HAS BEEN FOUND */ |
| 1203 | } | |
| 568e6804 | 1204 | |
| 984263bc MD |
1205 | /* |
| 1206 | * Remove the bogus page (which does not exist at this | |
| 1207 | * object/offset); before doing so, we must get back | |
| 1208 | * our object lock to preserve our invariant. | |
| 1209 | * | |
| 1210 | * Also wake up any other process that may want to bring | |
| 1211 | * in this page. | |
| 1212 | * | |
| 1213 | * If this is the top-level object, we must leave the | |
| 1214 | * busy page to prevent another process from rushing | |
| 1215 | * past us, and inserting the page in that object at | |
| 1216 | * the same time that we are. | |
| 1217 | */ | |
| a0bc8638 MD |
1218 | if (rv == VM_PAGER_ERROR) { |
| 1219 | if (curproc) | |
| 086c1d7e | 1220 | kprintf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); |
| a0bc8638 | 1221 | else |
| 086c1d7e | 1222 | kprintf("vm_fault: pager read error, thread %p (%s)\n", curthread, curproc->p_comm); |
| a0bc8638 | 1223 | } |
| 1b9d3514 | 1224 | |
| 984263bc MD |
1225 | /* |
| 1226 | * Data outside the range of the pager or an I/O error | |
| a55afca2 MD |
1227 | * |
| 1228 | * The page may have been wired during the pagein, | |
| 1229 | * e.g. by the buffer cache, and cannot simply be | |
| 1b9d3514 | 1230 | * freed. Call vnode_pager_freepage() to deal with it. |
| 984263bc MD |
1231 | */ |
| 1232 | /* | |
| 1233 | * XXX - the check for kernel_map is a kludge to work | |
| 1234 | * around having the machine panic on a kernel space | |
| 1235 | * fault w/ I/O error. | |
| 1236 | */ | |
| 1b9d3514 MD |
1237 | if (((fs->map != &kernel_map) && |
| 1238 | (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { | |
| a55afca2 | 1239 | vnode_pager_freepage(fs->m); |
| 568e6804 MD |
1240 | fs->m = NULL; |
| 1241 | unlock_and_deallocate(fs); | |
| 1242 | if (rv == VM_PAGER_ERROR) | |
| 1243 | return (KERN_FAILURE); | |
| 1244 | else | |
| 1245 | return (KERN_PROTECTION_FAILURE); | |
| 1246 | /* NOT REACHED */ | |
| 984263bc | 1247 | } |
| 568e6804 | 1248 | if (fs->object != fs->first_object) { |
| a55afca2 | 1249 | vnode_pager_freepage(fs->m); |
| 568e6804 | 1250 | fs->m = NULL; |
| 984263bc MD |
1251 | /* |
| 1252 | * XXX - we cannot just fall out at this | |
| 1253 | * point, m has been freed and is invalid! | |
| 1254 | */ | |
| 1255 | } | |
| 1256 | } | |
| 1257 | ||
| 1258 | /* | |
| 568e6804 | 1259 | * We get here if the object has a default pager (or unwiring) |
| 984263bc MD |
1260 | * or the pager doesn't have the page. |
| 1261 | */ | |
| 568e6804 MD |
1262 | if (fs->object == fs->first_object) |
| 1263 | fs->first_m = fs->m; | |
| 984263bc MD |
1264 | |
| 1265 | /* | |
| 1266 | * Move on to the next object. Lock the next object before | |
| 1267 | * unlocking the current one. | |
| 1268 | */ | |
| 72579d2e | 1269 | pindex += OFF_TO_IDX(fs->object->backing_object_offset); |
| 568e6804 | 1270 | next_object = fs->object->backing_object; |
| 984263bc MD |
1271 | if (next_object == NULL) { |
| 1272 | /* | |
| 1273 | * If there's no object left, fill the page in the top | |
| 1274 | * object with zeros. | |
| 1275 | */ | |
| 568e6804 MD |
1276 | if (fs->object != fs->first_object) { |
| 1277 | vm_object_pip_wakeup(fs->object); | |
| 984263bc | 1278 | |
| 568e6804 | 1279 | fs->object = fs->first_object; |
| 72579d2e | 1280 | pindex = first_pindex; |
| 568e6804 | 1281 | fs->m = fs->first_m; |
| 984263bc | 1282 | } |
| 568e6804 | 1283 | fs->first_m = NULL; |
| 984263bc MD |
1284 | |
| 1285 | /* | |
| 1286 | * Zero the page if necessary and mark it valid. | |
| 1287 | */ | |
| 568e6804 MD |
1288 | if ((fs->m->flags & PG_ZERO) == 0) { |
| 1289 | vm_page_zero_fill(fs->m); | |
| 984263bc | 1290 | } else { |
| 12e4aaff | 1291 | mycpu->gd_cnt.v_ozfod++; |
| 984263bc | 1292 | } |
| 12e4aaff | 1293 | mycpu->gd_cnt.v_zfod++; |
| 568e6804 | 1294 | fs->m->valid = VM_PAGE_BITS_ALL; |
| 984263bc MD |
1295 | break; /* break to PAGE HAS BEEN FOUND */ |
| 1296 | } else { | |
| 568e6804 MD |
1297 | if (fs->object != fs->first_object) { |
| 1298 | vm_object_pip_wakeup(fs->object); | |
| 984263bc | 1299 | } |
| 568e6804 MD |
1300 | KASSERT(fs->object != next_object, ("object loop %p", next_object)); |
| 1301 | fs->object = next_object; | |
| 1302 | vm_object_pip_add(fs->object, 1); | |
| 984263bc MD |
1303 | } |
| 1304 | } | |
| 1305 | ||
| 984263bc MD |
1306 | /* |
| 1307 | * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock | |
| 1308 | * is held.] | |
| 1b9d3514 | 1309 | * |
| 984263bc MD |
1310 | * If the page is being written, but isn't already owned by the |
| 1311 | * top-level object, we have to copy it into a new page owned by the | |
| 1312 | * top-level object. | |
| 1313 | */ | |
| 1b9d3514 MD |
1314 | KASSERT((fs->m->flags & PG_BUSY) != 0, |
| 1315 | ("vm_fault: not busy after main loop")); | |
| 1316 | ||
| 568e6804 | 1317 | if (fs->object != fs->first_object) { |
| 984263bc MD |
1318 | /* |
| 1319 | * We only really need to copy if we want to write it. | |
| 1320 | */ | |
| 984263bc MD |
1321 | if (fault_type & VM_PROT_WRITE) { |
| 1322 | /* | |
| 1323 | * This allows pages to be virtually copied from a | |
| 1324 | * backing_object into the first_object, where the | |
| 1325 | * backing object has no other refs to it, and cannot | |
| 1326 | * gain any more refs. Instead of a bcopy, we just | |
| 1327 | * move the page from the backing object to the | |
| 1328 | * first object. Note that we must mark the page | |
| 1329 | * dirty in the first object so that it will go out | |
| 1330 | * to swap when needed. | |
| 1331 | */ | |
| aa542ad5 MD |
1332 | if ( |
| 1333 | /* | |
| 1334 | * Map, if present, has not changed | |
| 1335 | */ | |
| 1336 | (fs->map == NULL || | |
| 1337 | fs->map_generation == fs->map->timestamp) && | |
| 984263bc MD |
1338 | /* |
| 1339 | * Only one shadow object | |
| 1340 | */ | |
| 568e6804 | 1341 | (fs->object->shadow_count == 1) && |
| 984263bc MD |
1342 | /* |
| 1343 | * No COW refs, except us | |
| 1344 | */ | |
| 568e6804 | 1345 | (fs->object->ref_count == 1) && |
| 984263bc MD |
1346 | /* |
| 1347 | * No one else can look this object up | |
| 1348 | */ | |
| 568e6804 | 1349 | (fs->object->handle == NULL) && |
| 984263bc MD |
1350 | /* |
| 1351 | * No other ways to look the object up | |
| 1352 | */ | |
| 568e6804 MD |
1353 | ((fs->object->type == OBJT_DEFAULT) || |
| 1354 | (fs->object->type == OBJT_SWAP)) && | |
| 984263bc MD |
1355 | /* |
| 1356 | * We don't chase down the shadow chain | |
| 1357 | */ | |
| 568e6804 | 1358 | (fs->object == fs->first_object->backing_object) && |
| 984263bc MD |
1359 | |
| 1360 | /* | |
| 1361 | * grab the lock if we need to | |
| 1362 | */ | |
| 568e6804 | 1363 | (fs->lookup_still_valid || |
| aa542ad5 | 1364 | fs->map == NULL || |
| 568e6804 | 1365 | lockmgr(&fs->map->lock, LK_EXCLUSIVE|LK_NOWAIT) == 0) |
| 984263bc MD |
1366 | ) { |
| 1367 | ||
| 568e6804 | 1368 | fs->lookup_still_valid = 1; |
| 984263bc MD |
1369 | /* |
| 1370 | * get rid of the unnecessary page | |
| 1371 | */ | |
| 568e6804 MD |
1372 | vm_page_protect(fs->first_m, VM_PROT_NONE); |
| 1373 | vm_page_free(fs->first_m); | |
| 1374 | fs->first_m = NULL; | |
| 984263bc MD |
1375 | |
| 1376 | /* | |
| 1377 | * grab the page and put it into the | |
| 1378 | * process'es object. The page is | |
| 1379 | * automatically made dirty. | |
| 1380 | */ | |
| 72579d2e | 1381 | vm_page_rename(fs->m, fs->first_object, first_pindex); |
| 568e6804 MD |
1382 | fs->first_m = fs->m; |
| 1383 | vm_page_busy(fs->first_m); | |
| 1384 | fs->m = NULL; | |
| 12e4aaff | 1385 | mycpu->gd_cnt.v_cow_optim++; |
| 984263bc MD |
1386 | } else { |
| 1387 | /* | |
| 1388 | * Oh, well, lets copy it. | |
| 1389 | */ | |
| 568e6804 | 1390 | vm_page_copy(fs->m, fs->first_m); |
| 10192bae | 1391 | vm_page_event(fs->m, VMEVENT_COW); |
| 984263bc MD |
1392 | } |
| 1393 | ||
| 568e6804 | 1394 | if (fs->m) { |
| 984263bc MD |
1395 | /* |
| 1396 | * We no longer need the old page or object. | |
| 1397 | */ | |
| 568e6804 | 1398 | release_page(fs); |
| 984263bc MD |
1399 | } |
| 1400 | ||
| 1401 | /* | |
| 568e6804 | 1402 | * fs->object != fs->first_object due to above |
| 984263bc MD |
1403 | * conditional |
| 1404 | */ | |
| 568e6804 | 1405 | vm_object_pip_wakeup(fs->object); |
| 984263bc MD |
1406 | |
| 1407 | /* | |
| 1408 | * Only use the new page below... | |
| 1409 | */ | |
| 1410 | ||
| 12e4aaff | 1411 | mycpu->gd_cnt.v_cow_faults++; |
| 568e6804 MD |
1412 | fs->m = fs->first_m; |
| 1413 | fs->object = fs->first_object; | |
| 72579d2e | 1414 | pindex = first_pindex; |
| 984263bc | 1415 | } else { |
| 568e6804 MD |
1416 | /* |
| 1417 | * If it wasn't a write fault avoid having to copy | |
| 1418 | * the page by mapping it read-only. | |
| 1419 | */ | |
| 1420 | fs->prot &= ~VM_PROT_WRITE; | |
| 984263bc MD |
1421 | } |
| 1422 | } | |
| 1423 | ||
| 1424 | /* | |
| 568e6804 MD |
1425 | * We may have had to unlock a map to do I/O. If we did then |
| 1426 | * lookup_still_valid will be FALSE. If the map generation count | |
| 1427 | * also changed then all sorts of things could have happened while | |
| 1428 | * we were doing the I/O and we need to retry. | |
| 984263bc MD |
1429 | */ |
| 1430 | ||
| 568e6804 | 1431 | if (!fs->lookup_still_valid && |
| aa542ad5 | 1432 | fs->map != NULL && |
| 568e6804 MD |
1433 | (fs->map->timestamp != fs->map_generation)) { |
| 1434 | release_page(fs); | |
| 1435 | unlock_and_deallocate(fs); | |
| 1436 | return (KERN_TRY_AGAIN); | |
| 1437 | } | |
| 1438 | ||
| 984263bc | 1439 | /* |
| 17cde63e MD |
1440 | * If the fault is a write, we know that this page is being |
| 1441 | * written NOW so dirty it explicitly to save on pmap_is_modified() | |
| 1442 | * calls later. | |
| 1443 | * | |
| 1444 | * If this is a NOSYNC mmap we do not want to set PG_NOSYNC | |
| 1445 | * if the page is already dirty to prevent data written with | |
| 1446 | * the expectation of being synced from not being synced. | |
| 1447 | * Likewise if this entry does not request NOSYNC then make | |
| 1448 | * sure the page isn't marked NOSYNC. Applications sharing | |
| 1449 | * data should use the same flags to avoid ping ponging. | |
| 1450 | * | |
| 1451 | * Also tell the backing pager, if any, that it should remove | |
| 1452 | * any swap backing since the page is now dirty. | |
| 984263bc | 1453 | */ |
| 568e6804 | 1454 | if (fs->prot & VM_PROT_WRITE) { |
| 568e6804 | 1455 | vm_object_set_writeable_dirty(fs->m->object); |
| 568e6804 MD |
1456 | if (fs->entry->eflags & MAP_ENTRY_NOSYNC) { |
| 1457 | if (fs->m->dirty == 0) | |
| 1458 | vm_page_flag_set(fs->m, PG_NOSYNC); | |
| 984263bc | 1459 | } else { |
| 568e6804 | 1460 | vm_page_flag_clear(fs->m, PG_NOSYNC); |
| 984263bc | 1461 | } |
| 568e6804 | 1462 | if (fs->fault_flags & VM_FAULT_DIRTY) { |
| 654a39f0 | 1463 | crit_enter(); |
| 568e6804 | 1464 | vm_page_dirty(fs->m); |
| 107e9bcc | 1465 | swap_pager_unswapped(fs->m); |
| 654a39f0 | 1466 | crit_exit(); |
| 984263bc MD |
1467 | } |
| 1468 | } | |
| 1469 | ||
| 1470 | /* | |
| 75f59a66 MD |
1471 | * Page had better still be busy. We are still locked up and |
| 1472 | * fs->object will have another PIP reference if it is not equal | |
| 1473 | * to fs->first_object. | |
| 984263bc | 1474 | */ |
| 568e6804 MD |
1475 | KASSERT(fs->m->flags & PG_BUSY, |
| 1476 | ("vm_fault: page %p not busy!", fs->m)); | |
| 984263bc | 1477 | |
| 984263bc MD |
1478 | /* |
| 1479 | * Sanity check: page must be completely valid or it is not fit to | |
| 1480 | * map into user space. vm_pager_get_pages() ensures this. | |
| 1481 | */ | |
| 568e6804 MD |
1482 | if (fs->m->valid != VM_PAGE_BITS_ALL) { |
| 1483 | vm_page_zero_invalid(fs->m, TRUE); | |
| 086c1d7e | 1484 | kprintf("Warning: page %p partially invalid on fault\n", fs->m); |
| 984263bc MD |
1485 | } |
| 1486 | ||
| 984263bc | 1487 | return (KERN_SUCCESS); |
| 984263bc MD |
1488 | } |
| 1489 | ||
| 1490 | /* | |
| f2d22ebf MD |
1491 | * Wire down a range of virtual addresses in a map. The entry in question |
| 1492 | * should be marked in-transition and the map must be locked. We must | |
| 1493 | * release the map temporarily while faulting-in the page to avoid a | |
| 1494 | * deadlock. Note that the entry may be clipped while we are blocked but | |
| 1495 | * will never be freed. | |
| 984263bc MD |
1496 | */ |
| 1497 | int | |
| f2d22ebf | 1498 | vm_fault_wire(vm_map_t map, vm_map_entry_t entry, boolean_t user_wire) |
| 984263bc | 1499 | { |
| f2d22ebf MD |
1500 | boolean_t fictitious; |
| 1501 | vm_offset_t start; | |
| 1502 | vm_offset_t end; | |
| 5f910b2f | 1503 | vm_offset_t va; |
| f2d22ebf | 1504 | vm_paddr_t pa; |
| 5f910b2f | 1505 | pmap_t pmap; |
| 984263bc MD |
1506 | int rv; |
| 1507 | ||
| 1508 | pmap = vm_map_pmap(map); | |
| f2d22ebf MD |
1509 | start = entry->start; |
| 1510 | end = entry->end; | |
| 1511 | fictitious = entry->object.vm_object && | |
| 1512 | (entry->object.vm_object->type == OBJT_DEVICE); | |
| 984263bc | 1513 | |
| f2d22ebf MD |
1514 | vm_map_unlock(map); |
| 1515 | map->timestamp++; | |
| 984263bc MD |
1516 | |
| 1517 | /* | |
| 984263bc MD |
1518 | * We simulate a fault to get the page and enter it in the physical |
| 1519 | * map. | |
| 1520 | */ | |
| 1521 | for (va = start; va < end; va += PAGE_SIZE) { | |
| f2d22ebf MD |
1522 | if (user_wire) { |
| 1523 | rv = vm_fault(map, va, VM_PROT_READ, | |
| 1524 | VM_FAULT_USER_WIRE); | |
| 1525 | } else { | |
| 1526 | rv = vm_fault(map, va, VM_PROT_READ|VM_PROT_WRITE, | |
| 1527 | VM_FAULT_CHANGE_WIRING); | |
| 1528 | } | |
| 984263bc | 1529 | if (rv) { |
| f2d22ebf MD |
1530 | while (va > start) { |
| 1531 | va -= PAGE_SIZE; | |
| 1532 | if ((pa = pmap_extract(pmap, va)) == 0) | |
| 1533 | continue; | |
| 1534 | pmap_change_wiring(pmap, va, FALSE); | |
| 1535 | if (!fictitious) | |
| 1536 | vm_page_unwire(PHYS_TO_VM_PAGE(pa), 1); | |
| 1537 | } | |
| f2d22ebf | 1538 | vm_map_lock(map); |
| 984263bc MD |
1539 | return (rv); |
| 1540 | } | |
| 1541 | } | |
| f2d22ebf | 1542 | vm_map_lock(map); |
| 984263bc MD |
1543 | return (KERN_SUCCESS); |
| 1544 | } | |
| 1545 | ||
| 984263bc | 1546 | /* |
| f2d22ebf MD |
1547 | * Unwire a range of virtual addresses in a map. The map should be |
| 1548 | * locked. | |
| 984263bc MD |
1549 | */ |
| 1550 | void | |
| f2d22ebf | 1551 | vm_fault_unwire(vm_map_t map, vm_map_entry_t entry) |
| 984263bc | 1552 | { |
| f2d22ebf MD |
1553 | boolean_t fictitious; |
| 1554 | vm_offset_t start; | |
| 1555 | vm_offset_t end; | |
| 6ef943a3 MD |
1556 | vm_offset_t va; |
| 1557 | vm_paddr_t pa; | |
| 5f910b2f | 1558 | pmap_t pmap; |
| 984263bc MD |
1559 | |
| 1560 | pmap = vm_map_pmap(map); | |
| f2d22ebf MD |
1561 | start = entry->start; |
| 1562 | end = entry->end; | |
| 1563 | fictitious = entry->object.vm_object && | |
| 1564 | (entry->object.vm_object->type == OBJT_DEVICE); | |
| 984263bc MD |
1565 | |
| 1566 | /* | |
| 1567 | * Since the pages are wired down, we must be able to get their | |
| 1568 | * mappings from the physical map system. | |
| 1569 | */ | |
| 984263bc MD |
1570 | for (va = start; va < end; va += PAGE_SIZE) { |
| 1571 | pa = pmap_extract(pmap, va); | |
| 6ef943a3 | 1572 | if (pa != 0) { |
| 984263bc | 1573 | pmap_change_wiring(pmap, va, FALSE); |
| f2d22ebf MD |
1574 | if (!fictitious) |
| 1575 | vm_page_unwire(PHYS_TO_VM_PAGE(pa), 1); | |
| 984263bc MD |
1576 | } |
| 1577 | } | |
| 984263bc MD |
1578 | } |
| 1579 | ||
| 1580 | /* | |
| 46311ac2 MD |
1581 | * Reduce the rate at which memory is allocated to a process based |
| 1582 | * on the perceived load on the VM system. As the load increases | |
| 1583 | * the allocation burst rate goes down and the delay increases. | |
| 1584 | * | |
| 1585 | * Rate limiting does not apply when faulting active or inactive | |
| 1586 | * pages. When faulting 'cache' pages, rate limiting only applies | |
| 1587 | * if the system currently has a severe page deficit. | |
| 1588 | * | |
| 1589 | * XXX vm_pagesupply should be increased when a page is freed. | |
| 1590 | * | |
| 1591 | * We sleep up to 1/10 of a second. | |
| 1592 | */ | |
| 1593 | static int | |
| 1594 | vm_fault_ratelimit(struct vmspace *vmspace) | |
| 1595 | { | |
| 1596 | if (vm_load_enable == 0) | |
| 1597 | return(0); | |
| 1598 | if (vmspace->vm_pagesupply > 0) { | |
| 1599 | --vmspace->vm_pagesupply; | |
| 1600 | return(0); | |
| 1601 | } | |
| 1602 | #ifdef INVARIANTS | |
| 1603 | if (vm_load_debug) { | |
| 086c1d7e | 1604 | kprintf("load %-4d give %d pgs, wait %d, pid %-5d (%s)\n", |
| 46311ac2 MD |
1605 | vm_load, |
| 1606 | (1000 - vm_load ) / 10, vm_load * hz / 10000, | |
| 1607 | curproc->p_pid, curproc->p_comm); | |
| 1608 | } | |
| 1609 | #endif | |
| 1610 | vmspace->vm_pagesupply = (1000 - vm_load) / 10; | |
| 1611 | return(vm_load * hz / 10000); | |
| 1612 | } | |
| 1613 | ||
| 1614 | /* | |
| 984263bc MD |
1615 | * Routine: |
| 1616 | * vm_fault_copy_entry | |
| 1617 | * Function: | |
| 1618 | * Copy all of the pages from a wired-down map entry to another. | |
| 1619 | * | |
| 1620 | * In/out conditions: | |
| 1621 | * The source and destination maps must be locked for write. | |
| 1622 | * The source map entry must be wired down (or be a sharing map | |
| 1623 | * entry corresponding to a main map entry that is wired down). | |
| 1624 | */ | |
| 1625 | ||
| 1626 | void | |
| 57e43348 MD |
1627 | vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, |
| 1628 | vm_map_entry_t dst_entry, vm_map_entry_t src_entry) | |
| 984263bc MD |
1629 | { |
| 1630 | vm_object_t dst_object; | |
| 1631 | vm_object_t src_object; | |
| 1632 | vm_ooffset_t dst_offset; | |
| 1633 | vm_ooffset_t src_offset; | |
| 1634 | vm_prot_t prot; | |
| 1635 | vm_offset_t vaddr; | |
| 1636 | vm_page_t dst_m; | |
| 1637 | vm_page_t src_m; | |
| 1638 | ||
| 1639 | #ifdef lint | |
| 1640 | src_map++; | |
| 1641 | #endif /* lint */ | |
| 1642 | ||
| 1643 | src_object = src_entry->object.vm_object; | |
| 1644 | src_offset = src_entry->offset; | |
| 1645 | ||
| 1646 | /* | |
| 1647 | * Create the top-level object for the destination entry. (Doesn't | |
| 1648 | * actually shadow anything - we copy the pages directly.) | |
| 1649 | */ | |
| 53025830 MD |
1650 | vm_map_entry_allocate_object(dst_entry); |
| 1651 | dst_object = dst_entry->object.vm_object; | |
| 984263bc MD |
1652 | |
| 1653 | prot = dst_entry->max_protection; | |
| 1654 | ||
| 1655 | /* | |
| 1656 | * Loop through all of the pages in the entry's range, copying each | |
| 1657 | * one from the source object (it should be there) to the destination | |
| 1658 | * object. | |
| 1659 | */ | |
| 1660 | for (vaddr = dst_entry->start, dst_offset = 0; | |
| 1661 | vaddr < dst_entry->end; | |
| 1662 | vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { | |
| 1663 | ||
| 1664 | /* | |
| 1665 | * Allocate a page in the destination object | |
| 1666 | */ | |
| 1667 | do { | |
| 1668 | dst_m = vm_page_alloc(dst_object, | |
| 1669 | OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL); | |
| 1670 | if (dst_m == NULL) { | |
| 4ecf7cc9 | 1671 | vm_wait(0); |
| 984263bc MD |
1672 | } |
| 1673 | } while (dst_m == NULL); | |
| 1674 | ||
| 1675 | /* | |
| 1676 | * Find the page in the source object, and copy it in. | |
| 1677 | * (Because the source is wired down, the page will be in | |
| 1678 | * memory.) | |
| 1679 | */ | |
| 1680 | src_m = vm_page_lookup(src_object, | |
| 1681 | OFF_TO_IDX(dst_offset + src_offset)); | |
| 1682 | if (src_m == NULL) | |
| 1683 | panic("vm_fault_copy_wired: page missing"); | |
| 1684 | ||
| 1685 | vm_page_copy(src_m, dst_m); | |
| 10192bae | 1686 | vm_page_event(src_m, VMEVENT_COW); |
| 984263bc MD |
1687 | |
| 1688 | /* | |
| 1689 | * Enter it in the pmap... | |
| 1690 | */ | |
| 1691 | ||
| 1692 | vm_page_flag_clear(dst_m, PG_ZERO); | |
| 1693 | pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE); | |
| 984263bc MD |
1694 | |
| 1695 | /* | |
| 1696 | * Mark it no longer busy, and put it on the active list. | |
| 1697 | */ | |
| 1698 | vm_page_activate(dst_m); | |
| 1699 | vm_page_wakeup(dst_m); | |
| 1700 | } | |
| 1701 | } | |
| 1702 | ||
#if 0

/*
 * Build the cluster of pages to page in around a requested page.
 * (This routine is currently compiled out.)
 *
 * Inputs:
 *	m	the requested page
 *	rbehind	maximum number of pages to read behind m
 *	rahead	maximum number of pages to read ahead of m
 *
 * Outputs:
 *	marray	 the pages to page in, in ascending pindex order
 *	reqpage	 index of m within marray
 *
 * Return value:
 *	number of pages stored in marray; 0 if the pager does not have
 *	the requested page at all (marray is left untouched in that case).
 */
static int
vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead,
			  vm_page_t *marray, int *reqpage)
{
	vm_object_t object = m->object;
	vm_pindex_t pindex = m->pindex;
	vm_pindex_t first, limit, scan;
	vm_page_t page;
	int avail_behind, avail_ahead;
	int count = 0;
	int k;

	/*
	 * No clustering for the device pager.
	 */
	if (object->type == OBJT_DEVICE)
		goto requested_only;

	/*
	 * Nothing to do if the pager cannot supply the requested page.
	 */
	if (!vm_pager_has_page(object, pindex, &avail_behind, &avail_ahead)) {
		*reqpage = 0;	/* not used by caller, fix compiler warn */
		return 0;
	}

	if (avail_behind == 0 && avail_ahead == 0)
		goto requested_only;

	/*
	 * Never ask for more than the pager says it can deliver.
	 */
	if (rahead > avail_ahead)
		rahead = avail_ahead;
	if (rbehind > avail_behind)
		rbehind = avail_behind;

	/*
	 * Skip clustering entirely when bursting is disabled or free
	 * memory is severely low.
	 *
	 * XXX the original notes that this conditional used to be broken
	 * (effectively disabled) and that fixing it exposed instability,
	 * hence the shortcut.
	 */
	if (burst_fault == 0 || vm_page_count_severe())
		goto requested_only;

	/*
	 * Read-behind: find the run of non-resident pages immediately
	 * below pindex, then allocate a page for each of them.
	 *
	 * The original assumes an interrupt will not create a page that
	 * the lookup did not find; interrupts can only remove (busy)
	 * pages, not create new associations.
	 */
	if (pindex > 0) {
		if (rbehind > pindex) {
			rbehind = pindex;
			first = 0;
		} else {
			first = pindex - rbehind;
		}

		crit_enter();
		scan = pindex;
		while (scan > first &&
		       vm_page_lookup(object, scan - 1) == NULL) {
			--scan;
		}

		for (; scan < pindex; ++scan) {
			page = vm_page_alloc(object, scan, VM_ALLOC_SYSTEM);
			if (page == NULL) {
				/*
				 * Out of pages: release what was gathered
				 * and fall back to the requested page.
				 */
				crit_exit();
				for (k = 0; k < count; k++)
					vm_page_free(marray[k]);
				goto requested_only;
			}
			marray[count] = page;
			++count;
		}
		crit_exit();
	}

	/*
	 * The requested page follows the read-behind pages.
	 */
	marray[count] = m;
	*reqpage = count;
	++count;

	/*
	 * Read-ahead: allocate pages above pindex until a resident page,
	 * an allocation failure, or the end of the window/object.
	 */
	scan = pindex + 1;
	limit = scan + rahead;
	if (limit > object->size)
		limit = object->size;

	crit_enter();
	for (; scan < limit; ++scan) {
		if (vm_page_lookup(object, scan))
			break;
		page = vm_page_alloc(object, scan, VM_ALLOC_SYSTEM);
		if (page == NULL)
			break;
		marray[count] = page;
		++count;
	}
	crit_exit();

	return (count);

requested_only:
	/*
	 * Degenerate cluster consisting solely of the requested page.
	 */
	*reqpage = 0;
	marray[0] = m;
	return 1;
}

#endif
| 1849 | ||
| 1850 | /* | |
| 1851 | * vm_prefault() provides a quick way of clustering pagefaults into a | |
| 1852 | * processes address space. It is a "cousin" of pmap_object_init_pt, | |
| 1853 | * except it runs at page fault time instead of mmap time. | |
| 1854 | * | |
| 1855 | * This code used to be per-platform pmap_prefault(). It is now | |
| 1856 | * machine-independent and enhanced to also pre-fault zero-fill pages | |
| 1857 | * (see vm.fast_fault) as well as make them writable, which greatly | |
| 1858 | * reduces the number of page faults programs incur. | |
| 1859 | * | |
| 1860 | * Application performance when pre-faulting zero-fill pages is heavily | |
| 1861 | * dependent on the application. Very tiny applications like /bin/echo | |
| 1862 | * lose a little performance while applications of any appreciable size | |
| 1863 | * gain performance. Prefaulting multiple pages also reduces SMP | |
| 1864 | * congestion and can improve SMP performance significantly. | |
| 1865 | * | |
| 1866 | * NOTE! prot may allow writing but this only applies to the top level | |
| 1867 | * object. If we wind up mapping a page extracted from a backing | |
| 1868 | * object we have to make sure it is read-only. | |
| 1869 | * | |
| 1870 | * NOTE! The caller has already handled any COW operations on the | |
| 1871 | * vm_map_entry via the normal fault code. Do NOT call this | |
| 1872 | * shortcut unless the normal fault code has run on this entry. | |
| 1873 | */ | |
| 1874 | #define PFBAK 4 | |
| 1875 | #define PFFOR 4 | |
| 1876 | #define PAGEORDER_SIZE (PFBAK+PFFOR) | |
| 1877 | ||
| 1878 | static int vm_prefault_pageorder[] = { | |
| 1879 | -PAGE_SIZE, PAGE_SIZE, | |
| 1880 | -2 * PAGE_SIZE, 2 * PAGE_SIZE, | |
| 1881 | -3 * PAGE_SIZE, 3 * PAGE_SIZE, | |
| 1882 | -4 * PAGE_SIZE, 4 * PAGE_SIZE | |
| 1883 | }; | |
| 1884 | ||
| 1885 | static void | |
| 1886 | vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot) | |
| 1887 | { | |
| 1888 | struct lwp *lp; | |
| 1889 | vm_page_t m; | |
| 1890 | vm_offset_t starta; | |
| 1891 | vm_offset_t addr; | |
| 1892 | vm_pindex_t index; | |
| 1893 | vm_pindex_t pindex; | |
| 1894 | vm_object_t object; | |
| 1895 | int pprot; | |
| 1896 | int i; | |
| 1897 | ||
| 1898 | /* | |
| 1899 | * We do not currently prefault mappings that use virtual page | |
| 1900 | * tables. We do not prefault foreign pmaps. | |
| 1901 | */ | |
| 1902 | if (entry->maptype == VM_MAPTYPE_VPAGETABLE) | |
| 1903 | return; | |
| 1904 | lp = curthread->td_lwp; | |
| 1905 | if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) | |
| 1906 | return; | |
| 1907 | ||
| 1908 | object = entry->object.vm_object; | |
| 1909 | ||
| 1910 | starta = addra - PFBAK * PAGE_SIZE; | |
| 1911 | if (starta < entry->start) | |
| 1912 | starta = entry->start; | |
| 1913 | else if (starta > addra) | |
| 1914 | starta = 0; | |
| 1915 | ||
| 1916 | /* | |
| 1917 | * critical section protection is required to maintain the | |
| 1918 | * page/object association, interrupts can free pages and remove | |
| 1919 | * them from their objects. | |
| 1920 | */ | |
| 1921 | crit_enter(); | |
| 1922 | for (i = 0; i < PAGEORDER_SIZE; i++) { | |
| 1923 | vm_object_t lobject; | |
| 3bb7eedb | 1924 | int allocated = 0; |
| 1b9d3514 MD |
1925 | |
| 1926 | addr = addra + vm_prefault_pageorder[i]; | |
| 1927 | if (addr > addra + (PFFOR * PAGE_SIZE)) | |
| 1928 | addr = 0; | |
| 1929 | ||
| 1930 | if (addr < starta || addr >= entry->end) | |
| 1931 | continue; | |
| 1932 | ||
| 1933 | if (pmap_prefault_ok(pmap, addr) == 0) | |
| 1934 | continue; | |
| 1935 | ||
| 1936 | /* | |
| 1937 | * Follow the VM object chain to obtain the page to be mapped | |
| 1938 | * into the pmap. | |
| 1939 | * | |
| 1940 | * If we reach the terminal object without finding a page | |
| 1941 | * and we determine it would be advantageous, then allocate | |
| 1942 | * a zero-fill page for the base object. The base object | |
| 1943 | * is guaranteed to be OBJT_DEFAULT for this case. | |
| 3bb7eedb MD |
1944 | * |
| 1945 | * In order to not have to check the pager via *haspage*() | |
| 1946 | * we stop if any non-default object is encountered. e.g. | |
| 1947 | * a vnode or swap object would stop the loop. | |
| 1b9d3514 MD |
1948 | */ |
| 1949 | index = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; | |
| 1950 | lobject = object; | |
| 1951 | pindex = index; | |
| 1952 | pprot = prot; | |
| 1953 | ||
| 1954 | while ((m = vm_page_lookup(lobject, pindex)) == NULL) { | |
| 1955 | if (lobject->type != OBJT_DEFAULT) | |
| 1956 | break; | |
| 1957 | if (lobject->backing_object == NULL) { | |
| 1958 | if (vm_fast_fault == 0) | |
| 1959 | break; | |
| 1960 | if (vm_prefault_pageorder[i] < 0 || | |
| 1961 | (prot & VM_PROT_WRITE) == 0 || | |
| 1962 | vm_page_count_min(0)) { | |
| 1963 | break; | |
| 1964 | } | |
| 3bb7eedb | 1965 | /* note: allocate from base object */ |
| 1b9d3514 MD |
1966 | m = vm_page_alloc(object, index, |
| 1967 | VM_ALLOC_NORMAL | VM_ALLOC_ZERO); | |
| 1968 | ||
| 1969 | if ((m->flags & PG_ZERO) == 0) { | |
| 1970 | vm_page_zero_fill(m); | |
| 1971 | } else { | |
| 1972 | vm_page_flag_clear(m, PG_ZERO); | |
| 1973 | mycpu->gd_cnt.v_ozfod++; | |
| 1974 | } | |
| 1975 | mycpu->gd_cnt.v_zfod++; | |
| 1976 | m->valid = VM_PAGE_BITS_ALL; | |
| 3bb7eedb | 1977 | allocated = 1; |
| 1b9d3514 MD |
1978 | pprot = prot; |
| 1979 | /* lobject = object .. not needed */ | |
| 1980 | break; | |
| 1981 | } | |
| 1982 | if (lobject->backing_object_offset & PAGE_MASK) | |
| 1983 | break; | |
| 1984 | pindex += lobject->backing_object_offset >> PAGE_SHIFT; | |
| 1985 | lobject = lobject->backing_object; | |
| 1986 | pprot &= ~VM_PROT_WRITE; | |
| 1987 | } | |
| 1988 | /* | |
| 1989 | * NOTE: lobject now invalid (if we did a zero-fill we didn't | |
| 1990 | * bother assigning lobject = object). | |
| 1991 | * | |
| 1992 | * Give-up if the page is not available. | |
| 1993 | */ | |
| 1994 | if (m == NULL) | |
| 1995 | break; | |
| 1996 | ||
| 1997 | /* | |
| 1998 | * Do not conditionalize on PG_RAM. If pages are present in | |
| 1999 | * the VM system we assume optimal caching. If caching is | |
| 2000 | * not optimal the I/O gravy train will be restarted when we | |
| 2001 | * hit an unavailable page. We do not want to try to restart | |
| 2002 | * the gravy train now because we really don't know how much | |
| 2003 | * of the object has been cached. The cost for restarting | |
| 2004 | * the gravy train should be low (since accesses will likely | |
| 2005 | * be I/O bound anyway). | |
| 2006 | * | |
| 2007 | * The object must be marked dirty if we are mapping a | |
| 2008 | * writable page. | |
| 2009 | */ | |
| 2010 | if (pprot & VM_PROT_WRITE) | |
| 2011 | vm_object_set_writeable_dirty(m->object); | |
| 2012 | ||
| 2013 | /* | |
| 3bb7eedb MD |
2014 | * Enter the page into the pmap if appropriate. If we had |
| 2015 | * allocated the page we have to place it on a queue. If not | |
| 2016 | * we just have to make sure it isn't on the cache queue | |
| 2017 | * (pages on the cache queue are not allowed to be mapped). | |
| 1b9d3514 | 2018 | */ |
| 3bb7eedb MD |
2019 | if (allocated) { |
| 2020 | pmap_enter(pmap, addr, m, pprot, 0); | |
| 2021 | vm_page_deactivate(m); | |
| 2022 | vm_page_wakeup(m); | |
| 2023 | } else if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && | |
| 1b9d3514 MD |
2024 | (m->busy == 0) && |
| 2025 | (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { | |
| 2026 | ||
| 2027 | if ((m->queue - m->pc) == PQ_CACHE) { | |
| 2028 | vm_page_deactivate(m); | |
| 2029 | } | |
| 2030 | vm_page_busy(m); | |
| 2031 | pmap_enter(pmap, addr, m, pprot, 0); | |
| 2032 | vm_page_wakeup(m); | |
| 2033 | } | |
| 2034 | } | |
| 2035 | crit_exit(); | |
| 2036 | } |