| Commit | Line | Data |
|---|---|---|
| 984263bc | 1 | /* |
| 9ad0147b MD |
2 | * (MPSAFE) |
| 3 | * | |
| 984263bc MD |
4 | * Copyright (c) 1991, 1993 |
| 5 | * The Regents of the University of California. All rights reserved. | |
| 6 | * Copyright (c) 1994 John S. Dyson | |
| 7 | * All rights reserved. | |
| 8 | * Copyright (c) 1994 David Greenman | |
| 9 | * All rights reserved. | |
| 10 | * | |
| 11 | * | |
| 12 | * This code is derived from software contributed to Berkeley by | |
| 13 | * The Mach Operating System project at Carnegie-Mellon University. | |
| 14 | * | |
| 15 | * Redistribution and use in source and binary forms, with or without | |
| 16 | * modification, are permitted provided that the following conditions | |
| 17 | * are met: | |
| 18 | * 1. Redistributions of source code must retain the above copyright | |
| 19 | * notice, this list of conditions and the following disclaimer. | |
| 20 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 21 | * notice, this list of conditions and the following disclaimer in the | |
| 22 | * documentation and/or other materials provided with the distribution. | |
| 23 | * 3. All advertising materials mentioning features or use of this software | |
| 24 | * must display the following acknowledgement: | |
| 25 | * This product includes software developed by the University of | |
| 26 | * California, Berkeley and its contributors. | |
| 27 | * 4. Neither the name of the University nor the names of its contributors | |
| 28 | * may be used to endorse or promote products derived from this software | |
| 29 | * without specific prior written permission. | |
| 30 | * | |
| 31 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 32 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 33 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 34 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 35 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 36 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 37 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 38 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 39 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 40 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 41 | * SUCH DAMAGE. | |
| 42 | * | |
| 43 | * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 | |
| 44 | * | |
| 45 | * | |
| 46 | * Copyright (c) 1987, 1990 Carnegie-Mellon University. | |
| 47 | * All rights reserved. | |
| 48 | * | |
| 49 | * Authors: Avadis Tevanian, Jr., Michael Wayne Young | |
| 50 | * | |
| 51 | * Permission to use, copy, modify and distribute this software and | |
| 52 | * its documentation is hereby granted, provided that both the copyright | |
| 53 | * notice and this permission notice appear in all copies of the | |
| 54 | * software, derivative works or modified versions, and any portions | |
| 55 | * thereof, and that both notices appear in supporting documentation. | |
| 56 | * | |
| 57 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" | |
| 58 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND | |
| 59 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. | |
| 60 | * | |
| 61 | * Carnegie Mellon requests users of this software to return to | |
| 62 | * | |
| 63 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU | |
| 64 | * School of Computer Science | |
| 65 | * Carnegie Mellon University | |
| 66 | * Pittsburgh PA 15213-3890 | |
| 67 | * | |
| 68 | * any improvements or extensions that they make and grant Carnegie the | |
| 69 | * rights to redistribute these changes. | |
| 70 | * | |
| 71 | * $FreeBSD: src/sys/vm/vm_fault.c,v 1.108.2.8 2002/02/26 05:49:27 silby Exp $ | |
| 4ecf7cc9 | 72 | * $DragonFly: src/sys/vm/vm_fault.c,v 1.47 2008/07/01 02:02:56 dillon Exp $ |
| 984263bc MD |
73 | */ |
| 74 | ||
| 75 | /* | |
| 76 | * Page fault handling module. | |
| 77 | */ | |
| 78 | ||
| 79 | #include <sys/param.h> | |
| 80 | #include <sys/systm.h> | |
| 46311ac2 | 81 | #include <sys/kernel.h> |
| 984263bc MD |
82 | #include <sys/proc.h> |
| 83 | #include <sys/vnode.h> | |
| 84 | #include <sys/resourcevar.h> | |
| 85 | #include <sys/vmmeter.h> | |
| 75f59a66 | 86 | #include <sys/vkernel.h> |
| 75f59a66 | 87 | #include <sys/lock.h> |
| bc823b32 | 88 | #include <sys/sysctl.h> |
| 984263bc | 89 | |
| 5c5185ae SG |
90 | #include <cpu/lwbuf.h> |
| 91 | ||
| 984263bc MD |
92 | #include <vm/vm.h> |
| 93 | #include <vm/vm_param.h> | |
| 984263bc MD |
94 | #include <vm/pmap.h> |
| 95 | #include <vm/vm_map.h> | |
| 96 | #include <vm/vm_object.h> | |
| 97 | #include <vm/vm_page.h> | |
| 98 | #include <vm/vm_pageout.h> | |
| 99 | #include <vm/vm_kern.h> | |
| 100 | #include <vm/vm_pager.h> | |
| 101 | #include <vm/vnode_pager.h> | |
| 102 | #include <vm/vm_extern.h> | |
| 654a39f0 MD |
103 | |
| 104 | #include <sys/thread2.h> | |
| 12e4aaff | 105 | #include <vm/vm_page2.h> |
| 984263bc | 106 | |
| 984263bc MD |
107 | struct faultstate { |
| 108 | vm_page_t m; | |
| 109 | vm_object_t object; | |
| 110 | vm_pindex_t pindex; | |
| 72579d2e | 111 | vm_prot_t prot; |
| 984263bc | 112 | vm_page_t first_m; |
| 568e6804 | 113 | vm_object_t first_object; |
| 72579d2e | 114 | vm_prot_t first_prot; |
| 984263bc MD |
115 | vm_map_t map; |
| 116 | vm_map_entry_t entry; | |
| 117 | int lookup_still_valid; | |
| 568e6804 | 118 | int hardfault; |
| 568e6804 MD |
119 | int fault_flags; |
| 120 | int map_generation; | |
| 121 | boolean_t wired; | |
| 984263bc MD |
122 | struct vnode *vp; |
| 123 | }; | |
| 124 | ||
| cf1bb2a8 MD |
125 | static int debug_cluster = 0; |
| 126 | SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, ""); | |
| 4643740a | 127 | static int vm_shared_fault = 0; |
| 54341a3b MD |
128 | SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW, &vm_shared_fault, 0, |
| 129 | "Allow shared token on vm_object"); | |
| 130 | static long vm_shared_hit = 0; | |
| 131 | SYSCTL_LONG(_vm, OID_AUTO, shared_hit, CTLFLAG_RW, &vm_shared_hit, 0, | |
| 132 | "Successful shared faults"); | |
| 133 | static long vm_shared_miss = 0; | |
| 134 | SYSCTL_LONG(_vm, OID_AUTO, shared_miss, CTLFLAG_RW, &vm_shared_miss, 0, | |
| 135 | "Successful shared faults"); | |
| bc823b32 | 136 | |
| 72579d2e | 137 | static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t); |
| 4e7c41c5 | 138 | static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int); |
| 1b9d3514 | 139 | #if 0 |
| 568e6804 | 140 | static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *); |
| 1b9d3514 | 141 | #endif |
| 2421aac7 | 142 | static void vm_set_nosync(vm_page_t m, vm_map_entry_t entry); |
| 54341a3b MD |
143 | static void vm_prefault(pmap_t pmap, vm_offset_t addra, |
| 144 | vm_map_entry_t entry, int prot, int fault_flags); | |
| 145 | static void vm_prefault_quick(pmap_t pmap, vm_offset_t addra, | |
| 146 | vm_map_entry_t entry, int prot, int fault_flags); | |
| 568e6804 | 147 | |
| 984263bc MD |
148 | static __inline void |
| 149 | release_page(struct faultstate *fs) | |
| 150 | { | |
| 984263bc | 151 | vm_page_deactivate(fs->m); |
| 17cde63e | 152 | vm_page_wakeup(fs->m); |
| 984263bc MD |
153 | fs->m = NULL; |
| 154 | } | |
| 155 | ||
| 9ad0147b | 156 | /* |
| 6e9c0867 MD |
157 | * NOTE: Once unlocked any cached fs->entry becomes invalid, any reuse |
| 158 | * requires relocking and then checking the timestamp. | |
| 159 | * | |
| 160 | * NOTE: vm_map_lock_read() does not bump fs->map->timestamp so we do | |
| 161 | * not have to update fs->map_generation here. | |
| 625a2937 MD |
162 | * |
| 163 | * NOTE: This function can fail due to a deadlock against the caller's | |
| 164 | * holding of a vm_page BUSY. | |
| 9ad0147b | 165 | */ |
| 625a2937 | 166 | static __inline int |
| 6e9c0867 MD |
167 | relock_map(struct faultstate *fs) |
| 168 | { | |
| 625a2937 MD |
169 | int error; |
| 170 | ||
| 6e9c0867 | 171 | if (fs->lookup_still_valid == FALSE && fs->map) { |
| 625a2937 MD |
172 | error = vm_map_lock_read_to(fs->map); |
| 173 | if (error == 0) | |
| 174 | fs->lookup_still_valid = TRUE; | |
| 175 | } else { | |
| 176 | error = 0; | |
| 6e9c0867 | 177 | } |
| 625a2937 | 178 | return error; |
| 6e9c0867 MD |
179 | } |
| 180 | ||
| 181 | static __inline void | |
| 984263bc MD |
182 | unlock_map(struct faultstate *fs) |
| 183 | { | |
| aa542ad5 | 184 | if (fs->lookup_still_valid && fs->map) { |
| a108bf71 | 185 | vm_map_lookup_done(fs->map, fs->entry, 0); |
| 984263bc MD |
186 | fs->lookup_still_valid = FALSE; |
| 187 | } | |
| 188 | } | |
| 189 | ||
| 75f59a66 MD |
190 | /* |
| 191 | * Clean up after a successful call to vm_fault_object() so another call | |
| 192 | * to vm_fault_object() can be made. | |
| 193 | */ | |
| 984263bc | 194 | static void |
| 75f59a66 | 195 | _cleanup_successful_fault(struct faultstate *fs, int relock) |
| 984263bc | 196 | { |
| 984263bc MD |
197 | if (fs->object != fs->first_object) { |
| 198 | vm_page_free(fs->first_m); | |
| 75f59a66 | 199 | vm_object_pip_wakeup(fs->object); |
| 984263bc MD |
200 | fs->first_m = NULL; |
| 201 | } | |
| 75f59a66 MD |
202 | fs->object = fs->first_object; |
| 203 | if (relock && fs->lookup_still_valid == FALSE) { | |
| aa542ad5 MD |
204 | if (fs->map) |
| 205 | vm_map_lock_read(fs->map); | |
| 75f59a66 MD |
206 | fs->lookup_still_valid = TRUE; |
| 207 | } | |
| 208 | } | |
| 209 | ||
| 210 | static void | |
| 211 | _unlock_things(struct faultstate *fs, int dealloc) | |
| 212 | { | |
| 75f59a66 | 213 | _cleanup_successful_fault(fs, 0); |
| 984263bc | 214 | if (dealloc) { |
| b12defdc MD |
215 | /*vm_object_deallocate(fs->first_object);*/ |
| 216 | /*fs->first_object = NULL; drop used later on */ | |
| 984263bc MD |
217 | } |
| 218 | unlock_map(fs); | |
| 219 | if (fs->vp != NULL) { | |
| 220 | vput(fs->vp); | |
| 221 | fs->vp = NULL; | |
| 222 | } | |
| 223 | } | |
| 224 | ||
| 225 | #define unlock_things(fs) _unlock_things(fs, 0) | |
| 226 | #define unlock_and_deallocate(fs) _unlock_things(fs, 1) | |
| 75f59a66 | 227 | #define cleanup_successful_fault(fs) _cleanup_successful_fault(fs, 1) |
| 984263bc MD |
228 | |
| 229 | /* | |
| 568e6804 MD |
230 | * TRYPAGER |
| 231 | * | |
| 232 | * Determine if the pager for the current object *might* contain the page. | |
| 984263bc | 233 | * |
| 568e6804 MD |
234 | * We only need to try the pager if this is not a default object (default |
| 235 | * objects are zero-fill and have no real pager), and if we are not taking | |
| 236 | * a wiring fault or if the FS entry is wired. | |
| 984263bc | 237 | */ |
| 568e6804 MD |
238 | #define TRYPAGER(fs) \ |
| 239 | (fs->object->type != OBJT_DEFAULT && \ | |
| 240 | (((fs->fault_flags & VM_FAULT_WIRE_MASK) == 0) || fs->wired)) | |
| 984263bc MD |
241 | |
| 242 | /* | |
| 568e6804 | 243 | * vm_fault: |
| 984263bc | 244 | * |
| 568e6804 MD |
245 | * Handle a page fault occurring at the given address, requiring the given |
| 246 | * permissions, in the map specified. If successful, the page is inserted | |
| 247 | * into the associated physical map. | |
| 984263bc | 248 | * |
| 568e6804 | 249 | * NOTE: The given address should be truncated to the proper page address. |
| 984263bc | 250 | * |
| 568e6804 MD |
251 | * KERN_SUCCESS is returned if the page fault is handled; otherwise, |
| 252 | * a standard error specifying why the fault is fatal is returned. | |
| 984263bc | 253 | * |
| 568e6804 MD |
254 | * The map in question must be referenced, and remains so. |
| 255 | * The caller may hold no locks. | |
| 9ad0147b | 256 | * No other requirements. |
| 984263bc MD |
257 | */ |
| 258 | int | |
| 259 | vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags) | |
| 260 | { | |
| 984263bc | 261 | int result; |
| 72579d2e | 262 | vm_pindex_t first_pindex; |
| 984263bc | 263 | struct faultstate fs; |
| 54341a3b | 264 | struct lwp *lp; |
| 8d496bf9 | 265 | int growstack; |
| 984263bc | 266 | |
| 54341a3b | 267 | vm_page_pcpu_cache(); |
| 568e6804 MD |
268 | fs.hardfault = 0; |
| 269 | fs.fault_flags = fault_flags; | |
| 54341a3b | 270 | fs.vp = NULL; |
| 8d496bf9 | 271 | growstack = 1; |
| 984263bc | 272 | |
| 54341a3b | 273 | if ((lp = curthread->td_lwp) != NULL) |
| 4643740a | 274 | lp->lwp_flags |= LWP_PAGING; |
| 54341a3b | 275 | |
| b12defdc MD |
276 | lwkt_gettoken(&map->token); |
| 277 | ||
| 06ecca5a | 278 | RetryFault: |
| 984263bc | 279 | /* |
| 568e6804 MD |
280 | * Find the vm_map_entry representing the backing store and resolve |
| 281 | * the top level object and page index. This may have the side | |
| 282 | * effect of executing a copy-on-write on the map entry and/or | |
| 283 | * creating a shadow object, but will not COW any actual VM pages. | |
| 284 | * | |
| 285 | * On success fs.map is left read-locked and various other fields | |
| 286 | * are initialized but not otherwise referenced or locked. | |
| 287 | * | |
| 4e7c41c5 MD |
288 | * NOTE! vm_map_lookup will try to upgrade the fault_type to |
| 289 | * VM_FAULT_WRITE if the map entry is a virtual page table and also | |
| 290 | * writable, so we can set the 'A' (accessed) bit in the virtual page | |
| 291 | * table entry. | |
| 984263bc MD |
292 | */ |
| 293 | fs.map = map; | |
| 568e6804 MD |
294 | result = vm_map_lookup(&fs.map, vaddr, fault_type, |
| 295 | &fs.entry, &fs.first_object, | |
| 72579d2e | 296 | &first_pindex, &fs.first_prot, &fs.wired); |
| 568e6804 MD |
297 | |
| 298 | /* | |
| 299 | * If the lookup failed or the map protections are incompatible, | |
| 300 | * the fault generally fails. However, if the caller is trying | |
| 301 | * to do a user wiring we have more work to do. | |
| 302 | */ | |
| 303 | if (result != KERN_SUCCESS) { | |
| 8d496bf9 MD |
304 | if (result != KERN_PROTECTION_FAILURE || |
| 305 | (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE) | |
| 306 | { | |
| 307 | if (result == KERN_INVALID_ADDRESS && growstack && | |
| 308 | map != &kernel_map && curproc != NULL) { | |
| 309 | result = vm_map_growstack(curproc, vaddr); | |
| b12defdc MD |
310 | if (result == KERN_SUCCESS) { |
| 311 | growstack = 0; | |
| 312 | goto RetryFault; | |
| 313 | } | |
| 314 | result = KERN_FAILURE; | |
| 8d496bf9 | 315 | } |
| b12defdc | 316 | goto done; |
| 8d496bf9 | 317 | } |
| 984263bc MD |
318 | |
| 319 | /* | |
| 320 | * If we are user-wiring a r/w segment, and it is COW, then | |
| 568e6804 MD |
321 | * we need to do the COW operation. Note that we don't |
| 322 | * currently COW RO sections, because it is NOT desirable | |
| 984263bc MD |
323 | * to COW .text. We simply keep .text from ever being COW'ed |
| 324 | * and take the heat that one cannot debug wired .text sections. | |
| 325 | */ | |
| 326 | result = vm_map_lookup(&fs.map, vaddr, | |
| 568e6804 MD |
327 | VM_PROT_READ|VM_PROT_WRITE| |
| 328 | VM_PROT_OVERRIDE_WRITE, | |
| 329 | &fs.entry, &fs.first_object, | |
| 72579d2e MD |
330 | &first_pindex, &fs.first_prot, |
| 331 | &fs.wired); | |
| b12defdc MD |
332 | if (result != KERN_SUCCESS) { |
| 333 | result = KERN_FAILURE; | |
| 334 | goto done; | |
| 335 | } | |
| 984263bc MD |
336 | |
| 337 | /* | |
| 338 | * If we don't COW now, on a user wire, the user will never | |
| 339 | * be able to write to the mapping. If we don't make this | |
| 340 | * restriction, the bookkeeping would be nearly impossible. | |
| 341 | */ | |
| 342 | if ((fs.entry->protection & VM_PROT_WRITE) == 0) | |
| 343 | fs.entry->max_protection &= ~VM_PROT_WRITE; | |
| 344 | } | |
| 345 | ||
| 568e6804 MD |
346 | /* |
| 347 | * fs.map is read-locked | |
| 348 | * | |
| 349 | * Misc checks. Save the map generation number to detect races. | |
| 350 | */ | |
| 351 | fs.map_generation = fs.map->timestamp; | |
| 984263bc | 352 | |
| e40cfbd7 MD |
353 | if (fs.entry->eflags & (MAP_ENTRY_NOFAULT | MAP_ENTRY_KSTACK)) { |
| 354 | if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { | |
| 355 | panic("vm_fault: fault on nofault entry, addr: %p", | |
| 356 | (void *)vaddr); | |
| 357 | } | |
| 358 | if ((fs.entry->eflags & MAP_ENTRY_KSTACK) && | |
| 359 | vaddr >= fs.entry->start && | |
| 360 | vaddr < fs.entry->start + PAGE_SIZE) { | |
| 361 | panic("vm_fault: fault on stack guard, addr: %p", | |
| 362 | (void *)vaddr); | |
| 363 | } | |
| 984263bc MD |
364 | } |
| 365 | ||
| 366 | /* | |
| c40f2b75 MD |
367 | * A system map entry may return a NULL object. No object means |
| 368 | * no pager means an unrecoverable kernel fault. | |
| 369 | */ | |
| 370 | if (fs.first_object == NULL) { | |
| 371 | panic("vm_fault: unrecoverable fault at %p in entry %p", | |
| 372 | (void *)vaddr, fs.entry); | |
| 373 | } | |
| 374 | ||
| 375 | /* | |
| 54341a3b MD |
376 | * Attempt to shortcut the fault if the lookup returns a |
| 377 | * terminal object and the page is present. This allows us | |
| 378 | * to obtain a shared token on the object instead of an exclusive | |
| 379 | * token, which theoretically should allow concurrent faults. | |
| 380 | */ | |
| 381 | if (vm_shared_fault && | |
| 382 | fs.first_object->backing_object == NULL && | |
| 383 | fs.entry->maptype == VM_MAPTYPE_NORMAL) { | |
| 384 | int error; | |
| 385 | vm_object_hold_shared(fs.first_object); | |
| 386 | /*fs.vp = vnode_pager_lock(fs.first_object);*/ | |
| 387 | fs.m = vm_page_lookup_busy_try(fs.first_object, | |
| 388 | first_pindex, | |
| 389 | TRUE, &error); | |
| 390 | if (error == 0 && fs.m) { | |
| 391 | /* | |
| 392 | * Activate the page and figure out if we can | |
| 393 | * short-cut a quick mapping. | |
| 394 | * | |
| 395 | * WARNING! We cannot call swap_pager_unswapped() | |
| 4643740a MD |
396 | * with a shared token! Note that we |
| 397 | * have to test fs.first_prot here. | |
| 54341a3b MD |
398 | */ |
| 399 | vm_page_activate(fs.m); | |
| 400 | if (fs.m->valid == VM_PAGE_BITS_ALL && | |
| 401 | ((fs.m->flags & PG_SWAPPED) == 0 || | |
| 4643740a | 402 | (fs.first_prot & VM_PROT_WRITE) == 0 || |
| 54341a3b MD |
403 | (fs.fault_flags & VM_FAULT_DIRTY) == 0)) { |
| 404 | fs.lookup_still_valid = TRUE; | |
| 405 | fs.first_m = NULL; | |
| 406 | fs.object = fs.first_object; | |
| 407 | fs.prot = fs.first_prot; | |
| 408 | if (fs.wired) | |
| 409 | fault_type = fs.first_prot; | |
| 410 | if (fs.prot & VM_PROT_WRITE) { | |
| 411 | vm_object_set_writeable_dirty( | |
| 412 | fs.m->object); | |
| 413 | vm_set_nosync(fs.m, fs.entry); | |
| 414 | if (fs.fault_flags & VM_FAULT_DIRTY) { | |
| 415 | vm_page_dirty(fs.m); | |
| 416 | /*XXX*/ | |
| 417 | swap_pager_unswapped(fs.m); | |
| 418 | } | |
| 419 | } | |
| 420 | result = KERN_SUCCESS; | |
| 421 | fault_flags |= VM_FAULT_BURST_QUICK; | |
| 422 | fault_flags &= ~VM_FAULT_BURST; | |
| 423 | ++vm_shared_hit; | |
| 424 | goto quick; | |
| 425 | } | |
| 426 | vm_page_wakeup(fs.m); | |
| 427 | fs.m = NULL; | |
| 428 | } | |
| 429 | vm_object_drop(fs.first_object); /* XXX drop on shared tok?*/ | |
| 430 | } | |
| 431 | ++vm_shared_miss; | |
| 432 | ||
| 433 | /* | |
| 984263bc MD |
434 | * Bump the paging-in-progress count to prevent size changes (e.g. |
| 435 | * truncation operations) during I/O. This must be done after | |
| 436 | * obtaining the vnode lock in order to avoid possible deadlocks. | |
| 437 | */ | |
| b4460ab3 | 438 | vm_object_hold(fs.first_object); |
| 54341a3b MD |
439 | if (fs.vp == NULL) |
| 440 | fs.vp = vnode_pager_lock(fs.first_object); | |
| 984263bc | 441 | |
| 984263bc | 442 | fs.lookup_still_valid = TRUE; |
| 984263bc | 443 | fs.first_m = NULL; |
| afeabdca | 444 | fs.object = fs.first_object; /* so unlock_and_deallocate works */ |
| 984263bc MD |
445 | |
| 446 | /* | |
| 568e6804 | 447 | * If the entry is wired we cannot change the page protection. |
| 984263bc | 448 | */ |
| 568e6804 | 449 | if (fs.wired) |
| 72579d2e | 450 | fault_type = fs.first_prot; |
| 984263bc | 451 | |
| 568e6804 | 452 | /* |
| 75f59a66 MD |
453 | * The page we want is at (first_object, first_pindex), but if the |
| 454 | * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the | |
| 455 | * page table to figure out the actual pindex. | |
| 456 | * | |
| 457 | * NOTE! DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION | |
| 458 | * ONLY | |
| 568e6804 | 459 | */ |
| 568e6804 | 460 | if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) { |
| 72579d2e | 461 | result = vm_fault_vpagetable(&fs, &first_pindex, |
| 4e7c41c5 MD |
462 | fs.entry->aux.master_pde, |
| 463 | fault_type); | |
| b12defdc MD |
464 | if (result == KERN_TRY_AGAIN) { |
| 465 | vm_object_drop(fs.first_object); | |
| 568e6804 | 466 | goto RetryFault; |
| b12defdc | 467 | } |
| 75f59a66 | 468 | if (result != KERN_SUCCESS) |
| b12defdc | 469 | goto done; |
| 568e6804 | 470 | } |
| 75f59a66 | 471 | |
| 568e6804 MD |
472 | /* |
| 473 | * Now we have the actual (object, pindex), fault in the page. If | |
| 474 | * vm_fault_object() fails it will unlock and deallocate the FS | |
| 75f59a66 | 475 | * data. If it succeeds everything remains locked and fs->object |
| 9ad0147b | 476 | * will have an additional PIP count if it is not equal to |
| 75f59a66 | 477 | * fs->first_object |
| 4e7c41c5 MD |
478 | * |
| 479 | * vm_fault_object will set fs->prot for the pmap operation. It is | |
| 480 | * allowed to set VM_PROT_WRITE if fault_type == VM_PROT_READ if the | |
| 481 | * page can be safely written. However, it will force a read-only | |
| 482 | * mapping for a read fault if the memory is managed by a virtual | |
| 483 | * page table. | |
| 568e6804 | 484 | */ |
| 2a418930 | 485 | /* BEFORE */ |
| 72579d2e | 486 | result = vm_fault_object(&fs, first_pindex, fault_type); |
| afeabdca | 487 | |
| 2a418930 | 488 | if (result == KERN_TRY_AGAIN) { |
| b12defdc | 489 | vm_object_drop(fs.first_object); |
| 568e6804 | 490 | goto RetryFault; |
| 2a418930 | 491 | } |
| b12defdc MD |
492 | if (result != KERN_SUCCESS) |
| 493 | goto done; | |
| 568e6804 | 494 | |
| 54341a3b | 495 | quick: |
| 568e6804 | 496 | /* |
| 75f59a66 MD |
497 | * On success vm_fault_object() does not unlock or deallocate, and fs.m |
| 498 | * will contain a busied page. | |
| 568e6804 MD |
499 | * |
| 500 | * Enter the page into the pmap and do pmap-related adjustments. | |
| 501 | */ | |
| 2a418930 | 502 | vm_page_flag_set(fs.m, PG_REFERENCED); |
| 568e6804 | 503 | pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired); |
| 54341a3b MD |
504 | mycpu->gd_cnt.v_vm_faults++; |
| 505 | if (curthread->td_lwp) | |
| 506 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
| 568e6804 | 507 | |
| a491077e | 508 | /*KKASSERT(fs.m->queue == PQ_NONE); page-in op may deactivate page */ |
| 2a418930 | 509 | KKASSERT(fs.m->flags & PG_BUSY); |
| 568e6804 MD |
510 | |
| 511 | /* | |
| 512 | * If the page is not wired down, then put it where the pageout daemon | |
| 513 | * can find it. | |
| 514 | */ | |
| 515 | if (fs.fault_flags & VM_FAULT_WIRE_MASK) { | |
| 516 | if (fs.wired) | |
| 517 | vm_page_wire(fs.m); | |
| 518 | else | |
| 519 | vm_page_unwire(fs.m, 1); | |
| 520 | } else { | |
| 521 | vm_page_activate(fs.m); | |
| 522 | } | |
| ce2ac249 MD |
523 | vm_page_wakeup(fs.m); |
| 524 | ||
| 525 | /* | |
| 526 | * Burst in a few more pages if possible. The fs.map should still | |
| 527 | * be locked. To avoid interlocking against a vnode->getblk | |
| 528 | * operation we had to be sure to unbusy our primary vm_page above | |
| 529 | * first. | |
| 530 | */ | |
| 531 | if (fault_flags & VM_FAULT_BURST) { | |
| 54341a3b MD |
532 | if ((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 |
| 533 | && fs.wired == 0) { | |
| 534 | vm_prefault(fs.map->pmap, vaddr, | |
| 535 | fs.entry, fs.prot, fault_flags); | |
| 536 | } | |
| 537 | } | |
| 538 | if (fault_flags & VM_FAULT_BURST_QUICK) { | |
| 539 | if ((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 | |
| 540 | && fs.wired == 0) { | |
| 541 | vm_prefault_quick(fs.map->pmap, vaddr, | |
| 542 | fs.entry, fs.prot, fault_flags); | |
| ce2ac249 MD |
543 | } |
| 544 | } | |
| 545 | ||
| 546 | /* | |
| 547 | * Unlock everything, and return | |
| 548 | */ | |
| 549 | unlock_things(&fs); | |
| 568e6804 | 550 | |
| fde7ac71 | 551 | if (curthread->td_lwp) { |
| 568e6804 | 552 | if (fs.hardfault) { |
| fde7ac71 | 553 | curthread->td_lwp->lwp_ru.ru_majflt++; |
| 568e6804 | 554 | } else { |
| fde7ac71 | 555 | curthread->td_lwp->lwp_ru.ru_minflt++; |
| 568e6804 MD |
556 | } |
| 557 | } | |
| 558 | ||
| b12defdc | 559 | /*vm_object_deallocate(fs.first_object);*/ |
| a491077e | 560 | /*fs.m = NULL; */ |
| b12defdc MD |
561 | /*fs.first_object = NULL; must still drop later */ |
| 562 | ||
| 563 | result = KERN_SUCCESS; | |
| 564 | done: | |
| 565 | if (fs.first_object) | |
| 566 | vm_object_drop(fs.first_object); | |
| 567 | lwkt_reltoken(&map->token); | |
| 54341a3b | 568 | if (lp) |
| 4643740a | 569 | lp->lwp_flags &= ~LWP_PAGING; |
| b12defdc | 570 | return (result); |
| 568e6804 MD |
571 | } |
| 572 | ||
| 573 | /* | |
| 5a0e2a66 MD |
574 | * Fault in the specified virtual address in the current process map, |
| 575 | * returning a held VM page or NULL. See vm_fault_page() for more | |
| 576 | * information. | |
| 9ad0147b MD |
577 | * |
| 578 | * No requirements. | |
| 5a0e2a66 MD |
579 | */ |
| 580 | vm_page_t | |
| 581 | vm_fault_page_quick(vm_offset_t va, vm_prot_t fault_type, int *errorp) | |
| 582 | { | |
| 287ebb09 | 583 | struct lwp *lp = curthread->td_lwp; |
| 5a0e2a66 MD |
584 | vm_page_t m; |
| 585 | ||
| 287ebb09 | 586 | m = vm_fault_page(&lp->lwp_vmspace->vm_map, va, |
| 5a0e2a66 MD |
587 | fault_type, VM_FAULT_NORMAL, errorp); |
| 588 | return(m); | |
| 589 | } | |
| 590 | ||
| 591 | /* | |
| 592 | * Fault in the specified virtual address in the specified map, doing all | |
| 4e158347 MD |
593 | * necessary manipulation of the object store and all necessary I/O. Return |
| 594 | * a held VM page or NULL, and set *errorp. The related pmap is not | |
| 595 | * updated. | |
| 596 | * | |
| 5a0e2a66 MD |
597 | * The returned page will be properly dirtied if VM_PROT_WRITE was specified, |
| 598 | * and marked PG_REFERENCED as well. | |
| 17cde63e MD |
599 | * |
| 600 | * If the page cannot be faulted writable and VM_PROT_WRITE was specified, an | |
| 601 | * error will be returned. | |
| 9ad0147b MD |
602 | * |
| 603 | * No requirements. | |
| 4e158347 MD |
604 | */ |
| 605 | vm_page_t | |
| 606 | vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, | |
| 607 | int fault_flags, int *errorp) | |
| 608 | { | |
| 4e158347 MD |
609 | vm_pindex_t first_pindex; |
| 610 | struct faultstate fs; | |
| 17cde63e MD |
611 | int result; |
| 612 | vm_prot_t orig_fault_type = fault_type; | |
| 4e158347 | 613 | |
| 4e158347 MD |
614 | fs.hardfault = 0; |
| 615 | fs.fault_flags = fault_flags; | |
| 616 | KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0); | |
| 617 | ||
| b12defdc MD |
618 | lwkt_gettoken(&map->token); |
| 619 | ||
| 4e158347 MD |
620 | RetryFault: |
| 621 | /* | |
| 622 | * Find the vm_map_entry representing the backing store and resolve | |
| 623 | * the top level object and page index. This may have the side | |
| 624 | * effect of executing a copy-on-write on the map entry and/or | |
| 625 | * creating a shadow object, but will not COW any actual VM pages. | |
| 626 | * | |
| 627 | * On success fs.map is left read-locked and various other fields | |
| 628 | * are initialized but not otherwise referenced or locked. | |
| 629 | * | |
| 630 | * NOTE! vm_map_lookup will upgrade the fault_type to VM_FAULT_WRITE | |
| 631 | * if the map entry is a virtual page table and also writable, | |
| 632 | * so we can set the 'A' (accessed) bit in the virtual page table entry. | |
| 633 | */ | |
| 634 | fs.map = map; | |
| 635 | result = vm_map_lookup(&fs.map, vaddr, fault_type, | |
| 636 | &fs.entry, &fs.first_object, | |
| 637 | &first_pindex, &fs.first_prot, &fs.wired); | |
| 638 | ||
| 639 | if (result != KERN_SUCCESS) { | |
| 640 | *errorp = result; | |
| b12defdc MD |
641 | fs.m = NULL; |
| 642 | goto done; | |
| 4e158347 MD |
643 | } |
| 644 | ||
| 645 | /* | |
| 646 | * fs.map is read-locked | |
| 647 | * | |
| 648 | * Misc checks. Save the map generation number to detect races. | |
| 649 | */ | |
| 650 | fs.map_generation = fs.map->timestamp; | |
| 651 | ||
| 652 | if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { | |
| 653 | panic("vm_fault: fault on nofault entry, addr: %lx", | |
| 654 | (u_long)vaddr); | |
| 655 | } | |
| 656 | ||
| 657 | /* | |
| 658 | * A system map entry may return a NULL object. No object means | |
| 659 | * no pager means an unrecoverable kernel fault. | |
| 660 | */ | |
| 661 | if (fs.first_object == NULL) { | |
| 662 | panic("vm_fault: unrecoverable fault at %p in entry %p", | |
| 663 | (void *)vaddr, fs.entry); | |
| 664 | } | |
| 665 | ||
| 666 | /* | |
| 667 | * Make a reference to this object to prevent its disposal while we | |
| 668 | * are messing with it. Once we have the reference, the map is free | |
| 669 | * to be diddled. Since objects reference their shadows (and copies), | |
| 670 | * they will stay around as well. | |
| 671 | * | |
| b12defdc MD |
672 | * The reference should also prevent an unexpected collapse of the |
| 673 | * parent that might move pages from the current object into the | |
| 674 | * parent unexpectedly, resulting in corruption. | |
| 675 | * | |
| 4e158347 MD |
676 | * Bump the paging-in-progress count to prevent size changes (e.g. |
| 677 | * truncation operations) during I/O. This must be done after | |
| 678 | * obtaining the vnode lock in order to avoid possible deadlocks. | |
| 679 | */ | |
| b4460ab3 | 680 | vm_object_hold(fs.first_object); |
| 4e158347 | 681 | fs.vp = vnode_pager_lock(fs.first_object); |
| 4e158347 MD |
682 | |
| 683 | fs.lookup_still_valid = TRUE; | |
| 684 | fs.first_m = NULL; | |
| 685 | fs.object = fs.first_object; /* so unlock_and_deallocate works */ | |
| 686 | ||
| 687 | /* | |
| 688 | * If the entry is wired we cannot change the page protection. | |
| 689 | */ | |
| 690 | if (fs.wired) | |
| 691 | fault_type = fs.first_prot; | |
| 692 | ||
| 693 | /* | |
| 694 | * The page we want is at (first_object, first_pindex), but if the | |
| 695 | * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the | |
| 696 | * page table to figure out the actual pindex. | |
| 697 | * | |
| 698 | * NOTE! DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION | |
| 699 | * ONLY | |
| 700 | */ | |
| 701 | if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) { | |
| 702 | result = vm_fault_vpagetable(&fs, &first_pindex, | |
| 4e7c41c5 MD |
703 | fs.entry->aux.master_pde, |
| 704 | fault_type); | |
| b12defdc MD |
705 | if (result == KERN_TRY_AGAIN) { |
| 706 | vm_object_drop(fs.first_object); | |
| 4e158347 | 707 | goto RetryFault; |
| b12defdc | 708 | } |
| 4e158347 MD |
709 | if (result != KERN_SUCCESS) { |
| 710 | *errorp = result; | |
| b12defdc MD |
711 | fs.m = NULL; |
| 712 | goto done; | |
| 4e158347 MD |
713 | } |
| 714 | } | |
| 715 | ||
| 716 | /* | |
| 717 | * Now we have the actual (object, pindex), fault in the page. If | |
| 718 | * vm_fault_object() fails it will unlock and deallocate the FS | |
| 719 | * data. If it succeeds everything remains locked and fs->object | |
| 720 | * will have an additional PIP count if it is not equal to | |
| 721 | * fs->first_object | |
| 722 | */ | |
| 723 | result = vm_fault_object(&fs, first_pindex, fault_type); | |
| 724 | ||
| b12defdc MD |
725 | if (result == KERN_TRY_AGAIN) { |
| 726 | vm_object_drop(fs.first_object); | |
| 4e158347 | 727 | goto RetryFault; |
| b12defdc | 728 | } |
| 4e158347 MD |
729 | if (result != KERN_SUCCESS) { |
| 730 | *errorp = result; | |
| b12defdc MD |
731 | fs.m = NULL; |
| 732 | goto done; | |
| 4e158347 MD |
733 | } |
| 734 | ||
| 17cde63e MD |
735 | if ((orig_fault_type & VM_PROT_WRITE) && |
| 736 | (fs.prot & VM_PROT_WRITE) == 0) { | |
| 737 | *errorp = KERN_PROTECTION_FAILURE; | |
| 738 | unlock_and_deallocate(&fs); | |
| b12defdc MD |
739 | fs.m = NULL; |
| 740 | goto done; | |
| 17cde63e MD |
741 | } |
| 742 | ||
| 4e158347 | 743 | /* |
| d2d8515b MD |
744 | * Update the pmap. We really only have to do this if a COW |
| 745 | * occurred to replace the read-only page with the new page. For | |
| 746 | * now just do it unconditionally. XXX | |
| 747 | */ | |
| 748 | pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired); | |
| 749 | vm_page_flag_set(fs.m, PG_REFERENCED); | |
| 54341a3b MD |
750 | mycpu->gd_cnt.v_vm_faults++; |
| 751 | if (curthread->td_lwp) | |
| 752 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
| d2d8515b MD |
753 | |
| 754 | /* | |
| 4e158347 | 755 | * On success vm_fault_object() does not unlock or deallocate, and fs.m |
| d2d8515b MD |
756 | * will contain a busied page. So we must unlock here after having |
| 757 | * messed with the pmap. | |
| 4e158347 MD |
758 | */ |
| 759 | unlock_things(&fs); | |
| 760 | ||
| 761 | /* | |
| 762 | * Return a held page. We are not doing any pmap manipulation so do | |
| 5a0e2a66 MD |
763 | * not set PG_MAPPED. However, adjust the page flags according to |
| 764 | * the fault type because the caller may not use a managed pmapping | |
| 765 | * (so we don't want to lose the fact that the page will be dirtied | |
| 766 | * if a write fault was specified). | |
| 4e158347 | 767 | */ |
| 5a0e2a66 | 768 | vm_page_hold(fs.m); |
| 54341a3b | 769 | vm_page_activate(fs.m); |
| 5a0e2a66 MD |
770 | if (fault_type & VM_PROT_WRITE) |
| 771 | vm_page_dirty(fs.m); | |
| 4e158347 | 772 | |
| 4e158347 MD |
773 | if (curthread->td_lwp) { |
| 774 | if (fs.hardfault) { | |
| 775 | curthread->td_lwp->lwp_ru.ru_majflt++; | |
| 776 | } else { | |
| 777 | curthread->td_lwp->lwp_ru.ru_minflt++; | |
| 778 | } | |
| 779 | } | |
| 780 | ||
| 781 | /* | |
| 782 | * Unlock everything, and return the held page. | |
| 783 | */ | |
| 784 | vm_page_wakeup(fs.m); | |
| b12defdc | 785 | /*vm_object_deallocate(fs.first_object);*/ |
| a491077e | 786 | /*fs.first_object = NULL; */ |
| 4e158347 | 787 | *errorp = 0; |
| b12defdc MD |
788 | |
| 789 | done: | |
| 790 | if (fs.first_object) | |
| 791 | vm_object_drop(fs.first_object); | |
| 792 | lwkt_reltoken(&map->token); | |
| 4e158347 MD |
793 | return(fs.m); |
| 794 | } | |
| 795 | ||
| 796 | /* | |
| 17cde63e MD |
797 | * Fault in the specified (object,offset), dirty the returned page as |
| 798 | * needed. If the requested fault_type cannot be done NULL and an | |
| 799 | * error is returned. | |
| 9ad0147b MD |
800 | * |
| 801 | * A held (but not busied) page is returned. | |
| 802 | * | |
| 803 | * No requirements. | |
| aa542ad5 MD |
804 | */ |
| 805 | vm_page_t | |
| 806 | vm_fault_object_page(vm_object_t object, vm_ooffset_t offset, | |
| 807 | vm_prot_t fault_type, int fault_flags, int *errorp) | |
| 808 | { | |
| 809 | int result; | |
| 810 | vm_pindex_t first_pindex; | |
| 811 | struct faultstate fs; | |
| 812 | struct vm_map_entry entry; | |
| 813 | ||
| b12defdc | 814 | ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); |
| aa542ad5 MD |
815 | bzero(&entry, sizeof(entry)); |
| 816 | entry.object.vm_object = object; | |
| 817 | entry.maptype = VM_MAPTYPE_NORMAL; | |
| 818 | entry.protection = entry.max_protection = fault_type; | |
| 819 | ||
| aa542ad5 MD |
820 | fs.hardfault = 0; |
| 821 | fs.fault_flags = fault_flags; | |
| 822 | fs.map = NULL; | |
| 823 | KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0); | |
| 824 | ||
| 825 | RetryFault: | |
| 826 | ||
| 827 | fs.first_object = object; | |
| 828 | first_pindex = OFF_TO_IDX(offset); | |
| 829 | fs.entry = &entry; | |
| 830 | fs.first_prot = fault_type; | |
| 831 | fs.wired = 0; | |
| 832 | /*fs.map_generation = 0; unused */ | |
| 833 | ||
| 834 | /* | |
| 835 | * Make a reference to this object to prevent its disposal while we | |
| 836 | * are messing with it. Once we have the reference, the map is free | |
| 837 | * to be diddled. Since objects reference their shadows (and copies), | |
| 838 | * they will stay around as well. | |
| 839 | * | |
| b12defdc MD |
840 | * The reference should also prevent an unexpected collapse of the |
| 841 | * parent that might move pages from the current object into the | |
| 842 | * parent unexpectedly, resulting in corruption. | |
| 843 | * | |
| aa542ad5 MD |
844 | * Bump the paging-in-progress count to prevent size changes (e.g. |
| 845 | * truncation operations) during I/O. This must be done after | |
| 846 | * obtaining the vnode lock in order to avoid possible deadlocks. | |
| 847 | */ | |
| aa542ad5 | 848 | fs.vp = vnode_pager_lock(fs.first_object); |
| aa542ad5 MD |
849 | |
| 850 | fs.lookup_still_valid = TRUE; | |
| 851 | fs.first_m = NULL; | |
| 852 | fs.object = fs.first_object; /* so unlock_and_deallocate works */ | |
| 853 | ||
| 854 | #if 0 | |
| 855 | /* XXX future - ability to operate on VM object using vpagetable */ | |
| 856 | if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) { | |
| 857 | result = vm_fault_vpagetable(&fs, &first_pindex, | |
| 858 | fs.entry->aux.master_pde, | |
| 859 | fault_type); | |
| 860 | if (result == KERN_TRY_AGAIN) | |
| 861 | goto RetryFault; | |
| 862 | if (result != KERN_SUCCESS) { | |
| 863 | *errorp = result; | |
| 864 | return (NULL); | |
| 865 | } | |
| 866 | } | |
| 867 | #endif | |
| 868 | ||
| 869 | /* | |
| 870 | * Now we have the actual (object, pindex), fault in the page. If | |
| 871 | * vm_fault_object() fails it will unlock and deallocate the FS | |
| 872 | * data. If it succeeds everything remains locked and fs->object | |
| 873 | * will have an additinal PIP count if it is not equal to | |
| 874 | * fs->first_object | |
| 875 | */ | |
| 876 | result = vm_fault_object(&fs, first_pindex, fault_type); | |
| 877 | ||
| 878 | if (result == KERN_TRY_AGAIN) | |
| 879 | goto RetryFault; | |
| 880 | if (result != KERN_SUCCESS) { | |
| 881 | *errorp = result; | |
| 882 | return(NULL); | |
| 883 | } | |
| 884 | ||
| 17cde63e MD |
885 | if ((fault_type & VM_PROT_WRITE) && (fs.prot & VM_PROT_WRITE) == 0) { |
| 886 | *errorp = KERN_PROTECTION_FAILURE; | |
| 887 | unlock_and_deallocate(&fs); | |
| 888 | return(NULL); | |
| 889 | } | |
| 890 | ||
| aa542ad5 | 891 | /* |
| d2d8515b MD |
892 | * On success vm_fault_object() does not unlock or deallocate, so we |
| 893 | * do it here. Note that the returned fs.m will be busied. | |
| aa542ad5 MD |
894 | */ |
| 895 | unlock_things(&fs); | |
| 896 | ||
| 897 | /* | |
| 898 | * Return a held page. We are not doing any pmap manipulation so do | |
| 899 | * not set PG_MAPPED. However, adjust the page flags according to | |
| 900 | * the fault type because the caller may not use a managed pmapping | |
| 901 | * (so we don't want to lose the fact that the page will be dirtied | |
| 902 | * if a write fault was specified). | |
| 903 | */ | |
| 904 | vm_page_hold(fs.m); | |
| 54341a3b MD |
905 | vm_page_activate(fs.m); |
| 906 | if ((fault_type & VM_PROT_WRITE) || (fault_flags & VM_FAULT_DIRTY)) | |
| 9f3543c6 MD |
907 | vm_page_dirty(fs.m); |
| 908 | if (fault_flags & VM_FAULT_UNSWAP) | |
| 909 | swap_pager_unswapped(fs.m); | |
| 910 | ||
| aa542ad5 MD |
911 | /* |
| 912 | * Indicate that the page was accessed. | |
| 913 | */ | |
| 914 | vm_page_flag_set(fs.m, PG_REFERENCED); | |
| 915 | ||
| aa542ad5 MD |
916 | if (curthread->td_lwp) { |
| 917 | if (fs.hardfault) { | |
| aa542ad5 MD |
918 | curthread->td_lwp->lwp_ru.ru_majflt++; |
| 919 | } else { | |
| 920 | curthread->td_lwp->lwp_ru.ru_minflt++; | |
| 921 | } | |
| 922 | } | |
| 923 | ||
| 924 | /* | |
| 925 | * Unlock everything, and return the held page. | |
| 926 | */ | |
| 927 | vm_page_wakeup(fs.m); | |
| b12defdc | 928 | /*vm_object_deallocate(fs.first_object);*/ |
| a491077e | 929 | /*fs.first_object = NULL; */ |
| aa542ad5 MD |
930 | |
| 931 | *errorp = 0; | |
| 932 | return(fs.m); | |
| 933 | } | |
| 934 | ||
| 935 | /* | |
| 72579d2e | 936 | * Translate the virtual page number (first_pindex) that is relative |
| afeabdca MD |
937 | * to the address space into a logical page number that is relative to the |
| 938 | * backing object. Use the virtual page table pointed to by (vpte). | |
| 939 | * | |
| 940 | * This implements an N-level page table. Any level can terminate the | |
| 941 | * scan by setting VPTE_PS. A linear mapping is accomplished by setting | |
| 942 | * VPTE_PS in the master page directory entry set via mcontrol(MADV_SETMAP). | |
| 943 | */ | |
| 944 | static | |
| 945 | int | |
| 4e7c41c5 MD |
946 | vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex, |
| 947 | vpte_t vpte, int fault_type) | |
| afeabdca | 948 | { |
| 5c5185ae | 949 | struct lwbuf *lwb; |
| 7a683a24 | 950 | struct lwbuf lwb_cache; |
| 8608b858 | 951 | int vshift = VPTE_FRAME_END - PAGE_SHIFT; /* index bits remaining */ |
| 72579d2e | 952 | int result = KERN_SUCCESS; |
| 4e7c41c5 | 953 | vpte_t *ptep; |
| afeabdca | 954 | |
| b12defdc | 955 | ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object)); |
| afeabdca | 956 | for (;;) { |
| 4e7c41c5 MD |
957 | /* |
| 958 | * We cannot proceed if the vpte is not valid, not readable | |
| 959 | * for a read fault, or not writable for a write fault. | |
| 960 | */ | |
| afeabdca MD |
961 | if ((vpte & VPTE_V) == 0) { |
| 962 | unlock_and_deallocate(fs); | |
| 963 | return (KERN_FAILURE); | |
| 964 | } | |
| 4e7c41c5 MD |
965 | if ((fault_type & VM_PROT_READ) && (vpte & VPTE_R) == 0) { |
| 966 | unlock_and_deallocate(fs); | |
| 967 | return (KERN_FAILURE); | |
| 968 | } | |
| 969 | if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_W) == 0) { | |
| 970 | unlock_and_deallocate(fs); | |
| 971 | return (KERN_FAILURE); | |
| 972 | } | |
| afeabdca MD |
973 | if ((vpte & VPTE_PS) || vshift == 0) |
| 974 | break; | |
| 975 | KKASSERT(vshift >= VPTE_PAGE_BITS); | |
| 976 | ||
| 977 | /* | |
| 4e7c41c5 MD |
978 | * Get the page table page. Nominally we only read the page |
| 979 | * table, but since we are actively setting VPTE_M and VPTE_A, | |
| 980 | * tell vm_fault_object() that we are writing it. | |
| 981 | * | |
| 982 | * There is currently no real need to optimize this. | |
| afeabdca | 983 | */ |
| 61cddc1c | 984 | result = vm_fault_object(fs, (vpte & VPTE_FRAME) >> PAGE_SHIFT, |
| 0035dca9 | 985 | VM_PROT_READ|VM_PROT_WRITE); |
| afeabdca MD |
986 | if (result != KERN_SUCCESS) |
| 987 | return (result); | |
| 988 | ||
| 989 | /* | |
| 990 | * Process the returned fs.m and look up the page table | |
| 991 | * entry in the page table page. | |
| 992 | */ | |
| 993 | vshift -= VPTE_PAGE_BITS; | |
| 7a683a24 | 994 | lwb = lwbuf_alloc(fs->m, &lwb_cache); |
| 5c5185ae | 995 | ptep = ((vpte_t *)lwbuf_kva(lwb) + |
| 4e7c41c5 MD |
996 | ((*pindex >> vshift) & VPTE_PAGE_MASK)); |
| 997 | vpte = *ptep; | |
| 998 | ||
| 999 | /* | |
| 1000 | * Page table write-back. If the vpte is valid for the | |
| 1001 | * requested operation, do a write-back to the page table. | |
| 1002 | * | |
| 1003 | * XXX VPTE_M is not set properly for page directory pages. | |
| 1004 | * It doesn't get set in the page directory if the page table | |
| 1005 | * is modified during a read access. | |
| 1006 | */ | |
| 54341a3b | 1007 | vm_page_activate(fs->m); |
| 4e7c41c5 MD |
1008 | if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) && |
| 1009 | (vpte & VPTE_W)) { | |
| 0035dca9 | 1010 | if ((vpte & (VPTE_M|VPTE_A)) != (VPTE_M|VPTE_A)) { |
| 8608b858 | 1011 | atomic_set_long(ptep, VPTE_M | VPTE_A); |
| 4e7c41c5 MD |
1012 | vm_page_dirty(fs->m); |
| 1013 | } | |
| 1014 | } | |
| 1015 | if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V) && | |
| 1016 | (vpte & VPTE_R)) { | |
| 1017 | if ((vpte & VPTE_A) == 0) { | |
| 61cddc1c | 1018 | atomic_set_long(ptep, VPTE_A); |
| 4e7c41c5 MD |
1019 | vm_page_dirty(fs->m); |
| 1020 | } | |
| 1021 | } | |
| 5c5185ae | 1022 | lwbuf_free(lwb); |
| afeabdca | 1023 | vm_page_flag_set(fs->m, PG_REFERENCED); |
| afeabdca | 1024 | vm_page_wakeup(fs->m); |
| a491077e | 1025 | fs->m = NULL; |
| afeabdca MD |
1026 | cleanup_successful_fault(fs); |
| 1027 | } | |
| afeabdca MD |
1028 | /* |
| 1029 | * Combine remaining address bits with the vpte. | |
| 1030 | */ | |
| 61cddc1c JG |
1031 | /* JG how many bits from each? */ |
| 1032 | *pindex = ((vpte & VPTE_FRAME) >> PAGE_SHIFT) + | |
| 1033 | (*pindex & ((1L << vshift) - 1)); | |
| afeabdca MD |
1034 | return (KERN_SUCCESS); |
| 1035 | } | |
| 1036 | ||
| 1037 | ||
| 1038 | /* | |
| 9ad0147b MD |
1039 | * This is the core of the vm_fault code. |
| 1040 | * | |
| 72579d2e | 1041 | * Do all operations required to fault-in (fs.first_object, pindex). Run |
| 568e6804 MD |
1042 | * through the shadow chain as necessary and do required COW or virtual |
| 1043 | * copy operations. The caller has already fully resolved the vm_map_entry | |
| 1044 | * and, if appropriate, has created a copy-on-write layer. All we need to | |
| 1045 | * do is iterate the object chain. | |
| 1046 | * | |
| 1047 | * On failure (fs) is unlocked and deallocated and the caller may return or | |
| 75f59a66 MD |
1048 | * retry depending on the failure code. On success (fs) is NOT unlocked or |
| 1049 | * deallocated, fs.m will contain a resolved, busied page, and fs.object | |
| 1050 | * will have an additional PIP count if it is not equal to fs.first_object. | |
| 9ad0147b | 1051 | * |
| b12defdc | 1052 | * fs->first_object must be held on call. |
| 568e6804 MD |
1053 | */ |
| 1054 | static | |
| 1055 | int | |
| 72579d2e MD |
1056 | vm_fault_object(struct faultstate *fs, |
| 1057 | vm_pindex_t first_pindex, vm_prot_t fault_type) | |
| 568e6804 MD |
1058 | { |
| 1059 | vm_object_t next_object; | |
| 72579d2e | 1060 | vm_pindex_t pindex; |
| b12defdc | 1061 | int error; |
| 568e6804 | 1062 | |
| b12defdc | 1063 | ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object)); |
| 72579d2e MD |
1064 | fs->prot = fs->first_prot; |
| 1065 | fs->object = fs->first_object; | |
| 1066 | pindex = first_pindex; | |
| 1067 | ||
| b12defdc MD |
1068 | vm_object_chain_acquire(fs->first_object); |
| 1069 | vm_object_pip_add(fs->first_object, 1); | |
| 1070 | ||
| 4e7c41c5 MD |
1071 | /* |
| 1072 | * If a read fault occurs we try to make the page writable if | |
| 1073 | * possible. There are three cases where we cannot make the | |
| 1074 | * page mapping writable: | |
| 1075 | * | |
| 1076 | * (1) The mapping is read-only or the VM object is read-only, | |
| 0035dca9 | 1077 | * fs->prot above will simply not have VM_PROT_WRITE set. |
| 4e7c41c5 MD |
1078 | * |
| 1079 | * (2) If the mapping is a virtual page table we need to be able | |
| 70fc5283 MD |
1080 | * to detect writes so we can set VPTE_M in the virtual page |
| 1081 | * table. | |
| 4e7c41c5 MD |
1082 | * |
| 1083 | * (3) If the VM page is read-only or copy-on-write, upgrading would | |
| 1084 | * just result in an unnecessary COW fault. | |
| 0035dca9 MD |
1085 | * |
| 1086 | * VM_PROT_VPAGED is set if faulting via a virtual page table and | |
| 1087 | * causes adjustments to the 'M'odify bit to also turn off write | |
| 1088 | * access to force a re-fault. | |
| 4e7c41c5 | 1089 | */ |
| 0035dca9 MD |
1090 | if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) { |
| 1091 | if ((fault_type & VM_PROT_WRITE) == 0) | |
| 1092 | fs->prot &= ~VM_PROT_WRITE; | |
| 4e7c41c5 MD |
1093 | } |
| 1094 | ||
| b12defdc | 1095 | /* vm_object_hold(fs->object); implied b/c object == first_object */ |
| 9ad0147b | 1096 | |
| 568e6804 MD |
1097 | for (;;) { |
| 1098 | /* | |
| d2d8515b MD |
1099 | * The entire backing chain from first_object to object |
| 1100 | * inclusive is chainlocked. | |
| 1101 | * | |
| 568e6804 MD |
1102 | * If the object is dead, we stop here |
| 1103 | */ | |
| 1104 | if (fs->object->flags & OBJ_DEAD) { | |
| b12defdc MD |
1105 | vm_object_pip_wakeup(fs->first_object); |
| 1106 | vm_object_chain_release_all(fs->first_object, | |
| 1107 | fs->object); | |
| 1108 | if (fs->object != fs->first_object) | |
| 1109 | vm_object_drop(fs->object); | |
| 568e6804 | 1110 | unlock_and_deallocate(fs); |
| 984263bc MD |
1111 | return (KERN_PROTECTION_FAILURE); |
| 1112 | } | |
| 1113 | ||
| 1114 | /* | |
| b12defdc MD |
1115 | * See if the page is resident. Wait/Retry if the page is |
| 1116 | * busy (lots of stuff may have changed so we can't continue | |
| 1117 | * in that case). | |
| 1118 | * | |
| 1119 | * We can theoretically allow the soft-busy case on a read | |
| 1120 | * fault if the page is marked valid, but since such | |
| 1121 | * pages are typically already pmap'd, putting that | |
| 1122 | * special case in might be more effort than it is | |
| 1123 | * worth. We cannot under any circumstances mess | |
| 1124 | * around with a vm_page_t->busy page except, perhaps, | |
| 1125 | * to pmap it. | |
| 984263bc | 1126 | */ |
| b12defdc MD |
1127 | fs->m = vm_page_lookup_busy_try(fs->object, pindex, |
| 1128 | TRUE, &error); | |
| 1129 | if (error) { | |
| 1130 | vm_object_pip_wakeup(fs->first_object); | |
| 1131 | vm_object_chain_release_all(fs->first_object, | |
| 1132 | fs->object); | |
| 1133 | if (fs->object != fs->first_object) | |
| 1134 | vm_object_drop(fs->object); | |
| 1135 | unlock_things(fs); | |
| 1136 | vm_page_sleep_busy(fs->m, TRUE, "vmpfw"); | |
| 1137 | mycpu->gd_cnt.v_intrans++; | |
| 1138 | /*vm_object_deallocate(fs->first_object);*/ | |
| 1139 | /*fs->first_object = NULL;*/ | |
| 1140 | fs->m = NULL; | |
| 1141 | return (KERN_TRY_AGAIN); | |
| 1142 | } | |
| 1143 | if (fs->m) { | |
| 984263bc | 1144 | /* |
| b12defdc | 1145 | * The page is busied for us. |
| 984263bc | 1146 | * |
| 568e6804 MD |
1147 | * If reactivating a page from PQ_CACHE we may have |
| 1148 | * to rate-limit. | |
| 1149 | */ | |
| b12defdc | 1150 | int queue = fs->m->queue; |
| 568e6804 | 1151 | vm_page_unqueue_nowakeup(fs->m); |
| 984263bc | 1152 | |
| 568e6804 MD |
1153 | if ((queue - fs->m->pc) == PQ_CACHE && |
| 1154 | vm_page_count_severe()) { | |
| 1155 | vm_page_activate(fs->m); | |
| b12defdc MD |
1156 | vm_page_wakeup(fs->m); |
| 1157 | fs->m = NULL; | |
| 1158 | vm_object_pip_wakeup(fs->first_object); | |
| 1159 | vm_object_chain_release_all(fs->first_object, | |
| 1160 | fs->object); | |
| 1161 | if (fs->object != fs->first_object) | |
| 1162 | vm_object_drop(fs->object); | |
| 568e6804 | 1163 | unlock_and_deallocate(fs); |
| 659c6a07 | 1164 | vm_waitpfault(); |
| 568e6804 | 1165 | return (KERN_TRY_AGAIN); |
| 984263bc MD |
1166 | } |
| 1167 | ||
| 1168 | /* | |
| b12defdc MD |
1169 | * If it still isn't completely valid (readable), |
| 1170 | * or if a read-ahead-mark is set on the VM page, | |
| 1171 | * jump to readrest, else we found the page and | |
| 1172 | * can return. | |
| 06ecca5a MD |
1173 | * |
| 1174 | * We can release the spl once we have marked the | |
| 1175 | * page busy. | |
| 984263bc | 1176 | */ |
| cf1bb2a8 MD |
1177 | if (fs->m->object != &kernel_object) { |
| 1178 | if ((fs->m->valid & VM_PAGE_BITS_ALL) != | |
| 1179 | VM_PAGE_BITS_ALL) { | |
| 1180 | goto readrest; | |
| 1181 | } | |
| 1182 | if (fs->m->flags & PG_RAM) { | |
| 1183 | if (debug_cluster) | |
| 1184 | kprintf("R"); | |
| 1185 | vm_page_flag_clear(fs->m, PG_RAM); | |
| 1186 | goto readrest; | |
| 1187 | } | |
| 984263bc | 1188 | } |
| 568e6804 | 1189 | break; /* break to PAGE HAS BEEN FOUND */ |
| 984263bc MD |
1190 | } |
| 1191 | ||
| 1192 | /* | |
| 1193 | * Page is not resident, If this is the search termination | |
| 1194 | * or the pager might contain the page, allocate a new page. | |
| 1195 | */ | |
| 568e6804 MD |
1196 | if (TRYPAGER(fs) || fs->object == fs->first_object) { |
| 1197 | /* | |
| 1198 | * If the page is beyond the object size we fail | |
| 1199 | */ | |
| 72579d2e | 1200 | if (pindex >= fs->object->size) { |
| b12defdc MD |
1201 | vm_object_pip_wakeup(fs->first_object); |
| 1202 | vm_object_chain_release_all(fs->first_object, | |
| 1203 | fs->object); | |
| 1204 | if (fs->object != fs->first_object) | |
| 1205 | vm_object_drop(fs->object); | |
| 568e6804 | 1206 | unlock_and_deallocate(fs); |
| 984263bc MD |
1207 | return (KERN_PROTECTION_FAILURE); |
| 1208 | } | |
| 1209 | ||
| 1210 | /* | |
| 1211 | * Allocate a new page for this object/offset pair. | |
| d2d8515b MD |
1212 | * |
| 1213 | * It is possible for the allocation to race, so | |
| 1214 | * handle the case. | |
| 984263bc | 1215 | */ |
| 568e6804 | 1216 | fs->m = NULL; |
| 984263bc | 1217 | if (!vm_page_count_severe()) { |
| 72579d2e | 1218 | fs->m = vm_page_alloc(fs->object, pindex, |
| b12defdc | 1219 | ((fs->vp || fs->object->backing_object) ? |
| d2d8515b MD |
1220 | VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL : |
| 1221 | VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL | | |
| 54341a3b | 1222 | VM_ALLOC_USE_GD | VM_ALLOC_ZERO)); |
| 984263bc | 1223 | } |
| 568e6804 | 1224 | if (fs->m == NULL) { |
| b12defdc MD |
1225 | vm_object_pip_wakeup(fs->first_object); |
| 1226 | vm_object_chain_release_all(fs->first_object, | |
| 1227 | fs->object); | |
| 1228 | if (fs->object != fs->first_object) | |
| 1229 | vm_object_drop(fs->object); | |
| 568e6804 | 1230 | unlock_and_deallocate(fs); |
| 659c6a07 | 1231 | vm_waitpfault(); |
| 568e6804 | 1232 | return (KERN_TRY_AGAIN); |
| 984263bc | 1233 | } |
| b12defdc MD |
1234 | |
| 1235 | /* | |
| 1236 | * Fall through to readrest. We have a new page which | |
| 1237 | * will have to be paged (since m->valid will be 0). | |
| 1238 | */ | |
| 984263bc MD |
1239 | } |
| 1240 | ||
| 1241 | readrest: | |
| 1242 | /* | |
| 1b9d3514 | 1243 | * We have found an invalid or partially valid page, a |
| 1c9602b3 MD |
1244 | * page with a read-ahead mark which might be partially or |
| 1245 | * fully valid (and maybe dirty too), or we have allocated | |
| 1246 | * a new page. | |
| 984263bc MD |
1247 | * |
| 1248 | * Attempt to fault-in the page if there is a chance that the | |
| 1249 | * pager has it, and potentially fault in additional pages | |
| 1250 | * at the same time. | |
| 06ecca5a | 1251 | * |
| d2d8515b MD |
1252 | * If TRYPAGER is true then fs.m will be non-NULL and busied |
| 1253 | * for us. | |
| 984263bc | 1254 | */ |
| 568e6804 | 1255 | if (TRYPAGER(fs)) { |
| 984263bc | 1256 | int rv; |
| 1b9d3514 | 1257 | int seqaccess; |
| 568e6804 | 1258 | u_char behavior = vm_map_entry_behavior(fs->entry); |
| 984263bc | 1259 | |
| 1b9d3514 MD |
1260 | if (behavior == MAP_ENTRY_BEHAV_RANDOM) |
| 1261 | seqaccess = 0; | |
| 1262 | else | |
| 1263 | seqaccess = -1; | |
| 984263bc | 1264 | |
| 54341a3b | 1265 | #if 0 |
| 1b9d3514 MD |
1266 | /* |
| 1267 | * If sequential access is detected then attempt | |
| 1268 | * to deactivate/cache pages behind the scan to | |
| 1269 | * prevent resource hogging. | |
| 1270 | * | |
| 1271 | * Use of PG_RAM to detect sequential access | |
| 1272 | * also simulates multi-zone sequential access | |
| 1273 | * detection for free. | |
| 1274 | * | |
| 1275 | * NOTE: Partially valid dirty pages cannot be | |
| 1276 | * deactivated without causing NFS piecemeal | |
| 1277 | * writes to barf. | |
| 1278 | */ | |
| 568e6804 | 1279 | if ((fs->first_object->type != OBJT_DEVICE) && |
| 984263bc MD |
1280 | (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL || |
| 1281 | (behavior != MAP_ENTRY_BEHAV_RANDOM && | |
| 1b9d3514 | 1282 | (fs->m->flags & PG_RAM))) |
| 984263bc | 1283 | ) { |
| 1b9d3514 MD |
1284 | vm_pindex_t scan_pindex; |
| 1285 | int scan_count = 16; | |
| 1286 | ||
| 1287 | if (first_pindex < 16) { | |
| 1288 | scan_pindex = 0; | |
| 1289 | scan_count = 0; | |
| 1290 | } else { | |
| 1291 | scan_pindex = first_pindex - 16; | |
| 1292 | if (scan_pindex < 16) | |
| 1293 | scan_count = scan_pindex; | |
| 1294 | else | |
| 1295 | scan_count = 16; | |
| 1296 | } | |
| 984263bc | 1297 | |
| 1b9d3514 | 1298 | while (scan_count) { |
| 984263bc | 1299 | vm_page_t mt; |
| 568e6804 | 1300 | |
| 1b9d3514 MD |
1301 | mt = vm_page_lookup(fs->first_object, |
| 1302 | scan_pindex); | |
| b12defdc MD |
1303 | if (mt == NULL) |
| 1304 | break; | |
| 1305 | if (vm_page_busy_try(mt, TRUE)) | |
| 1306 | goto skip; | |
| 1307 | ||
| 1308 | if (mt->valid != VM_PAGE_BITS_ALL) { | |
| 1309 | vm_page_wakeup(mt); | |
| 984263bc | 1310 | break; |
| 1b9d3514 | 1311 | } |
| b12defdc MD |
1312 | if ((mt->flags & |
| 1313 | (PG_FICTITIOUS | PG_UNMANAGED)) || | |
| 1b9d3514 MD |
1314 | mt->hold_count || |
| 1315 | mt->wire_count) { | |
| b12defdc | 1316 | vm_page_wakeup(mt); |
| 1b9d3514 MD |
1317 | goto skip; |
| 1318 | } | |
| 984263bc MD |
1319 | if (mt->dirty == 0) |
| 1320 | vm_page_test_dirty(mt); | |
| 1321 | if (mt->dirty) { | |
| 1b9d3514 MD |
1322 | vm_page_protect(mt, |
| 1323 | VM_PROT_NONE); | |
| 984263bc | 1324 | vm_page_deactivate(mt); |
| 17cde63e | 1325 | vm_page_wakeup(mt); |
| 984263bc MD |
1326 | } else { |
| 1327 | vm_page_cache(mt); | |
| 1328 | } | |
| 1b9d3514 MD |
1329 | skip: |
| 1330 | --scan_count; | |
| 1331 | --scan_pindex; | |
| 984263bc MD |
1332 | } |
| 1333 | ||
| 1b9d3514 | 1334 | seqaccess = 1; |
| 984263bc | 1335 | } |
| 54341a3b | 1336 | #endif |
| 984263bc MD |
1337 | |
| 1338 | /* | |
| 1b9d3514 MD |
1339 | * Avoid deadlocking against the map when doing I/O. |
| 1340 | * fs.object and the page is PG_BUSY'd. | |
| 6e9c0867 MD |
1341 | * |
| 1342 | * NOTE: Once unlocked, fs->entry can become stale | |
| 1343 | * so this will NULL it out. | |
| 1344 | * | |
| 1345 | * NOTE: fs->entry is invalid until we relock the | |
| 1346 | * map and verify that the timestamp has not | |
| 1347 | * changed. | |
| 984263bc | 1348 | */ |
| 1b9d3514 | 1349 | unlock_map(fs); |
| 984263bc MD |
1350 | |
| 1351 | /* | |
| 1b9d3514 MD |
1352 | * Acquire the page data. We still hold a ref on |
| 1353 | * fs.object and the page has been PG_BUSY's. | |
| 1354 | * | |
| 1355 | * The pager may replace the page (for example, in | |
| 1356 | * order to enter a fictitious page into the | |
| 1357 | * object). If it does so it is responsible for | |
| 1358 | * cleaning up the passed page and properly setting | |
| 1359 | * the new page PG_BUSY. | |
| 1c9602b3 MD |
1360 | * |
| 1361 | * If we got here through a PG_RAM read-ahead | |
| 1362 | * mark the page may be partially dirty and thus | |
| 1363 | * not freeable. Don't bother checking to see | |
| 1364 | * if the pager has the page because we can't free | |
| 1365 | * it anyway. We have to depend on the get_page | |
| 1366 | * operation filling in any gaps whether there is | |
| 1367 | * backing store or not. | |
| 984263bc | 1368 | */ |
| 1c9602b3 | 1369 | rv = vm_pager_get_page(fs->object, &fs->m, seqaccess); |
| 984263bc MD |
1370 | |
| 1371 | if (rv == VM_PAGER_OK) { | |
| 1372 | /* | |
| 984263bc MD |
1373 | * Relookup in case pager changed page. Pager |
| 1374 | * is responsible for disposition of old page | |
| 1375 | * if moved. | |
| 06ecca5a MD |
1376 | * |
| 1377 | * XXX other code segments do relookups too. | |
| 1378 | * It's a bad abstraction that needs to be | |
| 1379 | * fixed/removed. | |
| 984263bc | 1380 | */ |
| 72579d2e | 1381 | fs->m = vm_page_lookup(fs->object, pindex); |
| 568e6804 | 1382 | if (fs->m == NULL) { |
| b12defdc MD |
1383 | vm_object_pip_wakeup(fs->first_object); |
| 1384 | vm_object_chain_release_all( | |
| 1385 | fs->first_object, fs->object); | |
| 1386 | if (fs->object != fs->first_object) | |
| 1387 | vm_object_drop(fs->object); | |
| 568e6804 MD |
1388 | unlock_and_deallocate(fs); |
| 1389 | return (KERN_TRY_AGAIN); | |
| 984263bc MD |
1390 | } |
| 1391 | ||
| 568e6804 | 1392 | ++fs->hardfault; |
| 984263bc MD |
1393 | break; /* break to PAGE HAS BEEN FOUND */ |
| 1394 | } | |
| 568e6804 | 1395 | |
| 984263bc MD |
1396 | /* |
| 1397 | * Remove the bogus page (which does not exist at this | |
| 1398 | * object/offset); before doing so, we must get back | |
| 1399 | * our object lock to preserve our invariant. | |
| 1400 | * | |
| 1401 | * Also wake up any other process that may want to bring | |
| 1402 | * in this page. | |
| 1403 | * | |
| 1404 | * If this is the top-level object, we must leave the | |
| 1405 | * busy page to prevent another process from rushing | |
| 1406 | * past us, and inserting the page in that object at | |
| 1407 | * the same time that we are. | |
| 1408 | */ | |
| a0bc8638 | 1409 | if (rv == VM_PAGER_ERROR) { |
| b12defdc MD |
1410 | if (curproc) { |
| 1411 | kprintf("vm_fault: pager read error, " | |
| 1412 | "pid %d (%s)\n", | |
| 1413 | curproc->p_pid, | |
| 1414 | curproc->p_comm); | |
| 1415 | } else { | |
| 1416 | kprintf("vm_fault: pager read error, " | |
| 1417 | "thread %p (%s)\n", | |
| 1418 | curthread, | |
| 1419 | curproc->p_comm); | |
| 1420 | } | |
| a0bc8638 | 1421 | } |
| 1b9d3514 | 1422 | |
| 984263bc MD |
1423 | /* |
| 1424 | * Data outside the range of the pager or an I/O error | |
| a55afca2 MD |
1425 | * |
| 1426 | * The page may have been wired during the pagein, | |
| 1427 | * e.g. by the buffer cache, and cannot simply be | |
| 1b9d3514 | 1428 | * freed. Call vnode_pager_freepage() to deal with it. |
| 984263bc MD |
1429 | */ |
| 1430 | /* | |
| 1431 | * XXX - the check for kernel_map is a kludge to work | |
| 1432 | * around having the machine panic on a kernel space | |
| 1433 | * fault w/ I/O error. | |
| 1434 | */ | |
| 1b9d3514 MD |
1435 | if (((fs->map != &kernel_map) && |
| 1436 | (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { | |
| a55afca2 | 1437 | vnode_pager_freepage(fs->m); |
| 568e6804 | 1438 | fs->m = NULL; |
| b12defdc MD |
1439 | vm_object_pip_wakeup(fs->first_object); |
| 1440 | vm_object_chain_release_all(fs->first_object, | |
| 1441 | fs->object); | |
| 1442 | if (fs->object != fs->first_object) | |
| 1443 | vm_object_drop(fs->object); | |
| 568e6804 MD |
1444 | unlock_and_deallocate(fs); |
| 1445 | if (rv == VM_PAGER_ERROR) | |
| 1446 | return (KERN_FAILURE); | |
| 1447 | else | |
| 1448 | return (KERN_PROTECTION_FAILURE); | |
| 1449 | /* NOT REACHED */ | |
| 984263bc | 1450 | } |
| 568e6804 | 1451 | if (fs->object != fs->first_object) { |
| a55afca2 | 1452 | vnode_pager_freepage(fs->m); |
| 568e6804 | 1453 | fs->m = NULL; |
| 984263bc MD |
1454 | /* |
| 1455 | * XXX - we cannot just fall out at this | |
| 1456 | * point, m has been freed and is invalid! | |
| 1457 | */ | |
| 1458 | } | |
| 1459 | } | |
| 1460 | ||
| 1461 | /* | |
| 568e6804 | 1462 | * We get here if the object has a default pager (or unwiring) |
| 984263bc MD |
1463 | * or the pager doesn't have the page. |
| 1464 | */ | |
| 568e6804 MD |
1465 | if (fs->object == fs->first_object) |
| 1466 | fs->first_m = fs->m; | |
| 984263bc MD |
1467 | |
| 1468 | /* | |
| b12defdc MD |
1469 | * Move on to the next object. The chain lock should prevent |
| 1470 | * the backing_object from getting ripped out from under us. | |
| 984263bc | 1471 | */ |
| b12defdc MD |
1472 | if ((next_object = fs->object->backing_object) != NULL) { |
| 1473 | vm_object_hold(next_object); | |
| 1474 | vm_object_chain_acquire(next_object); | |
| 1475 | KKASSERT(next_object == fs->object->backing_object); | |
| 1476 | pindex += OFF_TO_IDX(fs->object->backing_object_offset); | |
| 1477 | } | |
| 1478 | ||
| 984263bc MD |
1479 | if (next_object == NULL) { |
| 1480 | /* | |
| 1481 | * If there's no object left, fill the page in the top | |
| 1482 | * object with zeros. | |
| 1483 | */ | |
| 568e6804 | 1484 | if (fs->object != fs->first_object) { |
| b12defdc MD |
1485 | if (fs->first_object->backing_object != |
| 1486 | fs->object) { | |
| 1487 | vm_object_hold(fs->first_object->backing_object); | |
| 1488 | } | |
| 1489 | vm_object_chain_release_all( | |
| 1490 | fs->first_object->backing_object, | |
| 1491 | fs->object); | |
| 1492 | if (fs->first_object->backing_object != | |
| 1493 | fs->object) { | |
| 1494 | vm_object_drop(fs->first_object->backing_object); | |
| 1495 | } | |
| 568e6804 | 1496 | vm_object_pip_wakeup(fs->object); |
| b12defdc | 1497 | vm_object_drop(fs->object); |
| 568e6804 | 1498 | fs->object = fs->first_object; |
| 72579d2e | 1499 | pindex = first_pindex; |
| 568e6804 | 1500 | fs->m = fs->first_m; |
| 984263bc | 1501 | } |
| 568e6804 | 1502 | fs->first_m = NULL; |
| 984263bc MD |
1503 | |
| 1504 | /* | |
| 1505 | * Zero the page if necessary and mark it valid. | |
| 1506 | */ | |
| 568e6804 MD |
1507 | if ((fs->m->flags & PG_ZERO) == 0) { |
| 1508 | vm_page_zero_fill(fs->m); | |
| 984263bc | 1509 | } else { |
| 080c00e6 MD |
1510 | #ifdef PMAP_DEBUG |
| 1511 | pmap_page_assertzero(VM_PAGE_TO_PHYS(fs->m)); | |
| 1512 | #endif | |
| 1513 | vm_page_flag_clear(fs->m, PG_ZERO); | |
| 12e4aaff | 1514 | mycpu->gd_cnt.v_ozfod++; |
| 984263bc | 1515 | } |
| 12e4aaff | 1516 | mycpu->gd_cnt.v_zfod++; |
| 568e6804 | 1517 | fs->m->valid = VM_PAGE_BITS_ALL; |
| 984263bc | 1518 | break; /* break to PAGE HAS BEEN FOUND */ |
| 984263bc | 1519 | } |
| 9ad0147b MD |
1520 | if (fs->object != fs->first_object) { |
| 1521 | vm_object_pip_wakeup(fs->object); | |
| b12defdc MD |
1522 | vm_object_lock_swap(); |
| 1523 | vm_object_drop(fs->object); | |
| 9ad0147b MD |
1524 | } |
| 1525 | KASSERT(fs->object != next_object, | |
| 1526 | ("object loop %p", next_object)); | |
| 1527 | fs->object = next_object; | |
| 1528 | vm_object_pip_add(fs->object, 1); | |
| 984263bc MD |
1529 | } |
| 1530 | ||
| 984263bc MD |
1531 | /* |
| 1532 | * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock | |
| 1533 | * is held.] | |
| 1b9d3514 | 1534 | * |
| b12defdc | 1535 | * object still held. |
| 9ad0147b | 1536 | * |
| 984263bc MD |
1537 | * If the page is being written, but isn't already owned by the |
| 1538 | * top-level object, we have to copy it into a new page owned by the | |
| 1539 | * top-level object. | |
| 1540 | */ | |
| 1b9d3514 MD |
1541 | KASSERT((fs->m->flags & PG_BUSY) != 0, |
| 1542 | ("vm_fault: not busy after main loop")); | |
| 1543 | ||
| 568e6804 | 1544 | if (fs->object != fs->first_object) { |
| 984263bc MD |
1545 | /* |
| 1546 | * We only really need to copy if we want to write it. | |
| 1547 | */ | |
| 984263bc MD |
1548 | if (fault_type & VM_PROT_WRITE) { |
| 1549 | /* | |
| 1550 | * This allows pages to be virtually copied from a | |
| 1551 | * backing_object into the first_object, where the | |
| 1552 | * backing object has no other refs to it, and cannot | |
| 1553 | * gain any more refs. Instead of a bcopy, we just | |
| 1554 | * move the page from the backing object to the | |
| 1555 | * first object. Note that we must mark the page | |
| 1556 | * dirty in the first object so that it will go out | |
| 1557 | * to swap when needed. | |
| 1558 | */ | |
| aa542ad5 MD |
1559 | if ( |
| 1560 | /* | |
| 1561 | * Map, if present, has not changed | |
| 1562 | */ | |
| 1563 | (fs->map == NULL || | |
| 1564 | fs->map_generation == fs->map->timestamp) && | |
| 984263bc MD |
1565 | /* |
| 1566 | * Only one shadow object | |
| 1567 | */ | |
| 568e6804 | 1568 | (fs->object->shadow_count == 1) && |
| 984263bc MD |
1569 | /* |
| 1570 | * No COW refs, except us | |
| 1571 | */ | |
| 568e6804 | 1572 | (fs->object->ref_count == 1) && |
| 984263bc MD |
1573 | /* |
| 1574 | * No one else can look this object up | |
| 1575 | */ | |
| 568e6804 | 1576 | (fs->object->handle == NULL) && |
| 984263bc MD |
1577 | /* |
| 1578 | * No other ways to look the object up | |
| 1579 | */ | |
| 568e6804 MD |
1580 | ((fs->object->type == OBJT_DEFAULT) || |
| 1581 | (fs->object->type == OBJT_SWAP)) && | |
| 984263bc MD |
1582 | /* |
| 1583 | * We don't chase down the shadow chain | |
| 1584 | */ | |
| 568e6804 | 1585 | (fs->object == fs->first_object->backing_object) && |
| 984263bc MD |
1586 | |
| 1587 | /* | |
| 1588 | * grab the lock if we need to | |
| 1589 | */ | |
| 568e6804 | 1590 | (fs->lookup_still_valid || |
| aa542ad5 | 1591 | fs->map == NULL || |
| 568e6804 | 1592 | lockmgr(&fs->map->lock, LK_EXCLUSIVE|LK_NOWAIT) == 0) |
| 984263bc | 1593 | ) { |
| 984263bc | 1594 | /* |
| b12defdc MD |
1595 | * (first_m) and (m) are both busied. We must |
| 1596 | * move (m) into (first_m)'s object/pindex | |
| 1597 | * in an atomic fashion, then free (first_m). | |
| 1598 | * | |
| 1599 | * first_object is held so second remove | |
| 1600 | * followed by the rename should wind | |
| 1601 | * up being atomic. vm_page_free() might | |
| 1602 | * block so we don't do it until after the | |
| 1603 | * rename. | |
| 984263bc | 1604 | */ |
| b12defdc | 1605 | fs->lookup_still_valid = 1; |
| 568e6804 | 1606 | vm_page_protect(fs->first_m, VM_PROT_NONE); |
| b12defdc MD |
1607 | vm_page_remove(fs->first_m); |
| 1608 | vm_page_rename(fs->m, fs->first_object, | |
| 1609 | first_pindex); | |
| 568e6804 | 1610 | vm_page_free(fs->first_m); |
| 568e6804 | 1611 | fs->first_m = fs->m; |
| 568e6804 | 1612 | fs->m = NULL; |
| 12e4aaff | 1613 | mycpu->gd_cnt.v_cow_optim++; |
| 984263bc MD |
1614 | } else { |
| 1615 | /* | |
| 1616 | * Oh, well, let's copy it. | |
| 1617 | */ | |
| 568e6804 | 1618 | vm_page_copy(fs->m, fs->first_m); |
| 10192bae | 1619 | vm_page_event(fs->m, VMEVENT_COW); |
| 984263bc MD |
1620 | } |
| 1621 | ||
| 568e6804 | 1622 | if (fs->m) { |
| 984263bc MD |
1623 | /* |
| 1624 | * We no longer need the old page or object. | |
| 1625 | */ | |
| 568e6804 | 1626 | release_page(fs); |
| 984263bc MD |
1627 | } |
| 1628 | ||
| 1629 | /* | |
| b12defdc MD |
1630 | * We intend to revert to first_object, undo the |
| 1631 | * chain lock through to that. | |
| 1632 | */ | |
| 1633 | if (fs->first_object->backing_object != fs->object) | |
| 1634 | vm_object_hold(fs->first_object->backing_object); | |
| 1635 | vm_object_chain_release_all( | |
| 1636 | fs->first_object->backing_object, | |
| 1637 | fs->object); | |
| 1638 | if (fs->first_object->backing_object != fs->object) | |
| 1639 | vm_object_drop(fs->first_object->backing_object); | |
| 1640 | ||
| 1641 | /* | |
| 568e6804 | 1642 | * fs->object != fs->first_object due to above |
| 984263bc MD |
1643 | * conditional |
| 1644 | */ | |
| 568e6804 | 1645 | vm_object_pip_wakeup(fs->object); |
| b12defdc | 1646 | vm_object_drop(fs->object); |
| 984263bc MD |
1647 | |
| 1648 | /* | |
| 1649 | * Only use the new page below... | |
| 1650 | */ | |
| 1651 | ||
| 12e4aaff | 1652 | mycpu->gd_cnt.v_cow_faults++; |
| 568e6804 MD |
1653 | fs->m = fs->first_m; |
| 1654 | fs->object = fs->first_object; | |
| 72579d2e | 1655 | pindex = first_pindex; |
| 984263bc | 1656 | } else { |
| 568e6804 MD |
1657 | /* |
| 1658 | * If it wasn't a write fault avoid having to copy | |
| 1659 | * the page by mapping it read-only. | |
| 1660 | */ | |
| 1661 | fs->prot &= ~VM_PROT_WRITE; | |
| 984263bc MD |
1662 | } |
| 1663 | } | |
| 1664 | ||
| 1665 | /* | |
| 6e9c0867 MD |
1666 | * Relock the map if necessary, then check the generation count. |
| 1667 | * relock_map() will update fs->timestamp to account for the | |
| 1668 | * relocking if necessary. | |
| 1669 | * | |
| 1670 | * If the count has changed after relocking then all sorts of | |
| 1671 | * crap may have happened and we have to retry. | |
| 625a2937 MD |
1672 | * |
| 1673 | * NOTE: The relock_map() can fail due to a deadlock against | |
| 1674 | * the vm_page we are holding BUSY. | |
| 984263bc | 1675 | */ |
| 6e9c0867 | 1676 | if (fs->lookup_still_valid == FALSE && fs->map) { |
| 625a2937 MD |
1677 | if (relock_map(fs) || |
| 1678 | fs->map->timestamp != fs->map_generation) { | |
| 6e9c0867 | 1679 | release_page(fs); |
| b12defdc MD |
1680 | vm_object_pip_wakeup(fs->first_object); |
| 1681 | vm_object_chain_release_all(fs->first_object, | |
| 1682 | fs->object); | |
| 1683 | if (fs->object != fs->first_object) | |
| 1684 | vm_object_drop(fs->object); | |
| 6e9c0867 MD |
1685 | unlock_and_deallocate(fs); |
| 1686 | return (KERN_TRY_AGAIN); | |
| 1687 | } | |
| 568e6804 MD |
1688 | } |
| 1689 | ||
| 984263bc | 1690 | /* |
| 17cde63e MD |
1691 | * If the fault is a write, we know that this page is being |
| 1692 | * written NOW so dirty it explicitly to save on pmap_is_modified() | |
| 1693 | * calls later. | |
| 1694 | * | |
| 1695 | * If this is a NOSYNC mmap we do not want to set PG_NOSYNC | |
| 1696 | * if the page is already dirty to prevent data written with | |
| 1697 | * the expectation of being synced from not being synced. | |
| 1698 | * Likewise if this entry does not request NOSYNC then make | |
| 1699 | * sure the page isn't marked NOSYNC. Applications sharing | |
| 1700 | * data should use the same flags to avoid ping ponging. | |
| 1701 | * | |
| 1702 | * Also tell the backing pager, if any, that it should remove | |
| 1703 | * any swap backing since the page is now dirty. | |
| 984263bc | 1704 | */ |
| 54341a3b | 1705 | vm_page_activate(fs->m); |
| 568e6804 | 1706 | if (fs->prot & VM_PROT_WRITE) { |
| 568e6804 | 1707 | vm_object_set_writeable_dirty(fs->m->object); |
| 2421aac7 | 1708 | vm_set_nosync(fs->m, fs->entry); |
| 568e6804 | 1709 | if (fs->fault_flags & VM_FAULT_DIRTY) { |
| 568e6804 | 1710 | vm_page_dirty(fs->m); |
| 107e9bcc | 1711 | swap_pager_unswapped(fs->m); |
| 984263bc MD |
1712 | } |
| 1713 | } | |
| 1714 | ||
| b12defdc MD |
1715 | vm_object_pip_wakeup(fs->first_object); |
| 1716 | vm_object_chain_release_all(fs->first_object, fs->object); | |
| 1717 | if (fs->object != fs->first_object) | |
| 1718 | vm_object_drop(fs->object); | |
| 9ad0147b | 1719 | |
| 984263bc | 1720 | /* |
| 75f59a66 MD |
1721 | * Page had better still be busy. We are still locked up and |
| 1722 | * fs->object will have another PIP reference if it is not equal | |
| 1723 | * to fs->first_object. | |
| 984263bc | 1724 | */ |
| 568e6804 MD |
1725 | KASSERT(fs->m->flags & PG_BUSY, |
| 1726 | ("vm_fault: page %p not busy!", fs->m)); | |
| 984263bc | 1727 | |
| 984263bc MD |
1728 | /* |
| 1729 | * Sanity check: page must be completely valid or it is not fit to | |
| 1730 | * map into user space. vm_pager_get_pages() ensures this. | |
| 1731 | */ | |
| 568e6804 MD |
1732 | if (fs->m->valid != VM_PAGE_BITS_ALL) { |
| 1733 | vm_page_zero_invalid(fs->m, TRUE); | |
| 086c1d7e | 1734 | kprintf("Warning: page %p partially invalid on fault\n", fs->m); |
| 984263bc | 1735 | } |
| 2a418930 | 1736 | vm_page_flag_clear(fs->m, PG_ZERO); |
| 984263bc | 1737 | |
| 984263bc | 1738 | return (KERN_SUCCESS); |
| 984263bc MD |
1739 | } |
| 1740 | ||
| 1741 | /* | |
| f2d22ebf MD |
1742 | * Wire down a range of virtual addresses in a map. The entry in question |
| 1743 | * should be marked in-transition and the map must be locked. We must | |
| 1744 | * release the map temporarily while faulting-in the page to avoid a | |
| 1745 | * deadlock. Note that the entry may be clipped while we are blocked but | |
| 1746 | * will never be freed. | |
| 9ad0147b MD |
1747 | * |
| 1748 | * No requirements. | |
| 984263bc MD |
1749 | */ |
| 1750 | int | |
| f2d22ebf | 1751 | vm_fault_wire(vm_map_t map, vm_map_entry_t entry, boolean_t user_wire) |
| 984263bc | 1752 | { |
| f2d22ebf MD |
1753 | boolean_t fictitious; |
| 1754 | vm_offset_t start; | |
| 1755 | vm_offset_t end; | |
| 5f910b2f | 1756 | vm_offset_t va; |
| f2d22ebf | 1757 | vm_paddr_t pa; |
| b12defdc | 1758 | vm_page_t m; |
| 5f910b2f | 1759 | pmap_t pmap; |
| 984263bc MD |
1760 | int rv; |
| 1761 | ||
| b12defdc MD |
1762 | lwkt_gettoken(&map->token); |
| 1763 | ||
| 984263bc | 1764 | pmap = vm_map_pmap(map); |
| f2d22ebf MD |
1765 | start = entry->start; |
| 1766 | end = entry->end; | |
| 1767 | fictitious = entry->object.vm_object && | |
| 1768 | (entry->object.vm_object->type == OBJT_DEVICE); | |
| e40cfbd7 MD |
1769 | if (entry->eflags & MAP_ENTRY_KSTACK) |
| 1770 | start += PAGE_SIZE; | |
| f2d22ebf | 1771 | map->timestamp++; |
| 6e9c0867 | 1772 | vm_map_unlock(map); |
| 984263bc MD |
1773 | |
| 1774 | /* | |
| 984263bc MD |
1775 | * We simulate a fault to get the page and enter it in the physical |
| 1776 | * map. | |
| 1777 | */ | |
| 1778 | for (va = start; va < end; va += PAGE_SIZE) { | |
| f2d22ebf MD |
1779 | if (user_wire) { |
| 1780 | rv = vm_fault(map, va, VM_PROT_READ, | |
| 1781 | VM_FAULT_USER_WIRE); | |
| 1782 | } else { | |
| 1783 | rv = vm_fault(map, va, VM_PROT_READ|VM_PROT_WRITE, | |
| 1784 | VM_FAULT_CHANGE_WIRING); | |
| 1785 | } | |
| 984263bc | 1786 | if (rv) { |
| f2d22ebf MD |
1787 | while (va > start) { |
| 1788 | va -= PAGE_SIZE; | |
| 1789 | if ((pa = pmap_extract(pmap, va)) == 0) | |
| 1790 | continue; | |
| 1791 | pmap_change_wiring(pmap, va, FALSE); | |
| b12defdc MD |
1792 | if (!fictitious) { |
| 1793 | m = PHYS_TO_VM_PAGE(pa); | |
| 1794 | vm_page_busy_wait(m, FALSE, "vmwrpg"); | |
| 1795 | vm_page_unwire(m, 1); | |
| 1796 | vm_page_wakeup(m); | |
| 1797 | } | |
| f2d22ebf | 1798 | } |
| b12defdc | 1799 | goto done; |
| 984263bc MD |
1800 | } |
| 1801 | } | |
| b12defdc MD |
1802 | rv = KERN_SUCCESS; |
| 1803 | done: | |
| f2d22ebf | 1804 | vm_map_lock(map); |
| b12defdc MD |
1805 | lwkt_reltoken(&map->token); |
| 1806 | return (rv); | |
| 984263bc MD |
1807 | } |
| 1808 | ||
| 984263bc | 1809 | /* |
| f2d22ebf MD |
1810 | * Unwire a range of virtual addresses in a map. The map should be |
| 1811 | * locked. | |
| 984263bc MD |
1812 | */ |
| 1813 | void | |
| f2d22ebf | 1814 | vm_fault_unwire(vm_map_t map, vm_map_entry_t entry) |
| 984263bc | 1815 | { |
| f2d22ebf MD |
1816 | boolean_t fictitious; |
| 1817 | vm_offset_t start; | |
| 1818 | vm_offset_t end; | |
| 6ef943a3 MD |
1819 | vm_offset_t va; |
| 1820 | vm_paddr_t pa; | |
| b12defdc | 1821 | vm_page_t m; |
| 5f910b2f | 1822 | pmap_t pmap; |
| 984263bc | 1823 | |
| b12defdc MD |
1824 | lwkt_gettoken(&map->token); |
| 1825 | ||
| 984263bc | 1826 | pmap = vm_map_pmap(map); |
| f2d22ebf MD |
1827 | start = entry->start; |
| 1828 | end = entry->end; | |
| 1829 | fictitious = entry->object.vm_object && | |
| 1830 | (entry->object.vm_object->type == OBJT_DEVICE); | |
| e40cfbd7 MD |
1831 | if (entry->eflags & MAP_ENTRY_KSTACK) |
| 1832 | start += PAGE_SIZE; | |
| 984263bc MD |
1833 | |
| 1834 | /* | |
| 1835 | * Since the pages are wired down, we must be able to get their | |
| 1836 | * mappings from the physical map system. | |
| 1837 | */ | |
| 984263bc MD |
1838 | for (va = start; va < end; va += PAGE_SIZE) { |
| 1839 | pa = pmap_extract(pmap, va); | |
| 6ef943a3 | 1840 | if (pa != 0) { |
| 984263bc | 1841 | pmap_change_wiring(pmap, va, FALSE); |
| b12defdc MD |
1842 | if (!fictitious) { |
| 1843 | m = PHYS_TO_VM_PAGE(pa); | |
| 1844 | vm_page_busy_wait(m, FALSE, "vmwupg"); | |
| 1845 | vm_page_unwire(m, 1); | |
| 1846 | vm_page_wakeup(m); | |
| 1847 | } | |
| 984263bc MD |
1848 | } |
| 1849 | } | |
| b12defdc | 1850 | lwkt_reltoken(&map->token); |
| 984263bc MD |
1851 | } |
| 1852 | ||
| 1853 | /* | |
| 9ad0147b MD |
1854 | * Copy all of the pages from a wired-down map entry to another. |
| 1855 | * | |
| 1856 | * The source and destination maps must be locked for write. | |
| b12defdc | 1857 | * The source and destination map tokens must be held. |
| 9ad0147b MD |
1858 | * The source map entry must be wired down (or be a sharing map |
| 1859 | * entry corresponding to a main map entry that is wired down). | |
| 984263bc | 1860 | * |
| 9ad0147b | 1861 | * No other requirements. |
| 984263bc | 1862 | */ |
| 984263bc | 1863 | void |
| 57e43348 | 1864 | vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, |
| 9ad0147b | 1865 | vm_map_entry_t dst_entry, vm_map_entry_t src_entry) |
| 984263bc MD |
1866 | { |
| 1867 | vm_object_t dst_object; | |
| 1868 | vm_object_t src_object; | |
| 1869 | vm_ooffset_t dst_offset; | |
| 1870 | vm_ooffset_t src_offset; | |
| 1871 | vm_prot_t prot; | |
| 1872 | vm_offset_t vaddr; | |
| 1873 | vm_page_t dst_m; | |
| 1874 | vm_page_t src_m; | |
| 1875 | ||
| 984263bc MD |
1876 | src_object = src_entry->object.vm_object; |
| 1877 | src_offset = src_entry->offset; | |
| 1878 | ||
| 1879 | /* | |
| 1880 | * Create the top-level object for the destination entry. (Doesn't | |
| 1881 | * actually shadow anything - we copy the pages directly.) | |
| 1882 | */ | |
| 53025830 MD |
1883 | vm_map_entry_allocate_object(dst_entry); |
| 1884 | dst_object = dst_entry->object.vm_object; | |
| 984263bc MD |
1885 | |
| 1886 | prot = dst_entry->max_protection; | |
| 1887 | ||
| 1888 | /* | |
| 1889 | * Loop through all of the pages in the entry's range, copying each | |
| 1890 | * one from the source object (it should be there) to the destination | |
| 1891 | * object. | |
| 1892 | */ | |
| 1893 | for (vaddr = dst_entry->start, dst_offset = 0; | |
| 1894 | vaddr < dst_entry->end; | |
| 1895 | vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { | |
| 1896 | ||
| 1897 | /* | |
| 1898 | * Allocate a page in the destination object | |
| 1899 | */ | |
| 1900 | do { | |
| 1901 | dst_m = vm_page_alloc(dst_object, | |
| d2d8515b MD |
1902 | OFF_TO_IDX(dst_offset), |
| 1903 | VM_ALLOC_NORMAL); | |
| 984263bc | 1904 | if (dst_m == NULL) { |
| 4ecf7cc9 | 1905 | vm_wait(0); |
| 984263bc MD |
1906 | } |
| 1907 | } while (dst_m == NULL); | |
| 1908 | ||
| 1909 | /* | |
| 1910 | * Find the page in the source object, and copy it in. | |
| 1911 | * (Because the source is wired down, the page will be in | |
| 1912 | * memory.) | |
| 1913 | */ | |
| 1914 | src_m = vm_page_lookup(src_object, | |
| 080c00e6 | 1915 | OFF_TO_IDX(dst_offset + src_offset)); |
| 984263bc MD |
1916 | if (src_m == NULL) |
| 1917 | panic("vm_fault_copy_wired: page missing"); | |
| 1918 | ||
| 1919 | vm_page_copy(src_m, dst_m); | |
| 10192bae | 1920 | vm_page_event(src_m, VMEVENT_COW); |
| 984263bc MD |
1921 | |
| 1922 | /* | |
| 1923 | * Enter it in the pmap... | |
| 1924 | */ | |
| 1925 | ||
| 1926 | vm_page_flag_clear(dst_m, PG_ZERO); | |
| 1927 | pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE); | |
| 984263bc MD |
1928 | |
| 1929 | /* | |
| 1930 | * Mark it no longer busy, and put it on the active list. | |
| 1931 | */ | |
| 1932 | vm_page_activate(dst_m); | |
| 1933 | vm_page_wakeup(dst_m); | |
| 1934 | } | |
| 1935 | } | |
| 1936 | ||
| 1b9d3514 | 1937 | #if 0 |
| 984263bc MD |
1938 | |
| 1939 | /* | |
| 1940 | * This routine checks around the requested page for other pages that | |
| 1941 | * might be able to be faulted in. This routine brackets the viable | |
| 1942 | * pages for the pages to be paged in. | |
| 1943 | * | |
| 1944 | * Inputs: | |
| 1945 | * m, rbehind, rahead | |
| 1946 | * | |
| 1947 | * Outputs: | |
| 1948 | * marray (array of vm_page_t), reqpage (index of requested page) | |
| 1949 | * | |
| 1950 | * Return value: | |
| 1951 | * number of pages in marray | |
| 1952 | */ | |
| 1953 | static int | |
| 57e43348 | 1954 | vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead, |
| bc823b32 | 1955 | vm_page_t *marray, int *reqpage) |
| 984263bc MD |
1956 | { |
| 1957 | int i,j; | |
| 1958 | vm_object_t object; | |
| 1959 | vm_pindex_t pindex, startpindex, endpindex, tpindex; | |
| 1960 | vm_page_t rtm; | |
| 1961 | int cbehind, cahead; | |
| 1962 | ||
| 1963 | object = m->object; | |
| 1964 | pindex = m->pindex; | |
| 1965 | ||
| 1966 | /* | |
| 1967 | * we don't fault-ahead for device pager | |
| 1968 | */ | |
| 1969 | if (object->type == OBJT_DEVICE) { | |
| 1970 | *reqpage = 0; | |
| 1971 | marray[0] = m; | |
| 1972 | return 1; | |
| 1973 | } | |
| 1974 | ||
| 1975 | /* | |
| 1976 | * if the requested page is not available, then give up now | |
| 1977 | */ | |
| 984263bc | 1978 | if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { |
| 17cde63e | 1979 | *reqpage = 0; /* not used by caller, fix compiler warn */ |
| 984263bc MD |
1980 | return 0; |
| 1981 | } | |
| 1982 | ||
| 1983 | if ((cbehind == 0) && (cahead == 0)) { | |
| 1984 | *reqpage = 0; | |
| 1985 | marray[0] = m; | |
| 1986 | return 1; | |
| 1987 | } | |
| 1988 | ||
| 1989 | if (rahead > cahead) { | |
| 1990 | rahead = cahead; | |
| 1991 | } | |
| 1992 | ||
| 1993 | if (rbehind > cbehind) { | |
| 1994 | rbehind = cbehind; | |
| 1995 | } | |
| 1996 | ||
| 1997 | /* | |
| bc823b32 MD |
1998 | * Do not do any readahead if we have insufficient free memory. |
| 1999 | * | |
| 2000 | * XXX code was broken disabled before and has instability | |
| 2001 | * with this conditional fixed, so shortcut for now. | |
| 984263bc | 2002 | */ |
| bc823b32 | 2003 | if (burst_fault == 0 || vm_page_count_severe()) { |
| 984263bc MD |
2004 | marray[0] = m; |
| 2005 | *reqpage = 0; | |
| 2006 | return 1; | |
| 2007 | } | |
| 2008 | ||
| 2009 | /* | |
| 2010 | * scan backward for the read behind pages -- in memory | |
| 06ecca5a MD |
2011 | * |
| 2012 | * Assume that if the page is not found an interrupt will not | |
| 2013 | * create it. Theoretically interrupts can only remove (busy) | |
| 2014 | * pages, not create new associations. | |
| 984263bc MD |
2015 | */ |
| 2016 | if (pindex > 0) { | |
| 2017 | if (rbehind > pindex) { | |
| 2018 | rbehind = pindex; | |
| 2019 | startpindex = 0; | |
| 2020 | } else { | |
| 2021 | startpindex = pindex - rbehind; | |
| 2022 | } | |
| 2023 | ||
| b12defdc | 2024 | vm_object_hold(object); |
| bc823b32 MD |
2025 | for (tpindex = pindex; tpindex > startpindex; --tpindex) { |
| 2026 | if (vm_page_lookup(object, tpindex - 1)) | |
| 984263bc MD |
2027 | break; |
| 2028 | } | |
| 2029 | ||
| bc823b32 MD |
2030 | i = 0; |
| 2031 | while (tpindex < pindex) { | |
| d2d8515b MD |
2032 | rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM | |
| 2033 | VM_ALLOC_NULL_OK); | |
| 984263bc MD |
2034 | if (rtm == NULL) { |
| 2035 | for (j = 0; j < i; j++) { | |
| 2036 | vm_page_free(marray[j]); | |
| 2037 | } | |
| b12defdc | 2038 | vm_object_drop(object); |
| 984263bc MD |
2039 | marray[0] = m; |
| 2040 | *reqpage = 0; | |
| 2041 | return 1; | |
| 2042 | } | |
| 984263bc | 2043 | marray[i] = rtm; |
| bc823b32 MD |
2044 | ++i; |
| 2045 | ++tpindex; | |
| 984263bc | 2046 | } |
| b12defdc | 2047 | vm_object_drop(object); |
| 984263bc | 2048 | } else { |
| 984263bc MD |
2049 | i = 0; |
| 2050 | } | |
| 2051 | ||
| bc823b32 MD |
2052 | /* |
| 2053 | * Assign requested page | |
| 2054 | */ | |
| 984263bc | 2055 | marray[i] = m; |
| 984263bc | 2056 | *reqpage = i; |
| bc823b32 | 2057 | ++i; |
| 984263bc MD |
2058 | |
| 2059 | /* | |
| bc823b32 | 2060 | * Scan forwards for read-ahead pages |
| 984263bc | 2061 | */ |
| bc823b32 | 2062 | tpindex = pindex + 1; |
| 984263bc MD |
2063 | endpindex = tpindex + rahead; |
| 2064 | if (endpindex > object->size) | |
| 2065 | endpindex = object->size; | |
| 2066 | ||
| b12defdc | 2067 | vm_object_hold(object); |
| bc823b32 MD |
2068 | while (tpindex < endpindex) { |
| 2069 | if (vm_page_lookup(object, tpindex)) | |
| 984263bc | 2070 | break; |
| d2d8515b MD |
2071 | rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM | |
| 2072 | VM_ALLOC_NULL_OK); | |
| bc823b32 | 2073 | if (rtm == NULL) |
| 984263bc | 2074 | break; |
| 984263bc | 2075 | marray[i] = rtm; |
| bc823b32 MD |
2076 | ++i; |
| 2077 | ++tpindex; | |
| 984263bc | 2078 | } |
| b12defdc | 2079 | vm_object_drop(object); |
| 984263bc | 2080 | |
| bc823b32 | 2081 | return (i); |
| 984263bc | 2082 | } |
| 1b9d3514 MD |
2083 | |
| 2084 | #endif | |
| 2085 | ||
| 2086 | /* | |
| 2087 | * vm_prefault() provides a quick way of clustering pagefaults into a | |
| 2088 | * process's address space. It is a "cousin" of pmap_object_init_pt, | |
| 2089 | * except it runs at page fault time instead of mmap time. | |
| 2090 | * | |
| 85946b6c MD |
2091 | * vm.fast_fault Enables pre-faulting zero-fill pages |
| 2092 | * | |
| 2093 | * vm.prefault_pages Number of pages (1/2 negative, 1/2 positive) to | |
| 2094 | * prefault. Scan stops in either direction when | |
| 2095 | * a page is found to already exist. | |
| 2096 | * | |
| 1b9d3514 MD |
2097 | * This code used to be per-platform pmap_prefault(). It is now |
| 2098 | * machine-independent and enhanced to also pre-fault zero-fill pages | |
| 2099 | * (see vm.fast_fault) as well as make them writable, which greatly | |
| 2100 | * reduces the number of page faults programs incur. | |
| 2101 | * | |
| 2102 | * Application performance when pre-faulting zero-fill pages is heavily | |
| 2103 | * dependent on the application. Very tiny applications like /bin/echo | |
| 2104 | * lose a little performance while applications of any appreciable size | |
| 2105 | * gain performance. Prefaulting multiple pages also reduces SMP | |
| 2106 | * congestion and can improve SMP performance significantly. | |
| 2107 | * | |
| 2108 | * NOTE! prot may allow writing but this only applies to the top level | |
| 2109 | * object. If we wind up mapping a page extracted from a backing | |
| 2110 | * object we have to make sure it is read-only. | |
| 2111 | * | |
| 2112 | * NOTE! The caller has already handled any COW operations on the | |
| 2113 | * vm_map_entry via the normal fault code. Do NOT call this | |
| 2114 | * shortcut unless the normal fault code has run on this entry. | |
| 9ad0147b | 2115 | * |
| d2d8515b | 2116 | * The related map must be locked. |
| 9ad0147b | 2117 | * No other requirements. |
| 1b9d3514 | 2118 | */ |
| 85946b6c MD |
2119 | static int vm_prefault_pages = 8; |
| 2120 | SYSCTL_INT(_vm, OID_AUTO, prefault_pages, CTLFLAG_RW, &vm_prefault_pages, 0, | |
| 2121 | "Maximum number of pages to pre-fault"); | |
| 2122 | static int vm_fast_fault = 1; | |
| 2123 | SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0, | |
| 2124 | "Burst fault zero-fill regions"); | |
| 1b9d3514 | 2125 | |
| 2421aac7 MD |
2126 | /* |
| 2127 | * Set PG_NOSYNC if the map entry indicates so, but only if the page | |
| 2128 | * is not already dirty by other means. This will prevent passive | |
| 2129 | * filesystem syncing as well as 'sync' from writing out the page. | |
| 2130 | */ | |
| 2131 | static void | |
| 2132 | vm_set_nosync(vm_page_t m, vm_map_entry_t entry) | |
| 2133 | { | |
| 2134 | if (entry->eflags & MAP_ENTRY_NOSYNC) { | |
| 2135 | if (m->dirty == 0) | |
| 2136 | vm_page_flag_set(m, PG_NOSYNC); | |
| 2137 | } else { | |
| 2138 | vm_page_flag_clear(m, PG_NOSYNC); | |
| 2139 | } | |
| 2140 | } | |
| 2141 | ||
| 1b9d3514 | 2142 | static void |
| 54341a3b MD |
2143 | vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot, |
| 2144 | int fault_flags) | |
| 1b9d3514 MD |
2145 | { |
| 2146 | struct lwp *lp; | |
| 2147 | vm_page_t m; | |
| 1b9d3514 MD |
2148 | vm_offset_t addr; |
| 2149 | vm_pindex_t index; | |
| 2150 | vm_pindex_t pindex; | |
| 2151 | vm_object_t object; | |
| 2152 | int pprot; | |
| 2153 | int i; | |
| 85946b6c MD |
2154 | int noneg; |
| 2155 | int nopos; | |
| 2156 | int maxpages; | |
| 2157 | ||
| 2158 | /* | |
| 2159 | * Get stable max count value, disabled if set to 0 | |
| 2160 | */ | |
| 2161 | maxpages = vm_prefault_pages; | |
| 2162 | cpu_ccfence(); | |
| 2163 | if (maxpages <= 0) | |
| 2164 | return; | |
| 1b9d3514 MD |
2165 | |
| 2166 | /* | |
| 2167 | * We do not currently prefault mappings that use virtual page | |
| 2168 | * tables. We do not prefault foreign pmaps. | |
| 2169 | */ | |
| 2170 | if (entry->maptype == VM_MAPTYPE_VPAGETABLE) | |
| 2171 | return; | |
| 2172 | lp = curthread->td_lwp; | |
| 2173 | if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) | |
| 2174 | return; | |
| 2175 | ||
| 85946b6c MD |
2176 | /* |
| 2177 | * Limit pre-fault count to 1024 pages. | |
| 2178 | */ | |
| 2179 | if (maxpages > 1024) | |
| 2180 | maxpages = 1024; | |
| 1b9d3514 | 2181 | |
| b12defdc MD |
2182 | object = entry->object.vm_object; |
| 2183 | KKASSERT(object != NULL); | |
| a31129d8 | 2184 | KKASSERT(object == entry->object.vm_object); |
| 54341a3b | 2185 | vm_object_hold(object); |
| b12defdc MD |
2186 | vm_object_chain_acquire(object); |
| 2187 | ||
| 85946b6c MD |
2188 | noneg = 0; |
| 2189 | nopos = 0; | |
| 2190 | for (i = 0; i < maxpages; ++i) { | |
| 1b9d3514 | 2191 | vm_object_t lobject; |
| a31129d8 | 2192 | vm_object_t nobject; |
| 3bb7eedb | 2193 | int allocated = 0; |
| b12defdc | 2194 | int error; |
| 1b9d3514 | 2195 | |
| 85946b6c | 2196 | /* |
| d2d8515b MD |
2197 | * This can eat a lot of time on a heavily contended |
| 2198 | * machine so yield on the tick if needed. | |
| 2199 | */ | |
| 2200 | if ((i & 7) == 7) | |
| 2201 | lwkt_yield(); | |
| 2202 | ||
| 2203 | /* | |
| 85946b6c MD |
2204 | * Calculate the page to pre-fault, stopping the scan in |
| 2205 | * each direction separately if the limit is reached. | |
| 2206 | */ | |
| 2207 | if (i & 1) { | |
| 2208 | if (noneg) | |
| 2209 | continue; | |
| 2210 | addr = addra - ((i + 1) >> 1) * PAGE_SIZE; | |
| 2211 | } else { | |
| 2212 | if (nopos) | |
| 2213 | continue; | |
| 2214 | addr = addra + ((i + 2) >> 1) * PAGE_SIZE; | |
| 2215 | } | |
| 2216 | if (addr < entry->start) { | |
| 2217 | noneg = 1; | |
| 2218 | if (noneg && nopos) | |
| 2219 | break; | |
| 2220 | continue; | |
| 2221 | } | |
| 2222 | if (addr >= entry->end) { | |
| 2223 | nopos = 1; | |
| 2224 | if (noneg && nopos) | |
| 2225 | break; | |
| 1b9d3514 | 2226 | continue; |
| 85946b6c | 2227 | } |
| 1b9d3514 | 2228 | |
| 85946b6c MD |
2229 | /* |
| 2230 | * Skip pages already mapped, and stop scanning in that | |
| 2231 | * direction. When the scan terminates in both directions | |
| 2232 | * we are done. | |
| 2233 | */ | |
| 2234 | if (pmap_prefault_ok(pmap, addr) == 0) { | |
| 2235 | if (i & 1) | |
| 2236 | noneg = 1; | |
| 2237 | else | |
| 2238 | nopos = 1; | |
| 2239 | if (noneg && nopos) | |
| 2240 | break; | |
| 1b9d3514 | 2241 | continue; |
| 85946b6c | 2242 | } |
| 1b9d3514 MD |
2243 | |
| 2244 | /* | |
| 2245 | * Follow the VM object chain to obtain the page to be mapped | |
| 2246 | * into the pmap. | |
| 2247 | * | |
| 2248 | * If we reach the terminal object without finding a page | |
| 2249 | * and we determine it would be advantageous, then allocate | |
| 2250 | * a zero-fill page for the base object. The base object | |
| 2251 | * is guaranteed to be OBJT_DEFAULT for this case. | |
| 3bb7eedb MD |
2252 | * |
| 2253 | * In order to not have to check the pager via *haspage*() | |
| 2254 | * we stop if any non-default object is encountered. e.g. | |
| 2255 | * a vnode or swap object would stop the loop. | |
| 1b9d3514 MD |
2256 | */ |
| 2257 | index = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; | |
| 2258 | lobject = object; | |
| 2259 | pindex = index; | |
| 2260 | pprot = prot; | |
| 2261 | ||
| a31129d8 | 2262 | KKASSERT(lobject == entry->object.vm_object); |
| b12defdc | 2263 | /*vm_object_hold(lobject); implied */ |
| a31129d8 | 2264 | |
| b12defdc MD |
2265 | while ((m = vm_page_lookup_busy_try(lobject, pindex, |
| 2266 | TRUE, &error)) == NULL) { | |
| 1b9d3514 MD |
2267 | if (lobject->type != OBJT_DEFAULT) |
| 2268 | break; | |
| 2269 | if (lobject->backing_object == NULL) { | |
| 2270 | if (vm_fast_fault == 0) | |
| 2271 | break; | |
| 85946b6c | 2272 | if ((prot & VM_PROT_WRITE) == 0 || |
| 1b9d3514 MD |
2273 | vm_page_count_min(0)) { |
| 2274 | break; | |
| 2275 | } | |
| a31129d8 | 2276 | |
| b12defdc MD |
2277 | /* |
| 2278 | * NOTE: Allocated from base object | |
| 2279 | */ | |
| 1b9d3514 | 2280 | m = vm_page_alloc(object, index, |
| d2d8515b MD |
2281 | VM_ALLOC_NORMAL | |
| 2282 | VM_ALLOC_ZERO | | |
| 54341a3b | 2283 | VM_ALLOC_USE_GD | |
| d2d8515b MD |
2284 | VM_ALLOC_NULL_OK); |
| 2285 | if (m == NULL) | |
| 2286 | break; | |
| 3bb7eedb | 2287 | allocated = 1; |
| 1b9d3514 MD |
2288 | pprot = prot; |
| 2289 | /* lobject = object .. not needed */ | |
| 2290 | break; | |
| 2291 | } | |
| 2292 | if (lobject->backing_object_offset & PAGE_MASK) | |
| 2293 | break; | |
| b12defdc MD |
2294 | nobject = lobject->backing_object; |
| 2295 | vm_object_hold(nobject); | |
| 2296 | KKASSERT(nobject == lobject->backing_object); | |
| 2297 | pindex += lobject->backing_object_offset >> PAGE_SHIFT; | |
| 2298 | if (lobject != object) { | |
| 2299 | vm_object_lock_swap(); | |
| 2300 | vm_object_drop(lobject); | |
| a31129d8 | 2301 | } |
| b12defdc | 2302 | lobject = nobject; |
| 1b9d3514 | 2303 | pprot &= ~VM_PROT_WRITE; |
| b12defdc | 2304 | vm_object_chain_acquire(lobject); |
| 1b9d3514 | 2305 | } |
| a31129d8 | 2306 | |
| 1b9d3514 | 2307 | /* |
| b12defdc MD |
2308 | * NOTE: A non-NULL (m) will be associated with lobject if |
| 2309 | * it was found there, otherwise it is probably a | |
| 2310 | * zero-fill page associated with the base object. | |
| 1b9d3514 | 2311 | * |
| b12defdc | 2312 | * Give-up if no page is available. |
| 1b9d3514 | 2313 | */ |
| b12defdc MD |
2314 | if (m == NULL) { |
| 2315 | if (lobject != object) { | |
| 2316 | if (object->backing_object != lobject) | |
| 2317 | vm_object_hold(object->backing_object); | |
| 2318 | vm_object_chain_release_all( | |
| 2319 | object->backing_object, lobject); | |
| 2320 | if (object->backing_object != lobject) | |
| 2321 | vm_object_drop(object->backing_object); | |
| 2322 | vm_object_drop(lobject); | |
| 2323 | } | |
| 1b9d3514 | 2324 | break; |
| b12defdc | 2325 | } |
| 1b9d3514 MD |
2326 | |
| 2327 | /* | |
| 54341a3b MD |
2328 | * The object must be marked dirty if we are mapping a |
| 2329 | * writable page. m->object is either lobject or object, | |
| 2330 | * both of which are still held. Do this before we | |
| 2331 | * potentially drop the object. | |
| 2332 | */ | |
| 2333 | if (pprot & VM_PROT_WRITE) | |
| 2334 | vm_object_set_writeable_dirty(m->object); | |
| 2335 | ||
| 2336 | /* | |
| 1b9d3514 MD |
2337 | * Do not conditionalize on PG_RAM. If pages are present in |
| 2338 | * the VM system we assume optimal caching. If caching is | |
| 2339 | * not optimal the I/O gravy train will be restarted when we | |
| 2340 | * hit an unavailable page. We do not want to try to restart | |
| 2341 | * the gravy train now because we really don't know how much | |
| 2342 | * of the object has been cached. The cost for restarting | |
| 2343 | * the gravy train should be low (since accesses will likely | |
| 2344 | * be I/O bound anyway). | |
| 1b9d3514 | 2345 | */ |
| b12defdc MD |
2346 | if (lobject != object) { |
| 2347 | if (object->backing_object != lobject) | |
| 2348 | vm_object_hold(object->backing_object); | |
| 2349 | vm_object_chain_release_all(object->backing_object, | |
| 2350 | lobject); | |
| 2351 | if (object->backing_object != lobject) | |
| 2352 | vm_object_drop(object->backing_object); | |
| 2353 | vm_object_drop(lobject); | |
| 2354 | } | |
| 2355 | ||
| 1b9d3514 | 2356 | /* |
| 3bb7eedb MD |
2357 | * Enter the page into the pmap if appropriate. If we had |
| 2358 | * allocated the page we have to place it on a queue. If not | |
| 2359 | * we just have to make sure it isn't on the cache queue | |
| 2360 | * (pages on the cache queue are not allowed to be mapped). | |
| 1b9d3514 | 2361 | */ |
| 3bb7eedb | 2362 | if (allocated) { |
| 54341a3b MD |
2363 | /* |
| 2364 | * Page must be zeroed. | |
| 2365 | */ | |
| 2366 | if ((m->flags & PG_ZERO) == 0) { | |
| 2367 | vm_page_zero_fill(m); | |
| 2368 | } else { | |
| 2369 | #ifdef PMAP_DEBUG | |
| 2370 | pmap_page_assertzero( | |
| 2371 | VM_PAGE_TO_PHYS(m)); | |
| 2372 | #endif | |
| 2373 | vm_page_flag_clear(m, PG_ZERO); | |
| 2374 | mycpu->gd_cnt.v_ozfod++; | |
| 2375 | } | |
| 2376 | mycpu->gd_cnt.v_zfod++; | |
| 2377 | m->valid = VM_PAGE_BITS_ALL; | |
| 2378 | ||
| 2379 | /* | |
| 2380 | * Handle dirty page case | |
| 2381 | */ | |
| 2421aac7 MD |
2382 | if (pprot & VM_PROT_WRITE) |
| 2383 | vm_set_nosync(m, entry); | |
| 3bb7eedb | 2384 | pmap_enter(pmap, addr, m, pprot, 0); |
| 54341a3b MD |
2385 | mycpu->gd_cnt.v_vm_faults++; |
| 2386 | if (curthread->td_lwp) | |
| 2387 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
| 3bb7eedb | 2388 | vm_page_deactivate(m); |
| 54341a3b MD |
2389 | if (pprot & VM_PROT_WRITE) { |
| 2390 | /*vm_object_set_writeable_dirty(m->object);*/ | |
| 2391 | vm_set_nosync(m, entry); | |
| 2392 | if (fault_flags & VM_FAULT_DIRTY) { | |
| 2393 | vm_page_dirty(m); | |
| 2394 | /*XXX*/ | |
| 2395 | swap_pager_unswapped(m); | |
| 2396 | } | |
| 2397 | } | |
| 3bb7eedb | 2398 | vm_page_wakeup(m); |
| b12defdc MD |
2399 | } else if (error) { |
| 2400 | /* couldn't busy page, no wakeup */ | |
| a31129d8 MD |
2401 | } else if ( |
| 2402 | ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && | |
| b12defdc | 2403 | (m->flags & PG_FICTITIOUS) == 0) { |
| a31129d8 MD |
2404 | /* |
| 2405 | * A fully valid page not undergoing soft I/O can | |
| 2406 | * be immediately entered into the pmap. | |
| 2407 | */ | |
| b12defdc | 2408 | if ((m->queue - m->pc) == PQ_CACHE) |
| 1b9d3514 | 2409 | vm_page_deactivate(m); |
| 54341a3b MD |
2410 | if (pprot & VM_PROT_WRITE) { |
| 2411 | /*vm_object_set_writeable_dirty(m->object);*/ | |
| 2412 | vm_set_nosync(m, entry); | |
| 2413 | if (fault_flags & VM_FAULT_DIRTY) { | |
| 2414 | vm_page_dirty(m); | |
| 2415 | /*XXX*/ | |
| 2416 | swap_pager_unswapped(m); | |
| 2417 | } | |
| 2418 | } | |
| 2421aac7 MD |
2419 | if (pprot & VM_PROT_WRITE) |
| 2420 | vm_set_nosync(m, entry); | |
| 1b9d3514 | 2421 | pmap_enter(pmap, addr, m, pprot, 0); |
| 54341a3b MD |
2422 | mycpu->gd_cnt.v_vm_faults++; |
| 2423 | if (curthread->td_lwp) | |
| 2424 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
| 1b9d3514 | 2425 | vm_page_wakeup(m); |
| b12defdc MD |
2426 | } else { |
| 2427 | vm_page_wakeup(m); | |
| 1b9d3514 MD |
2428 | } |
| 2429 | } | |
| b12defdc | 2430 | vm_object_chain_release(object); |
| a31129d8 | 2431 | vm_object_drop(object); |
| 1b9d3514 | 2432 | } |
| 54341a3b MD |
2433 | |
| 2434 | static void | |
| 2435 | vm_prefault_quick(pmap_t pmap, vm_offset_t addra, | |
| 2436 | vm_map_entry_t entry, int prot, int fault_flags) | |
| 2437 | { | |
| 2438 | struct lwp *lp; | |
| 2439 | vm_page_t m; | |
| 2440 | vm_offset_t addr; | |
| 2441 | vm_pindex_t pindex; | |
| 2442 | vm_object_t object; | |
| 2443 | int i; | |
| 2444 | int noneg; | |
| 2445 | int nopos; | |
| 2446 | int maxpages; | |
| 2447 | ||
| 2448 | /* | |
| 2449 | * Get stable max count value, disabled if set to 0 | |
| 2450 | */ | |
| 2451 | maxpages = vm_prefault_pages; | |
| 2452 | cpu_ccfence(); | |
| 2453 | if (maxpages <= 0) | |
| 2454 | return; | |
| 2455 | ||
| 2456 | /* | |
| 2457 | * We do not currently prefault mappings that use virtual page | |
| 2458 | * tables. We do not prefault foreign pmaps. | |
| 2459 | */ | |
| 2460 | if (entry->maptype == VM_MAPTYPE_VPAGETABLE) | |
| 2461 | return; | |
| 2462 | lp = curthread->td_lwp; | |
| 2463 | if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) | |
| 2464 | return; | |
| 2465 | ||
| 2466 | /* | |
| 2467 | * Limit pre-fault count to 1024 pages. | |
| 2468 | */ | |
| 2469 | if (maxpages > 1024) | |
| 2470 | maxpages = 1024; | |
| 2471 | ||
| 2472 | object = entry->object.vm_object; | |
| 2473 | ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); | |
| 2474 | KKASSERT(object->backing_object == NULL); | |
| 2475 | ||
| 2476 | noneg = 0; | |
| 2477 | nopos = 0; | |
| 2478 | for (i = 0; i < maxpages; ++i) { | |
| 2479 | int error; | |
| 2480 | ||
| 2481 | /* | |
| 2482 | * Calculate the page to pre-fault, stopping the scan in | |
| 2483 | * each direction separately if the limit is reached. | |
| 2484 | */ | |
| 2485 | if (i & 1) { | |
| 2486 | if (noneg) | |
| 2487 | continue; | |
| 2488 | addr = addra - ((i + 1) >> 1) * PAGE_SIZE; | |
| 2489 | } else { | |
| 2490 | if (nopos) | |
| 2491 | continue; | |
| 2492 | addr = addra + ((i + 2) >> 1) * PAGE_SIZE; | |
| 2493 | } | |
| 2494 | if (addr < entry->start) { | |
| 2495 | noneg = 1; | |
| 2496 | if (noneg && nopos) | |
| 2497 | break; | |
| 2498 | continue; | |
| 2499 | } | |
| 2500 | if (addr >= entry->end) { | |
| 2501 | nopos = 1; | |
| 2502 | if (noneg && nopos) | |
| 2503 | break; | |
| 2504 | continue; | |
| 2505 | } | |
| 2506 | ||
| 2507 | /* | |
| 2508 | * Skip pages already mapped, and stop scanning in that | |
| 2509 | * direction. When the scan terminates in both directions | |
| 2510 | * we are done. | |
| 2511 | */ | |
| 2512 | if (pmap_prefault_ok(pmap, addr) == 0) { | |
| 2513 | if (i & 1) | |
| 2514 | noneg = 1; | |
| 2515 | else | |
| 2516 | nopos = 1; | |
| 2517 | if (noneg && nopos) | |
| 2518 | break; | |
| 2519 | continue; | |
| 2520 | } | |
| 2521 | ||
| 2522 | /* | |
| 2523 | * Follow the VM object chain to obtain the page to be mapped | |
| 2524 | * into the pmap. This version of the prefault code only | |
| 2525 | * works with terminal objects. | |
| 2526 | * | |
| 2527 | * WARNING! We cannot call swap_pager_unswapped() with a | |
| 2528 | * shared token. | |
| 2529 | */ | |
| 2530 | pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; | |
| 2531 | ||
| 2532 | m = vm_page_lookup_busy_try(object, pindex, TRUE, &error); | |
| 2533 | if (m == NULL || error) | |
| 2534 | continue; | |
| 2535 | ||
| 2536 | if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && | |
| 2537 | (m->flags & PG_FICTITIOUS) == 0 && | |
| 2538 | ((m->flags & PG_SWAPPED) == 0 || | |
| 2539 | (prot & VM_PROT_WRITE) == 0 || | |
| 2540 | (fault_flags & VM_FAULT_DIRTY) == 0)) { | |
| 2541 | /* | |
| 2542 | * A fully valid page not undergoing soft I/O can | |
| 2543 | * be immediately entered into the pmap. | |
| 2544 | */ | |
| 2545 | if ((m->queue - m->pc) == PQ_CACHE) | |
| 2546 | vm_page_deactivate(m); | |
| 2547 | if (prot & VM_PROT_WRITE) { | |
| 2548 | vm_object_set_writeable_dirty(m->object); | |
| 2549 | vm_set_nosync(m, entry); | |
| 2550 | if (fault_flags & VM_FAULT_DIRTY) { | |
| 2551 | vm_page_dirty(m); | |
| 2552 | /*XXX*/ | |
| 2553 | swap_pager_unswapped(m); | |
| 2554 | } | |
| 2555 | } | |
| 2556 | pmap_enter(pmap, addr, m, prot, 0); | |
| 2557 | mycpu->gd_cnt.v_vm_faults++; | |
| 2558 | if (curthread->td_lwp) | |
| 2559 | ++curthread->td_lwp->lwp_ru.ru_minflt; | |
| 2560 | } | |
| 2561 | vm_page_wakeup(m); | |
| 2562 | } | |
| 2563 | } |