kernel - Refactor cpumask_t to extend cpus past 64, part 2/2
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1991, 1993
5  *      The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  *
11  *
12  * This code is derived from software contributed to Berkeley by
13  * The Mach Operating System project at Carnegie-Mellon University.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  *      from: @(#)vm_fault.c    8.4 (Berkeley) 1/12/94
40  *
41  *
42  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
43  * All rights reserved.
44  *
45  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
46  *
47  * Permission to use, copy, modify and distribute this software and
48  * its documentation is hereby granted, provided that both the copyright
49  * notice and this permission notice appear in all copies of the
50  * software, derivative works or modified versions, and any portions
51  * thereof, and that both notices appear in supporting documentation.
52  *
53  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
54  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
55  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
56  *
57  * Carnegie Mellon requests users of this software to return to
58  *
59  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
60  *  School of Computer Science
61  *  Carnegie Mellon University
62  *  Pittsburgh PA 15213-3890
63  *
64  * any improvements or extensions that they make and grant Carnegie the
65  * rights to redistribute these changes.
66  *
67  * $FreeBSD: src/sys/vm/vm_fault.c,v 1.108.2.8 2002/02/26 05:49:27 silby Exp $
68  * $DragonFly: src/sys/vm/vm_fault.c,v 1.47 2008/07/01 02:02:56 dillon Exp $
69  */
70
71 /*
72  *      Page fault handling module.
73  */
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/kernel.h>
78 #include <sys/proc.h>
79 #include <sys/vnode.h>
80 #include <sys/resourcevar.h>
81 #include <sys/vmmeter.h>
82 #include <sys/vkernel.h>
83 #include <sys/lock.h>
84 #include <sys/sysctl.h>
85
86 #include <cpu/lwbuf.h>
87
88 #include <vm/vm.h>
89 #include <vm/vm_param.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_object.h>
93 #include <vm/vm_page.h>
94 #include <vm/vm_pageout.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_pager.h>
97 #include <vm/vnode_pager.h>
98 #include <vm/vm_extern.h>
99
100 #include <sys/thread2.h>
101 #include <vm/vm_page2.h>
102
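/*
 * Summary of the faultstate locking fields below (derived from their use
 * in this file): first_object/first_m track the top-level object from the
 * map lookup and any COW page allocated against it, object/m track the
 * object currently being probed in the backing chain, shared/first_shared
 * indicate whether the respective object locks are (or may be) held
 * shared rather than exclusive, and lookup_still_valid indicates whether
 * fs.map is still read-locked from the vm_map_lookup().
 */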
103 struct faultstate {
104         vm_page_t m;
105         vm_object_t object;
106         vm_pindex_t pindex;
107         vm_prot_t prot;
108         vm_page_t first_m;
109         vm_object_t first_object;
110         vm_prot_t first_prot;
111         vm_map_t map;
112         vm_map_entry_t entry;
113         int lookup_still_valid;
114         int hardfault;
115         int fault_flags;
116         int map_generation;
117         int shared;
118         int first_shared;
119         boolean_t wired;
120         struct vnode *vp;
121 };
122
123 static int debug_fault = 0;
124 SYSCTL_INT(_vm, OID_AUTO, debug_fault, CTLFLAG_RW, &debug_fault, 0, "");
125 static int debug_cluster = 0;
126 SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
127 int vm_shared_fault = 1;
128 TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
129 SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW, &vm_shared_fault, 0,
130            "Allow shared token on vm_object");
131 static long vm_shared_hit = 0;
132 SYSCTL_LONG(_vm, OID_AUTO, shared_hit, CTLFLAG_RW, &vm_shared_hit, 0,
133            "Successful shared faults");
134 static long vm_shared_count = 0;
135 SYSCTL_LONG(_vm, OID_AUTO, shared_count, CTLFLAG_RW, &vm_shared_count, 0,
136            "Shared fault attempts");
137 static long vm_shared_miss = 0;
138 SYSCTL_LONG(_vm, OID_AUTO, shared_miss, CTLFLAG_RW, &vm_shared_miss, 0,
139            "Unsuccessful shared faults");
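
/*
 * vm.shared_fault can be set as a loader tunable or changed at run time
 * via sysctl; the vm.shared_hit, vm.shared_count and vm.shared_miss
 * counters above track how often shared-token faults are attempted and
 * how often they succeed.
 */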
140
141 static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t, int);
142 static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *,
143                         vpte_t, int, int);
144 #if 0
145 static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *);
146 #endif
147 static void vm_set_nosync(vm_page_t m, vm_map_entry_t entry);
148 static void vm_prefault(pmap_t pmap, vm_offset_t addra,
149                         vm_map_entry_t entry, int prot, int fault_flags);
150 static void vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
151                         vm_map_entry_t entry, int prot, int fault_flags);
152
153 static __inline void
154 release_page(struct faultstate *fs)
155 {
156         vm_page_deactivate(fs->m);
157         vm_page_wakeup(fs->m);
158         fs->m = NULL;
159 }
160
161 /*
162  * NOTE: Once unlocked, any cached fs->entry becomes invalid; any reuse
163  *       requires relocking and then checking the timestamp.
164  *
165  * NOTE: vm_map_lock_read() does not bump fs->map->timestamp so we do
166  *       not have to update fs->map_generation here.
167  *
168  * NOTE: This function can fail due to a deadlock against the caller's
169  *       holding of a vm_page BUSY.
170  */
171 static __inline int
172 relock_map(struct faultstate *fs)
173 {
174         int error;
175
176         if (fs->lookup_still_valid == FALSE && fs->map) {
177                 error = vm_map_lock_read_to(fs->map);
178                 if (error == 0)
179                         fs->lookup_still_valid = TRUE;
180         } else {
181                 error = 0;
182         }
183         return error;
184 }
185
186 static __inline void
187 unlock_map(struct faultstate *fs)
188 {
189         if (fs->lookup_still_valid && fs->map) {
190                 vm_map_lookup_done(fs->map, fs->entry, 0);
191                 fs->lookup_still_valid = FALSE;
192         }
193 }
194
195 /*
196  * Clean up after a successful call to vm_fault_object() so another call
197  * to vm_fault_object() can be made.
198  */
199 static void
200 _cleanup_successful_fault(struct faultstate *fs, int relock)
201 {
202         /*
203          * We allocated a junk page for a COW operation that did
204          * not occur; the page must be freed.
205          */
206         if (fs->object != fs->first_object) {
207                 KKASSERT(fs->first_shared == 0);
208                 vm_page_free(fs->first_m);
209                 vm_object_pip_wakeup(fs->object);
210                 fs->first_m = NULL;
211         }
212
213         /*
214          * Reset fs->object.
215          */
216         fs->object = fs->first_object;
217         if (relock && fs->lookup_still_valid == FALSE) {
218                 if (fs->map)
219                         vm_map_lock_read(fs->map);
220                 fs->lookup_still_valid = TRUE;
221         }
222 }
223
224 static void
225 _unlock_things(struct faultstate *fs, int dealloc)
226 {
227         _cleanup_successful_fault(fs, 0);
228         if (dealloc) {
229                 /*vm_object_deallocate(fs->first_object);*/
230                 /*fs->first_object = NULL; drop used later on */
231         }
232         unlock_map(fs); 
233         if (fs->vp != NULL) { 
234                 vput(fs->vp);
235                 fs->vp = NULL;
236         }
237 }
238
239 #define unlock_things(fs) _unlock_things(fs, 0)
240 #define unlock_and_deallocate(fs) _unlock_things(fs, 1)
241 #define cleanup_successful_fault(fs) _cleanup_successful_fault(fs, 1)
242
243 /*
244  * TRYPAGER 
245  *
246  * Determine if the pager for the current object *might* contain the page.
247  *
248  * We only need to try the pager if this is not a default object (default
249  * objects are zero-fill and have no real pager), and if we are either not
250  * taking a wiring fault or the map entry is already wired.
251  */
252 #define TRYPAGER(fs)    \
253                 (fs->object->type != OBJT_DEFAULT && \
254                 (((fs->fault_flags & VM_FAULT_WIRE_MASK) == 0) || fs->wired))
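
/*
 * For example (illustrative): a plain anonymous mapping is backed by an
 * OBJT_DEFAULT object, so TRYPAGER() is false and a missing page is simply
 * zero-fill allocated further below, whereas a file mmap is backed by an
 * OBJT_VNODE object and the vnode pager is asked for the missing page.
 */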
255
256 /*
257  * vm_fault:
258  *
259  * Handle a page fault occurring at the given address, requiring the given
260  * permissions, in the map specified.  If successful, the page is inserted
261  * into the associated physical map.
262  *
263  * NOTE: The given address should be truncated to the proper page address.
264  *
265  * KERN_SUCCESS is returned if the page fault is handled; otherwise,
266  * a standard error specifying why the fault is fatal is returned.
267  *
268  * The map in question must be referenced, and remains so.
269  * The caller may hold no locks.
270  * No other requirements.
271  */
272 int
273 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
274 {
275         int result;
276         vm_pindex_t first_pindex;
277         struct faultstate fs;
278         struct lwp *lp;
279         int growstack;
280         int retry = 0;
281         int inherit_prot;
282
283         inherit_prot = fault_type & VM_PROT_NOSYNC;
284         vm_page_pcpu_cache();
285         fs.hardfault = 0;
286         fs.fault_flags = fault_flags;
287         fs.vp = NULL;
288         fs.shared = vm_shared_fault;
289         fs.first_shared = vm_shared_fault;
290         growstack = 1;
291         if (vm_shared_fault)
292                 ++vm_shared_count;
293
294         /*
295          * vm_map interactions
296          */
297         if ((lp = curthread->td_lwp) != NULL)
298                 lp->lwp_flags |= LWP_PAGING;
299         lwkt_gettoken(&map->token);
300
301 RetryFault:
302         /*
303          * Find the vm_map_entry representing the backing store and resolve
304          * the top level object and page index.  This may have the side
305          * effect of executing a copy-on-write on the map entry and/or
306          * creating a shadow object, but will not COW any actual VM pages.
307          *
308          * On success fs.map is left read-locked and various other fields 
309          * are initialized but not otherwise referenced or locked.
310          *
311          * NOTE!  vm_map_lookup will try to upgrade the fault_type to
312          * VM_PROT_WRITE if the map entry is a virtual page table and also
313          * writable, so we can set the 'A' (accessed) bit in the virtual page
314          * table entry.
315          */
316         fs.map = map;
317         result = vm_map_lookup(&fs.map, vaddr, fault_type,
318                                &fs.entry, &fs.first_object,
319                                &first_pindex, &fs.first_prot, &fs.wired);
320
321         /*
322          * If the lookup failed or the map protections are incompatible,
323          * the fault generally fails.  However, if the caller is trying
324          * to do a user wiring we have more work to do.
325          */
326         if (result != KERN_SUCCESS) {
327                 if (result != KERN_PROTECTION_FAILURE ||
328                     (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
329                 {
330                         if (result == KERN_INVALID_ADDRESS && growstack &&
331                             map != &kernel_map && curproc != NULL) {
332                                 result = vm_map_growstack(curproc, vaddr);
333                                 if (result == KERN_SUCCESS) {
334                                         growstack = 0;
335                                         ++retry;
336                                         goto RetryFault;
337                                 }
338                                 result = KERN_FAILURE;
339                         }
340                         goto done;
341                 }
342
343                 /*
344                  * If we are user-wiring a r/w segment, and it is COW, then
345                  * we need to do the COW operation.  Note that we don't
346                  * currently COW RO sections, because it is NOT desirable
347                  * to COW .text.  We simply keep .text from ever being COW'ed
348                  * and take the heat that one cannot debug wired .text sections.
349                  */
350                 result = vm_map_lookup(&fs.map, vaddr,
351                                        VM_PROT_READ|VM_PROT_WRITE|
352                                         VM_PROT_OVERRIDE_WRITE,
353                                        &fs.entry, &fs.first_object,
354                                        &first_pindex, &fs.first_prot,
355                                        &fs.wired);
356                 if (result != KERN_SUCCESS) {
357                         result = KERN_FAILURE;
358                         goto done;
359                 }
360
361                 /*
362                  * If we don't COW now, on a user wire, the user will never
363                  * be able to write to the mapping.  If we don't make this
364                  * restriction, the bookkeeping would be nearly impossible.
365                  *
366                  * XXX We have a shared lock, this will have a MP race but
367                  * I don't see how it can hurt anything.
368                  */
369                 if ((fs.entry->protection & VM_PROT_WRITE) == 0)
370                         fs.entry->max_protection &= ~VM_PROT_WRITE;
371         }
372
373         /*
374          * fs.map is read-locked
375          *
376          * Misc checks.  Save the map generation number to detect races.
377          */
378         fs.map_generation = fs.map->timestamp;
379         fs.lookup_still_valid = TRUE;
380         fs.first_m = NULL;
381         fs.object = fs.first_object;    /* so unlock_and_deallocate works */
382
383         if (fs.entry->eflags & (MAP_ENTRY_NOFAULT | MAP_ENTRY_KSTACK)) {
384                 if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
385                         panic("vm_fault: fault on nofault entry, addr: %p",
386                               (void *)vaddr);
387                 }
388                 if ((fs.entry->eflags & MAP_ENTRY_KSTACK) &&
389                     vaddr >= fs.entry->start &&
390                     vaddr < fs.entry->start + PAGE_SIZE) {
391                         panic("vm_fault: fault on stack guard, addr: %p",
392                               (void *)vaddr);
393                 }
394         }
395
396         /*
397          * A system map entry may return a NULL object.  No object means
398          * no pager means an unrecoverable kernel fault.
399          */
400         if (fs.first_object == NULL) {
401                 panic("vm_fault: unrecoverable fault at %p in entry %p",
402                         (void *)vaddr, fs.entry);
403         }
404
405         /*
406          * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
407          * is set.
408          */
409         if ((curthread->td_flags & TDF_NOFAULT) &&
410             (retry ||
411              fs.first_object->type == OBJT_VNODE ||
412              fs.first_object->backing_object)) {
413                 result = KERN_FAILURE;
414                 unlock_things(&fs);
415                 goto done2;
416         }
417
418         /*
419          * If the entry is wired we cannot change the page protection.
420          */
421         if (fs.wired)
422                 fault_type = fs.first_prot;
423
424         /*
425          * We generally want to avoid unnecessary exclusive modes on backing
426          * and terminal objects because this can seriously interfere with
427          * heavily fork()'d processes (particularly /bin/sh scripts).
428          *
429          * However, we also want to avoid unnecessary retries due to needed
430          * shared->exclusive promotion for common faults.  Exclusive mode is
431          * always needed if any page insertion, rename, or free occurs in an
432          * object (and also indirectly if any I/O is done).
433          *
434          * The main issue here is going to be fs.first_shared.  If the
435          * first_object has a backing object which isn't shadowed and the
436          * process is single-threaded we might as well use an exclusive
437          * lock/chain right off the bat.
438          */
439         if (fs.first_shared && fs.first_object->backing_object &&
440             LIST_EMPTY(&fs.first_object->shadow_head) &&
441             curthread->td_proc && curthread->td_proc->p_nthreads == 1) {
442                 fs.first_shared = 0;
443         }
444
445         /*
446          * swap_pager_unswapped() needs an exclusive object
447          */
448         if (fault_flags & (VM_FAULT_UNSWAP | VM_FAULT_DIRTY)) {
449                 fs.first_shared = 0;
450         }
451
452         /*
453          * Obtain a top-level object lock, shared or exclusive depending
454          * on fs.first_shared.  If a shared lock winds up being insufficient
455          * we will retry with an exclusive lock.
456          *
457          * The vnode pager lock is always shared.
458          */
459         if (fs.first_shared)
460                 vm_object_hold_shared(fs.first_object);
461         else
462                 vm_object_hold(fs.first_object);
463         if (fs.vp == NULL)
464                 fs.vp = vnode_pager_lock(fs.first_object);
465
466         /*
467          * The page we want is at (first_object, first_pindex), but if the
468          * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
469          * page table to figure out the actual pindex.
470          *
471          * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
472          * ONLY
473          */
474         if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
475                 result = vm_fault_vpagetable(&fs, &first_pindex,
476                                              fs.entry->aux.master_pde,
477                                              fault_type, 1);
478                 if (result == KERN_TRY_AGAIN) {
479                         vm_object_drop(fs.first_object);
480                         ++retry;
481                         goto RetryFault;
482                 }
483                 if (result != KERN_SUCCESS)
484                         goto done;
485         }
486
487         /*
488          * Now we have the actual (object, pindex), fault in the page.  If
489          * vm_fault_object() fails it will unlock and deallocate the FS
490          * data.   If it succeeds everything remains locked and fs->object
491          * will have an additional PIP count if it is not equal to
492          * fs->first_object
493          *
494          * vm_fault_object will set fs->prot for the pmap operation.  It is
495          * allowed to set VM_PROT_WRITE even when fault_type == VM_PROT_READ,
496          * provided the page can be safely written.  However, it will force a
497          * read-only mapping for a read fault if the memory is managed by a
498          * virtual page table.
499          *
500          * If the fault code uses the shared object lock shortcut
501          * we must not try to burst (we can't allocate VM pages).
502          */
503         result = vm_fault_object(&fs, first_pindex, fault_type, 1);
504
505         if (debug_fault > 0) {
506                 --debug_fault;
507                 kprintf("VM_FAULT result %d addr=%jx type=%02x flags=%02x "
508                         "fs.m=%p fs.prot=%02x fs.wired=%02x fs.entry=%p\n",
509                         result, (intmax_t)vaddr, fault_type, fault_flags,
510                         fs.m, fs.prot, fs.wired, fs.entry);
511         }
512
513         if (result == KERN_TRY_AGAIN) {
514                 vm_object_drop(fs.first_object);
515                 ++retry;
516                 goto RetryFault;
517         }
518         if (result != KERN_SUCCESS)
519                 goto done;
520
521         /*
522          * On success vm_fault_object() does not unlock or deallocate, and fs.m
523          * will contain a busied page.
524          *
525          * Enter the page into the pmap and do pmap-related adjustments.
526          */
527         KKASSERT(fs.lookup_still_valid == TRUE);
528         vm_page_flag_set(fs.m, PG_REFERENCED);
529         pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot | inherit_prot,
530                    fs.wired, fs.entry);
531         mycpu->gd_cnt.v_vm_faults++;
532         if (curthread->td_lwp)
533                 ++curthread->td_lwp->lwp_ru.ru_minflt;
534
535         /*KKASSERT(fs.m->queue == PQ_NONE); page-in op may deactivate page */
536         KKASSERT(fs.m->flags & PG_BUSY);
537
538         /*
539          * If the page is not wired down, then put it where the pageout daemon
540          * can find it.
541          */
542         if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
543                 if (fs.wired)
544                         vm_page_wire(fs.m);
545                 else
546                         vm_page_unwire(fs.m, 1);
547         } else {
548                 vm_page_activate(fs.m);
549         }
550         vm_page_wakeup(fs.m);
551
552         /*
553          * Burst in a few more pages if possible.  The fs.map should still
554          * be locked.  To avoid interlocking against a vnode->getblk
555          * operation we had to be sure to unbusy our primary vm_page above
556          * first.
557          *
558          * A normal burst can continue down backing store but should only be
559          * executed while holding an exclusive lock, otherwise the exclusive
560          * locks the burst code acquires might cause excessive SMP collisions.
561          *
562          * A quick burst can be utilized when there is no backing object
563          * (i.e. a shared file mmap).
564          */
565         if ((fault_flags & VM_FAULT_BURST) &&
566             (fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 &&
567             fs.wired == 0) {
568                 if (fs.first_shared == 0 && fs.shared == 0) {
569                         vm_prefault(fs.map->pmap, vaddr,
570                                     fs.entry, fs.prot, fault_flags);
571                 } else {
572                         vm_prefault_quick(fs.map->pmap, vaddr,
573                                           fs.entry, fs.prot, fault_flags);
574                 }
575         }
576
577         /*
578          * Unlock everything, and return
579          */
580         unlock_things(&fs);
581
582         if (curthread->td_lwp) {
583                 if (fs.hardfault) {
584                         curthread->td_lwp->lwp_ru.ru_majflt++;
585                 } else {
586                         curthread->td_lwp->lwp_ru.ru_minflt++;
587                 }
588         }
589
590         /*vm_object_deallocate(fs.first_object);*/
591         /*fs.m = NULL; */
592         /*fs.first_object = NULL; must still drop later */
593
594         result = KERN_SUCCESS;
595 done:
596         if (fs.first_object)
597                 vm_object_drop(fs.first_object);
598 done2:
599         lwkt_reltoken(&map->token);
600         if (lp)
601                 lp->lwp_flags &= ~LWP_PAGING;
602         if (vm_shared_fault && fs.shared == 0)
603                 ++vm_shared_miss;
604         return (result);
605 }
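
/*
 * Illustrative sketch of a caller (hypothetical, not part of this file):
 * the machine-dependent trap handler resolves the faulting address and
 * access type and calls vm_fault() roughly as follows; 'fault_addr',
 * 'is_write' and 'usermode' are placeholder names.
 *
 *	vm_offset_t va = trunc_page(fault_addr);
 *	vm_prot_t ftype = is_write ? VM_PROT_WRITE : VM_PROT_READ;
 *	int rv = vm_fault(&curproc->p_vmspace->vm_map, va, ftype,
 *			  usermode ? VM_FAULT_BURST : VM_FAULT_NORMAL);
 *
 * If KERN_SUCCESS is not returned, the trap code converts the error into
 * a signal (SIGSEGV/SIGBUS) or a kernel panic as appropriate.
 */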
606
607 /*
608  * Fault in the specified virtual address in the current process map, 
609  * returning a held VM page or NULL.  See vm_fault_page() for more 
610  * information.
611  *
612  * No requirements.
613  */
614 vm_page_t
615 vm_fault_page_quick(vm_offset_t va, vm_prot_t fault_type, int *errorp)
616 {
617         struct lwp *lp = curthread->td_lwp;
618         vm_page_t m;
619
620         m = vm_fault_page(&lp->lwp_vmspace->vm_map, va, 
621                           fault_type, VM_FAULT_NORMAL, errorp);
622         return(m);
623 }
624
625 /*
626  * Fault in the specified virtual address in the specified map, doing all
627  * necessary manipulation of the object store and all necessary I/O.  Return
628  * a held VM page or NULL, and set *errorp.  The related pmap is not
629  * updated.
630  *
631  * The returned page will be properly dirtied if VM_PROT_WRITE was specified,
632  * and marked PG_REFERENCED as well.
633  *
634  * If the page cannot be faulted writable and VM_PROT_WRITE was specified, an
635  * error will be returned.
636  *
637  * No requirements.
638  */
639 vm_page_t
640 vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
641               int fault_flags, int *errorp)
642 {
643         vm_pindex_t first_pindex;
644         struct faultstate fs;
645         int result;
646         int retry = 0;
647         vm_prot_t orig_fault_type = fault_type;
648
649         fs.hardfault = 0;
650         fs.fault_flags = fault_flags;
651         KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
652
653         /*
654          * Dive the pmap (concurrency possible).  If we find the
655          * appropriate page we can terminate early and quickly.
656          */
657         fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type);
658         if (fs.m) {
659                 *errorp = 0;
660                 return(fs.m);
661         }
662
663         /*
664          * Otherwise take a concurrency hit and do a formal page
665          * fault.
666          */
667         fs.shared = vm_shared_fault;
668         fs.first_shared = vm_shared_fault;
669         fs.vp = NULL;
670         lwkt_gettoken(&map->token);
671
672         /*
673          * swap_pager_unswapped() needs an exclusive object
674          */
675         if (fault_flags & (VM_FAULT_UNSWAP | VM_FAULT_DIRTY)) {
676                 fs.first_shared = 0;
677         }
678
679 RetryFault:
680         /*
681          * Find the vm_map_entry representing the backing store and resolve
682          * the top level object and page index.  This may have the side
683          * effect of executing a copy-on-write on the map entry and/or
684          * creating a shadow object, but will not COW any actual VM pages.
685          *
686          * On success fs.map is left read-locked and various other fields 
687          * are initialized but not otherwise referenced or locked.
688          *
689          * NOTE!  vm_map_lookup will upgrade the fault_type to VM_PROT_WRITE
690          * if the map entry is a virtual page table and also writable,
691          * so we can set the 'A' (accessed) bit in the virtual page table entry.
692          */
693         fs.map = map;
694         result = vm_map_lookup(&fs.map, vaddr, fault_type,
695                                &fs.entry, &fs.first_object,
696                                &first_pindex, &fs.first_prot, &fs.wired);
697
698         if (result != KERN_SUCCESS) {
699                 *errorp = result;
700                 fs.m = NULL;
701                 goto done;
702         }
703
704         /*
705          * fs.map is read-locked
706          *
707          * Misc checks.  Save the map generation number to detect races.
708          */
709         fs.map_generation = fs.map->timestamp;
710         fs.lookup_still_valid = TRUE;
711         fs.first_m = NULL;
712         fs.object = fs.first_object;    /* so unlock_and_deallocate works */
713
714         if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
715                 panic("vm_fault: fault on nofault entry, addr: %lx",
716                     (u_long)vaddr);
717         }
718
719         /*
720          * A system map entry may return a NULL object.  No object means
721          * no pager means an unrecoverable kernel fault.
722          */
723         if (fs.first_object == NULL) {
724                 panic("vm_fault: unrecoverable fault at %p in entry %p",
725                         (void *)vaddr, fs.entry);
726         }
727
728         /*
729          * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
730          * is set.
731          */
732         if ((curthread->td_flags & TDF_NOFAULT) &&
733             (retry ||
734              fs.first_object->type == OBJT_VNODE ||
735              fs.first_object->backing_object)) {
736                 *errorp = KERN_FAILURE;
737                 unlock_things(&fs);
738                 goto done2;
739         }
740
741         /*
742          * If the entry is wired we cannot change the page protection.
743          */
744         if (fs.wired)
745                 fault_type = fs.first_prot;
746
747         /*
748          * Make a reference to this object to prevent its disposal while we
749          * are messing with it.  Once we have the reference, the map is free
750          * to be diddled.  Since objects reference their shadows (and copies),
751          * they will stay around as well.
752          *
753          * The reference should also prevent an unexpected collapse of the
754          * parent that might move pages from the current object into the
755          * parent unexpectedly, resulting in corruption.
756          *
757          * Bump the paging-in-progress count to prevent size changes (e.g.
758          * truncation operations) during I/O.  This must be done after
759          * obtaining the vnode lock in order to avoid possible deadlocks.
760          */
761         if (fs.first_shared)
762                 vm_object_hold_shared(fs.first_object);
763         else
764                 vm_object_hold(fs.first_object);
765         if (fs.vp == NULL)
766                 fs.vp = vnode_pager_lock(fs.first_object);      /* shared */
767
768         /*
769          * The page we want is at (first_object, first_pindex), but if the
770          * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
771          * page table to figure out the actual pindex.
772          *
773          * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
774          * ONLY
775          */
776         if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
777                 result = vm_fault_vpagetable(&fs, &first_pindex,
778                                              fs.entry->aux.master_pde,
779                                              fault_type, 1);
780                 if (result == KERN_TRY_AGAIN) {
781                         vm_object_drop(fs.first_object);
782                         ++retry;
783                         goto RetryFault;
784                 }
785                 if (result != KERN_SUCCESS) {
786                         *errorp = result;
787                         fs.m = NULL;
788                         goto done;
789                 }
790         }
791
792         /*
793          * Now we have the actual (object, pindex), fault in the page.  If
794          * vm_fault_object() fails it will unlock and deallocate the FS
795          * data.   If it succeeds everything remains locked and fs->object
796          * will have an additional PIP count if it is not equal to
797          * fs->first_object
798          */
799         fs.m = NULL;
800         result = vm_fault_object(&fs, first_pindex, fault_type, 1);
801
802         if (result == KERN_TRY_AGAIN) {
803                 vm_object_drop(fs.first_object);
804                 ++retry;
805                 goto RetryFault;
806         }
807         if (result != KERN_SUCCESS) {
808                 *errorp = result;
809                 fs.m = NULL;
810                 goto done;
811         }
812
813         if ((orig_fault_type & VM_PROT_WRITE) &&
814             (fs.prot & VM_PROT_WRITE) == 0) {
815                 *errorp = KERN_PROTECTION_FAILURE;
816                 unlock_and_deallocate(&fs);
817                 fs.m = NULL;
818                 goto done;
819         }
820
821         /*
822          * DO NOT UPDATE THE PMAP!!!  This function may be called for
823          * a pmap unrelated to the current process pmap, in which case
824          * the current cpu core will not be listed in the pmap's pm_active
825          * mask.  Thus invalidation interlocks will fail to work properly.
826          *
827          * (for example, 'ps' uses procfs to read program arguments from
828          * each process's stack).
829          *
830          * In addition to the above this function will be called to acquire
831          * a page that might already be faulted in, re-faulting it
832          * continuously is a waste of time.
833          *
834          * XXX could this have been the cause of our random seg-fault
835          *     issues?  procfs accesses user stacks.
836          */
837         vm_page_flag_set(fs.m, PG_REFERENCED);
838 #if 0
839         pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired, NULL);
840         mycpu->gd_cnt.v_vm_faults++;
841         if (curthread->td_lwp)
842                 ++curthread->td_lwp->lwp_ru.ru_minflt;
843 #endif
844
845         /*
846          * On success vm_fault_object() does not unlock or deallocate, and fs.m
847          * will contain a busied page.  So we must unlock here after having
848          * messed with the pmap.
849          */
850         unlock_things(&fs);
851
852         /*
853          * Return a held page.  We are not doing any pmap manipulation so do
854          * not set PG_MAPPED.  However, adjust the page flags according to
855          * the fault type because the caller may not use a managed pmapping
856          * (so we don't want to lose the fact that the page will be dirtied
857          * if a write fault was specified).
858          */
859         vm_page_hold(fs.m);
860         vm_page_activate(fs.m);
861         if (fault_type & VM_PROT_WRITE)
862                 vm_page_dirty(fs.m);
863
864         if (curthread->td_lwp) {
865                 if (fs.hardfault) {
866                         curthread->td_lwp->lwp_ru.ru_majflt++;
867                 } else {
868                         curthread->td_lwp->lwp_ru.ru_minflt++;
869                 }
870         }
871
872         /*
873          * Unlock everything, and return the held page.
874          */
875         vm_page_wakeup(fs.m);
876         /*vm_object_deallocate(fs.first_object);*/
877         /*fs.first_object = NULL; */
878         *errorp = 0;
879
880 done:
881         if (fs.first_object)
882                 vm_object_drop(fs.first_object);
883 done2:
884         lwkt_reltoken(&map->token);
885         return(fs.m);
886 }
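
/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * code that needs to inspect another process's memory, e.g. a procfs-style
 * reader, might use vm_fault_page() like this and copy the data out via a
 * temporary kernel mapping:
 *
 *	int error;
 *	vm_page_t m;
 *
 *	m = vm_fault_page(&p->p_vmspace->vm_map, trunc_page(uva),
 *			  VM_PROT_READ, VM_FAULT_NORMAL, &error);
 *	if (m) {
 *		... map with lwbuf_alloc(), copy the data, lwbuf_free() ...
 *		vm_page_unhold(m);
 *	}
 */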
887
888 /*
889  * Fault in the specified (object,offset), dirty the returned page as
890  * needed.  If the requested fault_type cannot be satisfied, NULL is
891  * returned and an error is set in *errorp.
892  *
893  * A held (but not busied) page is returned.
894  *
895  * The passed in object must be held as specified by the shared
896  * argument.
897  */
898 vm_page_t
899 vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
900                      vm_prot_t fault_type, int fault_flags,
901                      int *sharedp, int *errorp)
902 {
903         int result;
904         vm_pindex_t first_pindex;
905         struct faultstate fs;
906         struct vm_map_entry entry;
907
908         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
909         bzero(&entry, sizeof(entry));
910         entry.object.vm_object = object;
911         entry.maptype = VM_MAPTYPE_NORMAL;
912         entry.protection = entry.max_protection = fault_type;
913
914         fs.hardfault = 0;
915         fs.fault_flags = fault_flags;
916         fs.map = NULL;
917         fs.shared = vm_shared_fault;
918         fs.first_shared = *sharedp;
919         fs.vp = NULL;
920         KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
921
922         /*
923          * Might require swap block adjustments
924          */
925         if (fs.first_shared && (fault_flags & (VM_FAULT_UNSWAP | VM_FAULT_DIRTY))) {
926                 fs.first_shared = 0;
927                 vm_object_upgrade(object);
928         }
929
930         /*
931          * Retry loop as needed (typically for shared->exclusive transitions)
932          */
933 RetryFault:
934         *sharedp = fs.first_shared;
935         first_pindex = OFF_TO_IDX(offset);
936         fs.first_object = object;
937         fs.entry = &entry;
938         fs.first_prot = fault_type;
939         fs.wired = 0;
940         /*fs.map_generation = 0; unused */
941
942         /*
943          * Make a reference to this object to prevent its disposal while we
944          * are messing with it.  Once we have the reference, the map is free
945          * to be diddled.  Since objects reference their shadows (and copies),
946          * they will stay around as well.
947          *
948          * The reference should also prevent an unexpected collapse of the
949          * parent that might move pages from the current object into the
950          * parent unexpectedly, resulting in corruption.
951          *
952          * Bump the paging-in-progress count to prevent size changes (e.g.
953          * truncation operations) during I/O.  This must be done after
954          * obtaining the vnode lock in order to avoid possible deadlocks.
955          */
956         if (fs.vp == NULL)
957                 fs.vp = vnode_pager_lock(fs.first_object);
958
959         fs.lookup_still_valid = TRUE;
960         fs.first_m = NULL;
961         fs.object = fs.first_object;    /* so unlock_and_deallocate works */
962
963 #if 0
964         /* XXX future - ability to operate on VM object using vpagetable */
965         if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
966                 result = vm_fault_vpagetable(&fs, &first_pindex,
967                                              fs.entry->aux.master_pde,
968                                              fault_type, 0);
969                 if (result == KERN_TRY_AGAIN) {
970                         if (fs.first_shared == 0 && *sharedp)
971                                 vm_object_upgrade(object);
972                         goto RetryFault;
973                 }
974                 if (result != KERN_SUCCESS) {
975                         *errorp = result;
976                         return (NULL);
977                 }
978         }
979 #endif
980
981         /*
982          * Now we have the actual (object, pindex), fault in the page.  If
983          * vm_fault_object() fails it will unlock and deallocate the FS
984          * data.   If it succeeds everything remains locked and fs->object
985          * will have an additinal PIP count if it is not equal to
986          * fs->first_object
987          *
988          * On KERN_TRY_AGAIN vm_fault_object() leaves fs.first_object intact.
989          * We may have to upgrade its lock to handle the requested fault.
990          */
991         result = vm_fault_object(&fs, first_pindex, fault_type, 0);
992
993         if (result == KERN_TRY_AGAIN) {
994                 if (fs.first_shared == 0 && *sharedp)
995                         vm_object_upgrade(object);
996                 goto RetryFault;
997         }
998         if (result != KERN_SUCCESS) {
999                 *errorp = result;
1000                 return(NULL);
1001         }
1002
1003         if ((fault_type & VM_PROT_WRITE) && (fs.prot & VM_PROT_WRITE) == 0) {
1004                 *errorp = KERN_PROTECTION_FAILURE;
1005                 unlock_and_deallocate(&fs);
1006                 return(NULL);
1007         }
1008
1009         /*
1010          * On success vm_fault_object() does not unlock or deallocate, so we
1011          * do it here.  Note that the returned fs.m will be busied.
1012          */
1013         unlock_things(&fs);
1014
1015         /*
1016          * Return a held page.  We are not doing any pmap manipulation so do
1017          * not set PG_MAPPED.  However, adjust the page flags according to
1018          * the fault type because the caller may not use a managed pmapping
1019          * (so we don't want to lose the fact that the page will be dirtied
1020          * if a write fault was specified).
1021          */
1022         vm_page_hold(fs.m);
1023         vm_page_activate(fs.m);
1024         if ((fault_type & VM_PROT_WRITE) || (fault_flags & VM_FAULT_DIRTY))
1025                 vm_page_dirty(fs.m);
1026         if (fault_flags & VM_FAULT_UNSWAP)
1027                 swap_pager_unswapped(fs.m);
1028
1029         /*
1030          * Indicate that the page was accessed.
1031          */
1032         vm_page_flag_set(fs.m, PG_REFERENCED);
1033
1034         if (curthread->td_lwp) {
1035                 if (fs.hardfault) {
1036                         curthread->td_lwp->lwp_ru.ru_majflt++;
1037                 } else {
1038                         curthread->td_lwp->lwp_ru.ru_minflt++;
1039                 }
1040         }
1041
1042         /*
1043          * Unlock everything, and return the held page.
1044          */
1045         vm_page_wakeup(fs.m);
1046         /*vm_object_deallocate(fs.first_object);*/
1047         /*fs.first_object = NULL; */
1048
1049         *errorp = 0;
1050         return(fs.m);
1051 }
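
/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * the object must already be held, shared or exclusive, as indicated by
 * *sharedp; on return *sharedp reflects whether the hold is still shared,
 * since the routine may upgrade it while retrying.
 *
 *	int shared = 1, error;
 *	vm_page_t m;
 *
 *	vm_object_hold_shared(object);
 *	m = vm_fault_object_page(object, offset, VM_PROT_READ,
 *				 VM_FAULT_NORMAL, &shared, &error);
 *	...use the held page...
 *	if (m)
 *		vm_page_unhold(m);
 *	vm_object_drop(object);
 */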
1052
1053 /*
1054  * Translate the virtual page number (first_pindex) that is relative
1055  * to the address space into a logical page number that is relative to the
1056  * backing object.  Use the virtual page table pointed to by (vpte).
1057  *
1058  * This implements an N-level page table.  Any level can terminate the
1059  * scan by setting VPTE_PS.   A linear mapping is accomplished by setting
1060  * VPTE_PS in the master page directory entry set via mcontrol(MADV_SETMAP).
1061  */
1062 static
1063 int
1064 vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
1065                     vpte_t vpte, int fault_type, int allow_nofault)
1066 {
1067         struct lwbuf *lwb;
1068         struct lwbuf lwb_cache;
1069         int vshift = VPTE_FRAME_END - PAGE_SHIFT; /* index bits remaining */
1070         int result = KERN_SUCCESS;
1071         vpte_t *ptep;
1072
1073         ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object));
1074         for (;;) {
1075                 /*
1076                  * We cannot proceed if the vpte is not valid, not readable
1077                  * for a read fault, or not writable for a write fault.
1078                  */
1079                 if ((vpte & VPTE_V) == 0) {
1080                         unlock_and_deallocate(fs);
1081                         return (KERN_FAILURE);
1082                 }
1083                 if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_RW) == 0) {
1084                         unlock_and_deallocate(fs);
1085                         return (KERN_FAILURE);
1086                 }
1087                 if ((vpte & VPTE_PS) || vshift == 0)
1088                         break;
1089                 KKASSERT(vshift >= VPTE_PAGE_BITS);
1090
1091                 /*
1092                  * Get the page table page.  Nominally we only read the page
1093                  * table, but since we are actively setting VPTE_M and VPTE_A,
1094                  * tell vm_fault_object() that we are writing it. 
1095                  *
1096                  * There is currently no real need to optimize this.
1097                  */
1098                 result = vm_fault_object(fs, (vpte & VPTE_FRAME) >> PAGE_SHIFT,
1099                                          VM_PROT_READ|VM_PROT_WRITE,
1100                                          allow_nofault);
1101                 if (result != KERN_SUCCESS)
1102                         return (result);
1103
1104                 /*
1105                  * Process the returned fs.m and look up the page table
1106                  * entry in the page table page.
1107                  */
1108                 vshift -= VPTE_PAGE_BITS;
1109                 lwb = lwbuf_alloc(fs->m, &lwb_cache);
1110                 ptep = ((vpte_t *)lwbuf_kva(lwb) +
1111                         ((*pindex >> vshift) & VPTE_PAGE_MASK));
1112                 vpte = *ptep;
1113
1114                 /*
1115                  * Page table write-back.  If the vpte is valid for the
1116                  * requested operation, do a write-back to the page table.
1117                  *
1118                  * XXX VPTE_M is not set properly for page directory pages.
1119                  * It doesn't get set in the page directory if the page table
1120                  * is modified during a read access.
1121                  */
1122                 vm_page_activate(fs->m);
1123                 if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) &&
1124                     (vpte & VPTE_RW)) {
1125                         if ((vpte & (VPTE_M|VPTE_A)) != (VPTE_M|VPTE_A)) {
1126                                 atomic_set_long(ptep, VPTE_M | VPTE_A);
1127                                 vm_page_dirty(fs->m);
1128                         }
1129                 }
1130                 if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V)) {
1131                         if ((vpte & VPTE_A) == 0) {
1132                                 atomic_set_long(ptep, VPTE_A);
1133                                 vm_page_dirty(fs->m);
1134                         }
1135                 }
1136                 lwbuf_free(lwb);
1137                 vm_page_flag_set(fs->m, PG_REFERENCED);
1138                 vm_page_wakeup(fs->m);
1139                 fs->m = NULL;
1140                 cleanup_successful_fault(fs);
1141         }
1142         /*
1143          * Combine remaining address bits with the vpte.
1144          */
1145         /* JG how many bits from each? */
1146         *pindex = ((vpte & VPTE_FRAME) >> PAGE_SHIFT) +
1147                   (*pindex & ((1L << vshift) - 1));
1148         return (KERN_SUCCESS);
1149 }
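
/*
 * Worked example (illustrative; the exact constants depend on the
 * platform): with 4K pages and 8-byte vptes, each page table page holds
 * 512 entries, so VPTE_PAGE_BITS is 9 and each level consumes 9 bits of
 * the virtual page index.  A VPTE_PS entry encountered with vshift index
 * bits still remaining maps the next (1L << vshift) pages linearly
 * starting at the frame it points to, which is how a master pde set via
 * mcontrol(MADV_SETMAP) implements a flat linear mapping.
 */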
1150
1151
1152 /*
1153  * This is the core of the vm_fault code.
1154  *
1155  * Do all operations required to fault-in (fs.first_object, pindex).  Run
1156  * through the shadow chain as necessary and do required COW or virtual
1157  * copy operations.  The caller has already fully resolved the vm_map_entry
1158  * and, if appropriate, has created a copy-on-write layer.  All we need to
1159  * do is iterate the object chain.
1160  *
1161  * On failure (fs) is unlocked and deallocated and the caller may return or
1162  * retry depending on the failure code.  On success (fs) is NOT unlocked or
1163  * deallocated, fs.m will contain a resolved, busied page, and fs.object
1164  * will have an additional PIP count if it is not equal to fs.first_object.
1165  *
1166  * If locks based on fs->first_shared or fs->shared are insufficient,
1167  * clear the appropriate field(s) and return RETRY.  COWs require that
1168  * clear the appropriate field(s) and return KERN_TRY_AGAIN.  COWs require
1169  * that first_shared be 0, while page allocations (or frees) require that
1170  *
1171  * fs->first_object must be held on call.
1172  */
1173 static
1174 int
1175 vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
1176                 vm_prot_t fault_type, int allow_nofault)
1177 {
1178         vm_object_t next_object;
1179         vm_pindex_t pindex;
1180         int error;
1181
1182         ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object));
1183         fs->prot = fs->first_prot;
1184         fs->object = fs->first_object;
1185         pindex = first_pindex;
1186
1187         vm_object_chain_acquire(fs->first_object, fs->shared);
1188         vm_object_pip_add(fs->first_object, 1);
1189
1190         /* 
1191          * If a read fault occurs we try to make the page writable if
1192          * possible.  There are three cases where we cannot make the
1193          * page mapping writable:
1194          *
1195          * (1) The mapping is read-only or the VM object is read-only,
1196          *     fs->prot above will simply not have VM_PROT_WRITE set.
1197          *
1198          * (2) If the mapping is a virtual page table we need to be able
1199          *     to detect writes so we can set VPTE_M in the virtual page
1200          *     table.
1201          *
1202          * (3) If the VM page is read-only or copy-on-write, upgrading would
1203          *     just result in an unnecessary COW fault.
1204          *
1205          * VM_PROT_VPAGED is set if faulting via a virtual page table and
1206          * causes adjustments to the 'M'odify bit to also turn off write
1207          * access to force a re-fault.
1208          */
1209         if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) {
1210                 if ((fault_type & VM_PROT_WRITE) == 0)
1211                         fs->prot &= ~VM_PROT_WRITE;
1212         }
1213
1214         if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
1215             pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
1216                 if ((fault_type & VM_PROT_WRITE) == 0)
1217                         fs->prot &= ~VM_PROT_WRITE;
1218         }
1219
1220         /* vm_object_hold(fs->object); implied b/c object == first_object */
1221
1222         for (;;) {
1223                 /*
1224                  * The entire backing chain from first_object to object
1225                  * inclusive is chainlocked.
1226                  *
1227                  * If the object is dead, we stop here
1228                  */
1229                 if (fs->object->flags & OBJ_DEAD) {
1230                         vm_object_pip_wakeup(fs->first_object);
1231                         vm_object_chain_release_all(fs->first_object,
1232                                                     fs->object);
1233                         if (fs->object != fs->first_object)
1234                                 vm_object_drop(fs->object);
1235                         unlock_and_deallocate(fs);
1236                         return (KERN_PROTECTION_FAILURE);
1237                 }
1238
1239                 /*
1240                  * See if the page is resident.  Wait/Retry if the page is
1241                  * busy (lots of stuff may have changed so we can't continue
1242                  * in that case).
1243                  *
1244                  * We can theoretically allow the soft-busy case on a read
1245                  * fault if the page is marked valid, but since such
1246                  * pages are typically already pmap'd, putting that
1247          * special case in might be more effort than it is
1248                  * worth.  We cannot under any circumstances mess
1249                  * around with a vm_page_t->busy page except, perhaps,
1250                  * to pmap it.
1251                  */
1252                 fs->m = vm_page_lookup_busy_try(fs->object, pindex,
1253                                                 TRUE, &error);
1254                 if (error) {
1255                         vm_object_pip_wakeup(fs->first_object);
1256                         vm_object_chain_release_all(fs->first_object,
1257                                                     fs->object);
1258                         if (fs->object != fs->first_object)
1259                                 vm_object_drop(fs->object);
1260                         unlock_things(fs);
1261                         vm_page_sleep_busy(fs->m, TRUE, "vmpfw");
1262                         mycpu->gd_cnt.v_intrans++;
1263                         /*vm_object_deallocate(fs->first_object);*/
1264                         /*fs->first_object = NULL;*/
1265                         fs->m = NULL;
1266                         return (KERN_TRY_AGAIN);
1267                 }
1268                 if (fs->m) {
1269                         /*
1270                          * The page is busied for us.
1271                          *
1272                          * If reactivating a page from PQ_CACHE we may have
1273                          * to rate-limit.
1274                          */
1275                         int queue = fs->m->queue;
1276                         vm_page_unqueue_nowakeup(fs->m);
1277
1278                         if ((queue - fs->m->pc) == PQ_CACHE && 
1279                             vm_page_count_severe()) {
1280                                 vm_page_activate(fs->m);
1281                                 vm_page_wakeup(fs->m);
1282                                 fs->m = NULL;
1283                                 vm_object_pip_wakeup(fs->first_object);
1284                                 vm_object_chain_release_all(fs->first_object,
1285                                                             fs->object);
1286                                 if (fs->object != fs->first_object)
1287                                         vm_object_drop(fs->object);
1288                                 unlock_and_deallocate(fs);
1289                                 if (allow_nofault == 0 ||
1290                                     (curthread->td_flags & TDF_NOFAULT) == 0) {
1291                                         vm_wait_pfault();
1292                                 }
1293                                 return (KERN_TRY_AGAIN);
1294                         }
1295
1296                         /*
1297                          * If it still isn't completely valid (readable),
1298                          * or if a read-ahead-mark is set on the VM page,
1299                          * jump to readrest, else we found the page and
1300                          * can return.
1301                          *
1302                          * We can release the spl once we have marked the
1303                          * page busy.
1304                          */
1305                         if (fs->m->object != &kernel_object) {
1306                                 if ((fs->m->valid & VM_PAGE_BITS_ALL) !=
1307                                     VM_PAGE_BITS_ALL) {
1308                                         goto readrest;
1309                                 }
1310                                 if (fs->m->flags & PG_RAM) {
1311                                         if (debug_cluster)
1312                                                 kprintf("R");
1313                                         vm_page_flag_clear(fs->m, PG_RAM);
1314                                         goto readrest;
1315                                 }
1316                         }
1317                         break; /* break to PAGE HAS BEEN FOUND */
1318                 }
1319
1320                 /*
1321                  * Page is not resident.  If this is the search termination
1322                  * point or the pager might contain the page, allocate a new page.
1323                  */
1324                 if (TRYPAGER(fs) || fs->object == fs->first_object) {
1325                         /*
1326                          * Allocating, must be exclusive.
1327                          */
1328                         if (fs->object == fs->first_object &&
1329                             fs->first_shared) {
1330                                 fs->first_shared = 0;
1331                                 vm_object_pip_wakeup(fs->first_object);
1332                                 vm_object_chain_release_all(fs->first_object,
1333                                                             fs->object);
1334                                 if (fs->object != fs->first_object)
1335                                         vm_object_drop(fs->object);
1336                                 unlock_and_deallocate(fs);
1337                                 return (KERN_TRY_AGAIN);
1338                         }
1339                         if (fs->object != fs->first_object &&
1340                             fs->shared) {
1341                                 fs->first_shared = 0;
1342                                 fs->shared = 0;
1343                                 vm_object_pip_wakeup(fs->first_object);
1344                                 vm_object_chain_release_all(fs->first_object,
1345                                                             fs->object);
1346                                 if (fs->object != fs->first_object)
1347                                         vm_object_drop(fs->object);
1348                                 unlock_and_deallocate(fs);
1349                                 return (KERN_TRY_AGAIN);
1350                         }
1351
1352                         /*
1353                          * If the page is beyond the object size we fail
1354                          */
1355                         if (pindex >= fs->object->size) {
1356                                 vm_object_pip_wakeup(fs->first_object);
1357                                 vm_object_chain_release_all(fs->first_object,
1358                                                             fs->object);
1359                                 if (fs->object != fs->first_object)
1360                                         vm_object_drop(fs->object);
1361                                 unlock_and_deallocate(fs);
1362                                 return (KERN_PROTECTION_FAILURE);
1363                         }
1364
1365                         /*
1366                          * Allocate a new page for this object/offset pair.
1367                          *
1368                          * It is possible for the allocation to race, so
1369                          * handle the case.
1370                          */
1371                         fs->m = NULL;
1372                         if (!vm_page_count_severe()) {
1373                                 fs->m = vm_page_alloc(fs->object, pindex,
1374                                     ((fs->vp || fs->object->backing_object) ?
1375                                         VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL :
1376                                         VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL |
1377                                         VM_ALLOC_USE_GD | VM_ALLOC_ZERO));
1378                         }
1379                         if (fs->m == NULL) {
1380                                 vm_object_pip_wakeup(fs->first_object);
1381                                 vm_object_chain_release_all(fs->first_object,
1382                                                             fs->object);
1383                                 if (fs->object != fs->first_object)
1384                                         vm_object_drop(fs->object);
1385                                 unlock_and_deallocate(fs);
1386                                 if (allow_nofault == 0 ||
1387                                     (curthread->td_flags & TDF_NOFAULT) == 0) {
1388                                         vm_wait_pfault();
1389                                 }
1390                                 return (KERN_TRY_AGAIN);
1391                         }
1392
1393                         /*
1394                          * Fall through to readrest.  We have a new page which
1395                          * will have to be paged (since m->valid will be 0).
1396                          */
1397                 }
1398
1399 readrest:
1400                 /*
1401                  * We have found an invalid or partially valid page, a
1402                  * page with a read-ahead mark which might be partially or
1403                  * fully valid (and maybe dirty too), or we have allocated
1404                  * a new page.
1405                  *
1406                  * Attempt to fault-in the page if there is a chance that the
1407                  * pager has it, and potentially fault in additional pages
1408                  * at the same time.
1409                  *
1410                  * If TRYPAGER is true then fs.m will be non-NULL and busied
1411                  * for us.
1412                  */
1413                 if (TRYPAGER(fs)) {
1414                         int rv;
1415                         int seqaccess;
1416                         u_char behavior = vm_map_entry_behavior(fs->entry);
1417
1418                         if (behavior == MAP_ENTRY_BEHAV_RANDOM)
1419                                 seqaccess = 0;
1420                         else
1421                                 seqaccess = -1;
1422
1423                         /*
1424                          * Doing I/O may synchronously insert additional
1425                          * pages so we can't be shared at this point either.
1426                          *
1427                          * NOTE: We can't free fs->m here in the allocated
1428                          *       case (fs->object != fs->first_object) as
1429                          *       this would require an exclusively locked
1430                          *       VM object.
1431                          */
1432                         if (fs->object == fs->first_object &&
1433                             fs->first_shared) {
1434                                 vm_page_deactivate(fs->m);
1435                                 vm_page_wakeup(fs->m);
1436                                 fs->m = NULL;
1437                                 fs->first_shared = 0;
1438                                 vm_object_pip_wakeup(fs->first_object);
1439                                 vm_object_chain_release_all(fs->first_object,
1440                                                             fs->object);
1441                                 if (fs->object != fs->first_object)
1442                                         vm_object_drop(fs->object);
1443                                 unlock_and_deallocate(fs);
1444                                 return (KERN_TRY_AGAIN);
1445                         }
1446                         if (fs->object != fs->first_object &&
1447                             fs->shared) {
1448                                 vm_page_deactivate(fs->m);
1449                                 vm_page_wakeup(fs->m);
1450                                 fs->m = NULL;
1451                                 fs->first_shared = 0;
1452                                 fs->shared = 0;
1453                                 vm_object_pip_wakeup(fs->first_object);
1454                                 vm_object_chain_release_all(fs->first_object,
1455                                                             fs->object);
1456                                 if (fs->object != fs->first_object)
1457                                         vm_object_drop(fs->object);
1458                                 unlock_and_deallocate(fs);
1459                                 return (KERN_TRY_AGAIN);
1460                         }
1461
1462                         /*
1463                          * Avoid deadlocking against the map when doing I/O.
1464                          * We still hold a ref on fs.object and the page is PG_BUSY'd.
1465                          *
1466                          * NOTE: Once unlocked, fs->entry can become stale
1467                          *       so this will NULL it out.
1468                          *
1469                          * NOTE: fs->entry is invalid until we relock the
1470                          *       map and verify that the timestamp has not
1471                          *       changed.
1472                          */
1473                         unlock_map(fs);
1474
1475                         /*
1476                          * Acquire the page data.  We still hold a ref on
1477                          * fs.object and the page has been PG_BUSY'd.
1478                          *
1479                          * The pager may replace the page (for example, in
1480                          * order to enter a fictitious page into the
1481                          * object).  If it does so it is responsible for
1482                          * cleaning up the passed page and properly setting
1483                          * the new page PG_BUSY.
1484                          *
1485                          * If we got here through a PG_RAM read-ahead
1486                          * mark, the page may be partially dirty and thus
1487                          * not freeable.  Don't bother checking to see
1488                          * if the pager has the page because we can't free
1489                          * it anyway.  We have to depend on the get_page
1490                          * operation filling in any gaps whether there is
1491                          * backing store or not.
1492                          */
1493                         rv = vm_pager_get_page(fs->object, &fs->m, seqaccess);
1494
1495                         if (rv == VM_PAGER_OK) {
1496                                 /*
1497                                  * Relookup in case pager changed page. Pager
1498                                  * is responsible for disposition of old page
1499                                  * if moved.
1500                                  *
1501                                  * XXX other code segments do relookups too.
1502                                  * It's a bad abstraction that needs to be
1503                                  * fixed/removed.
1504                                  */
1505                                 fs->m = vm_page_lookup(fs->object, pindex);
1506                                 if (fs->m == NULL) {
1507                                         vm_object_pip_wakeup(fs->first_object);
1508                                         vm_object_chain_release_all(
1509                                                 fs->first_object, fs->object);
1510                                         if (fs->object != fs->first_object)
1511                                                 vm_object_drop(fs->object);
1512                                         unlock_and_deallocate(fs);
1513                                         return (KERN_TRY_AGAIN);
1514                                 }
1515                                 ++fs->hardfault;
1516                                 break; /* break to PAGE HAS BEEN FOUND */
1517                         }
1518
1519                         /*
1520                          * Remove the bogus page (which does not exist at this
1521                          * object/offset); before doing so, we must get back
1522                          * our object lock to preserve our invariant.
1523                          *
1524                          * Also wake up any other process that may want to bring
1525                          * in this page.
1526                          *
1527                          * If this is the top-level object, we must leave the
1528                          * busy page to prevent another process from rushing
1529                          * past us, and inserting the page in that object at
1530                          * the same time that we are.
1531                          */
1532                         if (rv == VM_PAGER_ERROR) {
1533                                 if (curproc) {
1534                                         kprintf("vm_fault: pager read error, "
1535                                                 "pid %d (%s)\n",
1536                                                 curproc->p_pid,
1537                                                 curproc->p_comm);
1538                                 } else {
1539                                         kprintf("vm_fault: pager read error, "
1540                                                 "thread %p (%s)\n",
1541                                                 curthread,
1542                                                 curthread->td_comm);
1543                                 }
1544                         }
1545
1546                         /*
1547                          * Data outside the range of the pager or an I/O error
1548                          *
1549                          * The page may have been wired during the pagein,
1550                          * e.g. by the buffer cache, and cannot simply be
1551                          * freed.  Call vnode_pager_freepage() to deal with it.
1552                          *
1553                          * Also note that we cannot free the page if we are
1554                          * holding the related object shared. XXX not sure
1555                          * what to do in that case.
1556                          */
1557                         if (fs->object != fs->first_object) {
1558                                 vnode_pager_freepage(fs->m);
1559                                 fs->m = NULL;
1560                                 /*
1561                                  * XXX - we cannot just fall out at this
1562                                  * point, m has been freed and is invalid!
1563                                  */
1564                         }
1565                         /*
1566                          * XXX - the check for kernel_map is a kludge to work
1567                          * around having the machine panic on a kernel space
1568                          * fault w/ I/O error.
1569                          */
1570                         if (((fs->map != &kernel_map) &&
1571                             (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) {
1572                                 if (fs->m) {
1573                                         if (fs->first_shared) {
1574                                                 vm_page_deactivate(fs->m);
1575                                                 vm_page_wakeup(fs->m);
1576                                         } else {
1577                                                 vnode_pager_freepage(fs->m);
1578                                         }
1579                                         fs->m = NULL;
1580                                 }
1581                                 vm_object_pip_wakeup(fs->first_object);
1582                                 vm_object_chain_release_all(fs->first_object,
1583                                                             fs->object);
1584                                 if (fs->object != fs->first_object)
1585                                         vm_object_drop(fs->object);
1586                                 unlock_and_deallocate(fs);
1587                                 if (rv == VM_PAGER_ERROR)
1588                                         return (KERN_FAILURE);
1589                                 else
1590                                         return (KERN_PROTECTION_FAILURE);
1591                                 /* NOT REACHED */
1592                         }
1593                 }
1594
1595                 /*
1596                  * We get here if the object has a default pager (or unwiring) 
1597                  * or the pager doesn't have the page.
1598                  *
1599                  * fs->first_m will be used for the COW unless we find a
1600                  * deeper page to be mapped read-only, in which case the
1601                  * unlock*(fs) will free first_m.
1602                  */
1603                 if (fs->object == fs->first_object)
1604                         fs->first_m = fs->m;
1605
1606                 /*
1607                  * Move on to the next object.  The chain lock should prevent
1608                  * the backing_object from getting ripped out from under us.
1609                  *
1610                  * The object lock for the next object is governed by
1611                  * fs->shared.
1612                  */
1613                 if ((next_object = fs->object->backing_object) != NULL) {
1614                         if (fs->shared)
1615                                 vm_object_hold_shared(next_object);
1616                         else
1617                                 vm_object_hold(next_object);
1618                         vm_object_chain_acquire(next_object, fs->shared);
1619                         KKASSERT(next_object == fs->object->backing_object);
1620                         pindex += OFF_TO_IDX(fs->object->backing_object_offset);
1621                 }
1622
1623                 if (next_object == NULL) {
1624                         /*
1625                          * If there's no object left, fill the page in the top
1626                          * object with zeros.
1627                          */
1628                         if (fs->object != fs->first_object) {
1629 #if 0
1630                                 if (fs->first_object->backing_object !=
1631                                     fs->object) {
1632                                         vm_object_hold(fs->first_object->backing_object);
1633                                 }
1634 #endif
1635                                 vm_object_chain_release_all(
1636                                         fs->first_object->backing_object,
1637                                         fs->object);
1638 #if 0
1639                                 if (fs->first_object->backing_object !=
1640                                     fs->object) {
1641                                         vm_object_drop(fs->first_object->backing_object);
1642                                 }
1643 #endif
1644                                 vm_object_pip_wakeup(fs->object);
1645                                 vm_object_drop(fs->object);
1646                                 fs->object = fs->first_object;
1647                                 pindex = first_pindex;
1648                                 fs->m = fs->first_m;
1649                         }
1650                         fs->first_m = NULL;
1651
1652                         /*
1653                          * Zero the page if necessary and mark it valid.
1654                          */
1655                         if ((fs->m->flags & PG_ZERO) == 0) {
1656                                 vm_page_zero_fill(fs->m);
1657                         } else {
1658 #ifdef PMAP_DEBUG
1659                                 pmap_page_assertzero(VM_PAGE_TO_PHYS(fs->m));
1660 #endif
1661                                 vm_page_flag_clear(fs->m, PG_ZERO);
1662                                 mycpu->gd_cnt.v_ozfod++;
1663                         }
1664                         mycpu->gd_cnt.v_zfod++;
1665                         fs->m->valid = VM_PAGE_BITS_ALL;
1666                         break;  /* break to PAGE HAS BEEN FOUND */
1667                 }
1668                 if (fs->object != fs->first_object) {
1669                         vm_object_pip_wakeup(fs->object);
1670                         vm_object_lock_swap();
1671                         vm_object_drop(fs->object);
1672                 }
1673                 KASSERT(fs->object != next_object,
1674                         ("object loop %p", next_object));
1675                 fs->object = next_object;
1676                 vm_object_pip_add(fs->object, 1);
1677         }
1678
1679         /*
1680          * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
1681          * is held.]
1682          *
1683          * The object is still held.
1684          *
1685          * The local shared state may differ from fs->shared.
1686          *
1687          * If the page is being written, but isn't already owned by the
1688          * top-level object, we have to copy it into a new page owned by the
1689          * top-level object.
1690          */
1691         KASSERT((fs->m->flags & PG_BUSY) != 0,
1692                 ("vm_fault: not busy after main loop"));
1693
1694         if (fs->object != fs->first_object) {
1695                 /*
1696                  * We only really need to copy if we want to write it.
1697                  */
1698                 if (fault_type & VM_PROT_WRITE) {
1699                         /*
1700                          * This allows pages to be virtually copied from a 
1701                          * backing_object into the first_object, where the 
1702                          * backing object has no other refs to it, and cannot
1703                          * gain any more refs.  Instead of a bcopy, we just 
1704                          * move the page from the backing object to the 
1705                          * first object.  Note that we must mark the page 
1706                          * dirty in the first object so that it will go out 
1707                          * to swap when needed.
1708                          */
1709                         if (
1710                                 /*
1711                                  * Must be holding exclusive locks
1712                                  */
1713                                 fs->first_shared == 0 &&
1714                                 fs->shared == 0 &&
1715                                 /*
1716                                  * Map, if present, has not changed
1717                                  */
1718                                 (fs->map == NULL ||
1719                                 fs->map_generation == fs->map->timestamp) &&
1720                                 /*
1721                                  * Only one shadow object
1722                                  */
1723                                 (fs->object->shadow_count == 1) &&
1724                                 /*
1725                                  * No COW refs, except us
1726                                  */
1727                                 (fs->object->ref_count == 1) &&
1728                                 /*
1729                                  * No one else can look this object up
1730                                  */
1731                                 (fs->object->handle == NULL) &&
1732                                 /*
1733                                  * No other ways to look the object up
1734                                  */
1735                                 ((fs->object->type == OBJT_DEFAULT) ||
1736                                  (fs->object->type == OBJT_SWAP)) &&
1737                                 /*
1738                                  * We don't chase down the shadow chain
1739                                  */
1740                                 (fs->object == fs->first_object->backing_object) &&
1741
1742                                 /*
1743                                  * grab the lock if we need to
1744                                  */
1745                                 (fs->lookup_still_valid ||
1746                                  fs->map == NULL ||
1747                                  lockmgr(&fs->map->lock, LK_EXCLUSIVE|LK_NOWAIT) == 0)
1748                             ) {
1749                                 /*
1750                                  * (first_m) and (m) are both busied.  We have
1751                                  * to move (m) into (first_m)'s object/pindex
1752                                  * in an atomic fashion, then free (first_m).
1753                                  *
1754                                  * first_object is held so the remove of (first_m)
1755                                  * followed by the rename of (m) should wind
1756                                  * up being atomic.  vm_page_free() might
1757                                  * block so we don't do it until after the
1758                                  * rename.
1759                                  */
1760                                 fs->lookup_still_valid = 1;
1761                                 vm_page_protect(fs->first_m, VM_PROT_NONE);
1762                                 vm_page_remove(fs->first_m);
1763                                 vm_page_rename(fs->m, fs->first_object,
1764                                                first_pindex);
1765                                 vm_page_free(fs->first_m);
1766                                 fs->first_m = fs->m;
1767                                 fs->m = NULL;
1768                                 mycpu->gd_cnt.v_cow_optim++;
1769                         } else {
1770                                 /*
1771                                  * Oh, well, let's copy it.
1772                                  *
1773                                  * Why are we unmapping the original page
1774                                  * here?  Well, in short, not all accessors
1775                                  * of user memory go through the pmap.  The
1776                                  * procfs code doesn't access user memory
1777                                  * via a local pmap, so vm_fault_page*()
1778                                  * can't call pmap_enter().  And the umtx*()
1779                                  * code may modify the COW'd page via a DMAP
1780                                  * or kernel mapping and not via the pmap,
1781                                  * leaving the original page still mapped
1782                                  * read-only into the pmap.
1783                                  *
1784                                  * So we have to remove the page from at
1785                                  * least the current pmap if it is in it.
1786                                  * Just remove it from all pmaps.
1787                                  */
1788                                 KKASSERT(fs->first_shared == 0);
1789                                 vm_page_copy(fs->m, fs->first_m);
1790                                 vm_page_protect(fs->m, VM_PROT_NONE);
1791                                 vm_page_event(fs->m, VMEVENT_COW);
1792                         }
1793
1794                         /*
1795                          * We no longer need the old page or object.
1796                          */
1797                         if (fs->m)
1798                                 release_page(fs);
1799
1800                         /*
1801                          * We intend to revert to first_object, undo the
1802                          * chain lock through to that.
1803                          */
1804 #if 0
1805                         if (fs->first_object->backing_object != fs->object)
1806                                 vm_object_hold(fs->first_object->backing_object);
1807 #endif
1808                         vm_object_chain_release_all(
1809                                         fs->first_object->backing_object,
1810                                         fs->object);
1811 #if 0
1812                         if (fs->first_object->backing_object != fs->object)
1813                                 vm_object_drop(fs->first_object->backing_object);
1814 #endif
1815
1816                         /*
1817                          * fs->object != fs->first_object due to above 
1818                          * conditional
1819                          */
1820                         vm_object_pip_wakeup(fs->object);
1821                         vm_object_drop(fs->object);
1822
1823                         /*
1824                          * Only use the new page below...
1825                          */
1826                         mycpu->gd_cnt.v_cow_faults++;
1827                         fs->m = fs->first_m;
1828                         fs->object = fs->first_object;
1829                         pindex = first_pindex;
1830                 } else {
1831                         /*
1832                          * If it wasn't a write fault avoid having to copy
1833                          * the page by mapping it read-only.
1834                          */
1835                         fs->prot &= ~VM_PROT_WRITE;
1836                 }
1837         }
1838
1839         /*
1840          * Relock the map if necessary, then check the generation count.
1841          * relock_map() will update fs->timestamp to account for the
1842          * relocking if necessary.
1843          *
1844          * If the count has changed after relocking then all sorts of
1845          * crap may have happened and we have to retry.
1846          *
1847          * NOTE: The relock_map() can fail due to a deadlock against
1848          *       the vm_page we are holding BUSY.
1849          */
1850         if (fs->lookup_still_valid == FALSE && fs->map) {
1851                 if (relock_map(fs) ||
1852                     fs->map->timestamp != fs->map_generation) {
1853                         release_page(fs);
1854                         vm_object_pip_wakeup(fs->first_object);
1855                         vm_object_chain_release_all(fs->first_object,
1856                                                     fs->object);
1857                         if (fs->object != fs->first_object)
1858                                 vm_object_drop(fs->object);
1859                         unlock_and_deallocate(fs);
1860                         return (KERN_TRY_AGAIN);
1861                 }
1862         }
1863
1864         /*
1865          * If the fault is a write, we know that this page is being
1866          * written NOW so dirty it explicitly to save on pmap_is_modified()
1867          * calls later.
1868          *
1869          * If this is a NOSYNC mmap we do not want to set PG_NOSYNC
1870          * if the page is already dirty to prevent data written with
1871          * the expectation of being synced from not being synced.
1872          * Likewise if this entry does not request NOSYNC then make
1873          * sure the page isn't marked NOSYNC.  Applications sharing
1874          * data should use the same flags to avoid ping-ponging.
1875          *
1876          * Also tell the backing pager, if any, that it should remove
1877          * any swap backing since the page is now dirty.
1878          */
1879         vm_page_activate(fs->m);
1880         if (fs->prot & VM_PROT_WRITE) {
1881                 vm_object_set_writeable_dirty(fs->m->object);
1882                 vm_set_nosync(fs->m, fs->entry);
1883                 if (fs->fault_flags & VM_FAULT_DIRTY) {
1884                         vm_page_dirty(fs->m);
1885                         swap_pager_unswapped(fs->m);
1886                 }
1887         }
1888
1889         vm_object_pip_wakeup(fs->first_object);
1890         vm_object_chain_release_all(fs->first_object, fs->object);
1891         if (fs->object != fs->first_object)
1892                 vm_object_drop(fs->object);
1893
1894         /*
1895          * Page had better still be busy.  We are still locked up and 
1896          * fs->object will have another PIP reference if it is not equal
1897          * to fs->first_object.
1898          */
1899         KASSERT(fs->m->flags & PG_BUSY,
1900                 ("vm_fault: page %p not busy!", fs->m));
1901
1902         /*
1903          * Sanity check: page must be completely valid or it is not fit to
1904          * map into user space.  vm_pager_get_page() ensures this.
1905          */
1906         if (fs->m->valid != VM_PAGE_BITS_ALL) {
1907                 vm_page_zero_invalid(fs->m, TRUE);
1908                 kprintf("Warning: page %p partially invalid on fault\n", fs->m);
1909         }
1910         vm_page_flag_clear(fs->m, PG_ZERO);
1911
1912         return (KERN_SUCCESS);
1913 }
1914
1915 /*
1916  * Hold each of the physical pages that are mapped by the specified range of
1917  * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
1918  * and allow the specified types of access, "prot".  If all of the implied
1919  * pages are successfully held, then the number of held pages is returned
1920  * together with pointers to those pages in the array "ma".  However, if any
1921  * of the pages cannot be held, -1 is returned.
1922  */
1923 int
1924 vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
1925     vm_prot_t prot, vm_page_t *ma, int max_count)
1926 {
1927         vm_offset_t start, end;
1928         int i, npages, error;
1929
1930         start = trunc_page(addr);
1931         end = round_page(addr + len);
1932
1933         npages = howmany(end - start, PAGE_SIZE);
1934
1935         if (npages > max_count)
1936                 return -1;
1937
1938         for (i = 0; i < npages; i++) {
1939                 /* XXX error handling */
1940                 ma[i] = vm_fault_page_quick(start + (i * PAGE_SIZE),
1941                         prot,
1942                         &error);
1943         }
1944
1945         return npages;
1946 }
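
/*
 * Example usage (a hedged, non-compiled sketch): hold the pages backing a
 * small user buffer, operate on them, and then drop the holds with
 * vm_page_unhold().  The wrapper name and the 16-page buffer below are
 * hypothetical; only vm_fault_quick_hold_pages(), vm_page_unhold() and the
 * current process vmspace map come from existing interfaces.  Note that,
 * per the XXX above, a failed per-page fault is not currently converted
 * into a -1 return, so a cautious caller also checks for NULL entries.
 */
#if 0
static int
example_hold_user_buffer(vm_offset_t uaddr, vm_size_t len)
{
        vm_page_t ma[16];       /* enough for a small buffer */
        vm_map_t map = &curproc->p_vmspace->vm_map;
        int npages;
        int i;

        npages = vm_fault_quick_hold_pages(map, uaddr, len, VM_PROT_READ,
                                           ma, 16);
        if (npages < 0)
                return (EFAULT);

        /* ... access the held pages here ... */

        for (i = 0; i < npages; ++i) {
                if (ma[i])
                        vm_page_unhold(ma[i]);
        }
        return (0);
}
#endif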
1947
1948 /*
1949  * Wire down a range of virtual addresses in a map.  The entry in question
1950  * should be marked in-transition and the map must be locked.  We must
1951  * release the map temporarily while faulting-in the page to avoid a
1952  * deadlock.  Note that the entry may be clipped while we are blocked but
1953  * will never be freed.
1954  *
1955  * No requirements.
1956  */
1957 int
1958 vm_fault_wire(vm_map_t map, vm_map_entry_t entry,
1959               boolean_t user_wire, int kmflags)
1960 {
1961         boolean_t fictitious;
1962         vm_offset_t start;
1963         vm_offset_t end;
1964         vm_offset_t va;
1965         vm_paddr_t pa;
1966         vm_page_t m;
1967         pmap_t pmap;
1968         int rv;
1969         int wire_prot;
1970         int fault_flags;
1971
1972         lwkt_gettoken(&map->token);
1973
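        /*
         * User wiring (the mlock()-style path) only requires the pages to
         * be faulted in readable; kernel wiring pre-faults them read/write
         * via the plain change-wiring fault path.
         */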
1974         if (user_wire) {
1975                 wire_prot = VM_PROT_READ;
1976                 fault_flags = VM_FAULT_USER_WIRE;
1977         } else {
1978                 wire_prot = VM_PROT_READ | VM_PROT_WRITE;
1979                 fault_flags = VM_FAULT_CHANGE_WIRING;
1980         }
1981         if (kmflags & KM_NOTLBSYNC)
1982                 wire_prot |= VM_PROT_NOSYNC;
1983
1984         pmap = vm_map_pmap(map);
1985         start = entry->start;
1986         end = entry->end;
1987         fictitious = entry->object.vm_object &&
1988                         ((entry->object.vm_object->type == OBJT_DEVICE) ||
1989                          (entry->object.vm_object->type == OBJT_MGTDEVICE));
1990         if (entry->eflags & MAP_ENTRY_KSTACK)
1991                 start += PAGE_SIZE;
1992         map->timestamp++;
1993         vm_map_unlock(map);
1994
1995         /*
1996          * We simulate a fault to get the page and enter it in the physical
1997          * map.
1998          */
1999         for (va = start; va < end; va += PAGE_SIZE) {
2000                 rv = vm_fault(map, va, wire_prot, fault_flags);
2001                 if (rv) {
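                        /*
                         * The fault failed.  Walk back over the portion of
                         * the range that was already wired and undo the
                         * wiring before returning the error.
                         */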
2002                         while (va > start) {
2003                                 va -= PAGE_SIZE;
2004                                 if ((pa = pmap_extract(pmap, va)) == 0)
2005                                         continue;
2006                                 pmap_change_wiring(pmap, va, FALSE, entry);
2007                                 if (!fictitious) {
2008                                         m = PHYS_TO_VM_PAGE(pa);
2009                                         vm_page_busy_wait(m, FALSE, "vmwrpg");
2010                                         vm_page_unwire(m, 1);
2011                                         vm_page_wakeup(m);
2012                                 }
2013                         }
2014                         goto done;
2015                 }
2016         }
2017         rv = KERN_SUCCESS;
2018 done:
2019         vm_map_lock(map);
2020         lwkt_reltoken(&map->token);
2021         return (rv);
2022 }
2023
2024 /*
2025  * Unwire a range of virtual addresses in a map.  The map should be
2026  * locked.
2027  */
2028 void
2029 vm_fault_unwire(vm_map_t map, vm_map_entry_t entry)
2030 {
2031         boolean_t fictitious;
2032         vm_offset_t start;
2033         vm_offset_t end;
2034         vm_offset_t va;
2035         vm_paddr_t pa;
2036         vm_page_t m;
2037         pmap_t pmap;
2038
2039         lwkt_gettoken(&map->token);
2040
2041         pmap = vm_map_pmap(map);
2042         start = entry->start;
2043         end = entry->end;
2044         fictitious = entry->object.vm_object &&
2045                         ((entry->object.vm_object->type == OBJT_DEVICE) ||
2046                          (entry->object.vm_object->type == OBJT_MGTDEVICE));
2047         if (entry->eflags & MAP_ENTRY_KSTACK)
2048                 start += PAGE_SIZE;
2049
2050         /*
2051          * Since the pages are wired down, we must be able to get their
2052          * mappings from the physical map system.
2053          */
2054         for (va = start; va < end; va += PAGE_SIZE) {
2055                 pa = pmap_extract(pmap, va);
2056                 if (pa != 0) {
2057                         pmap_change_wiring(pmap, va, FALSE, entry);
2058                         if (!fictitious) {
2059                                 m = PHYS_TO_VM_PAGE(pa);
2060                                 vm_page_busy_wait(m, FALSE, "vmwupg");
2061                                 vm_page_unwire(m, 1);
2062                                 vm_page_wakeup(m);
2063                         }
2064                 }
2065         }
2066         lwkt_reltoken(&map->token);
2067 }
2068
2069 /*
2070  * Copy all of the pages from a wired-down map entry to another.
2071  *
2072  * The source and destination maps must be locked for write.
2073  * The source and destination map tokens must be held.
2074  * The source map entry must be wired down (or be a sharing map
2075  * entry corresponding to a main map entry that is wired down).
2076  *
2077  * No other requirements.
2078  *
2079  * XXX do segment optimization
2080  */
2081 void
2082 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
2083                     vm_map_entry_t dst_entry, vm_map_entry_t src_entry)
2084 {
2085         vm_object_t dst_object;
2086         vm_object_t src_object;
2087         vm_ooffset_t dst_offset;
2088         vm_ooffset_t src_offset;
2089         vm_prot_t prot;
2090         vm_offset_t vaddr;
2091         vm_page_t dst_m;
2092         vm_page_t src_m;
2093
2094         src_object = src_entry->object.vm_object;
2095         src_offset = src_entry->offset;
2096
2097         /*
2098          * Create the top-level object for the destination entry. (Doesn't
2099          * actually shadow anything - we copy the pages directly.)
2100          */
2101         vm_map_entry_allocate_object(dst_entry);
2102         dst_object = dst_entry->object.vm_object;
2103
2104         prot = dst_entry->max_protection;
2105
2106         /*
2107          * Loop through all of the pages in the entry's range, copying each
2108          * one from the source object (it should be there) to the destination
2109          * object.
2110          */
2111         vm_object_hold(src_object);
2112         vm_object_hold(dst_object);
2113         for (vaddr = dst_entry->start, dst_offset = 0;
2114             vaddr < dst_entry->end;
2115             vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) {
2116
2117                 /*
2118                  * Allocate a page in the destination object
2119                  */
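                /*
                 * Retry until a page is available; vm_wait() sleeps until
                 * the system frees up memory.
                 */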
2120                 do {
2121                         dst_m = vm_page_alloc(dst_object,
2122                                               OFF_TO_IDX(dst_offset),
2123                                               VM_ALLOC_NORMAL);
2124                         if (dst_m == NULL) {
2125                                 vm_wait(0);
2126                         }
2127                 } while (dst_m == NULL);
2128
2129                 /*
2130                  * Find the page in the source object, and copy it in.
2131                  * (Because the source is wired down, the page will be in
2132                  * memory.)
2133                  */
2134                 src_m = vm_page_lookup(src_object,
2135                                        OFF_TO_IDX(dst_offset + src_offset));
2136                 if (src_m == NULL)
2137                         panic("vm_fault_copy_entry: page missing");
2138
2139                 vm_page_copy(src_m, dst_m);
2140                 vm_page_event(src_m, VMEVENT_COW);
2141
2142                 /*
2143                  * Enter it in the pmap...
2144                  */
2145
2146                 vm_page_flag_clear(dst_m, PG_ZERO);
2147                 pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE, dst_entry);
2148
2149                 /*
2150                  * Mark it no longer busy, and put it on the active list.
2151                  */
2152                 vm_page_activate(dst_m);
2153                 vm_page_wakeup(dst_m);
2154         }
2155         vm_object_drop(dst_object);
2156         vm_object_drop(src_object);
2157 }
2158
2159 #if 0
2160
2161 /*
2162  * This routine checks around the requested page for other pages that
2163  * might be able to be faulted in.  This routine brackets the viable
2164  * pages for the pages to be paged in.
2165  *
2166  * Inputs:
2167  *      m, rbehind, rahead
2168  *
2169  * Outputs:
2170  *  marray (array of vm_page_t), reqpage (index of requested page)
2171  *
2172  * Return value:
2173  *  number of pages in marray
2174  */
2175 static int
2176 vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead,
2177                           vm_page_t *marray, int *reqpage)
2178 {
2179         int i,j;
2180         vm_object_t object;
2181         vm_pindex_t pindex, startpindex, endpindex, tpindex;
2182         vm_page_t rtm;
2183         int cbehind, cahead;
2184
2185         object = m->object;
2186         pindex = m->pindex;
2187
2188         /*
2189          * we don't fault-ahead for device pager
2190          */
2191         if ((object->type == OBJT_DEVICE) ||
2192             (object->type == OBJT_MGTDEVICE)) {
2193                 *reqpage = 0;
2194                 marray[0] = m;
2195                 return 1;
2196         }
2197
2198         /*
2199          * if the requested page is not available, then give up now
2200          */
2201         if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
2202                 *reqpage = 0;   /* not used by caller, fix compiler warn */
2203                 return 0;
2204         }
2205
2206         if ((cbehind == 0) && (cahead == 0)) {
2207                 *reqpage = 0;
2208                 marray[0] = m;
2209                 return 1;
2210         }
2211
2212         if (rahead > cahead) {
2213                 rahead = cahead;
2214         }
2215
2216         if (rbehind > cbehind) {
2217                 rbehind = cbehind;
2218         }
2219
2220         /*
2221          * Do not do any readahead if we have insufficient free memory.
2222          *
2223          * XXX this code was broken/disabled before and has instability
2224          * with this conditional fixed, so shortcut for now.
2225          */
2226         if (burst_fault == 0 || vm_page_count_severe()) {
2227                 marray[0] = m;
2228                 *reqpage = 0;
2229                 return 1;
2230         }
2231
2232         /*
2233          * scan backward for the read behind pages -- in memory 
2234          *
2235          * Assume that if the page is not found an interrupt will not
2236          * create it.  Theoretically interrupts can only remove (busy)
2237          * pages, not create new associations.
2238          */
2239         if (pindex > 0) {
2240                 if (rbehind > pindex) {
2241                         rbehind = pindex;
2242                         startpindex = 0;
2243                 } else {
2244                         startpindex = pindex - rbehind;
2245                 }
2246
2247                 vm_object_hold(object);
2248                 for (tpindex = pindex; tpindex > startpindex; --tpindex) {
2249                         if (vm_page_lookup(object, tpindex - 1))
2250                                 break;
2251                 }
2252
2253                 i = 0;
2254                 while (tpindex < pindex) {
2255                         rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM |
2256                                                              VM_ALLOC_NULL_OK);
2257                         if (rtm == NULL) {
2258                                 for (j = 0; j < i; j++) {
2259                                         vm_page_free(marray[j]);
2260                                 }
2261                                 vm_object_drop(object);
2262                                 marray[0] = m;
2263                                 *reqpage = 0;
2264                                 return 1;
2265                         }
2266                         marray[i] = rtm;
2267                         ++i;
2268                         ++tpindex;
2269                 }
2270                 vm_object_drop(object);
2271         } else {
2272                 i = 0;
2273         }
2274
2275         /*
2276          * Assign requested page
2277          */
2278         marray[i] = m;
2279         *reqpage = i;
2280         ++i;
2281
2282         /*
2283          * Scan forwards for read-ahead pages
2284          */
2285         tpindex = pindex + 1;
2286         endpindex = tpindex + rahead;
2287         if (endpindex > object->size)
2288                 endpindex = object->size;
2289
2290         vm_object_hold(object);
2291         while (tpindex < endpindex) {
2292                 if (vm_page_lookup(object, tpindex))
2293                         break;
2294                 rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM |
2295                                                      VM_ALLOC_NULL_OK);
2296                 if (rtm == NULL)
2297                         break;
2298                 marray[i] = rtm;
2299                 ++i;
2300                 ++tpindex;
2301         }
2302         vm_object_drop(object);
2303
2304         return (i);
2305 }
2306
2307 #endif
2308
2309 /*
2310  * vm_prefault() provides a quick way of clustering pagefaults into a
2311  * processes address space.  It is a "cousin" of pmap_object_init_pt,
2312  * except it runs at page fault time instead of mmap time.
2313  *
2314  * vm.fast_fault        Enables pre-faulting zero-fill pages
2315  *
2316  * vm.prefault_pages    Number of pages (1/2 negative, 1/2 positive) to
2317  *                      prefault.  Scan stops in either direction when
2318  *                      a page is found to already exist.
2319  *
2320  * This code used to be per-platform pmap_prefault().  It is now
2321  * machine-independent and enhanced to also pre-fault zero-fill pages
2322  * (see vm.fast_fault) as well as make them writable, which greatly
2323  * reduces the number of page faults programs incur.
2324  *
2325  * Application performance when pre-faulting zero-fill pages is heavily
2326  * dependent on the application.  Very tiny applications like /bin/echo
2327  * lose a little performance while applications of any appreciable size
2328  * gain performance.  Prefaulting multiple pages also reduces SMP
2329  * congestion and can improve SMP performance significantly.
2330  *
2331  * NOTE!  prot may allow writing but this only applies to the top level
2332  *        object.  If we wind up mapping a page extracted from a backing
2333  *        object we have to make sure it is read-only.
2334  *
2335  * NOTE!  The caller has already handled any COW operations on the
2336  *        vm_map_entry via the normal fault code.  Do NOT call this
2337  *        shortcut unless the normal fault code has run on this entry.
2338  *
2339  * The related map must be locked.
2340  * No other requirements.
2341  */
2342 static int vm_prefault_pages = 8;
2343 SYSCTL_INT(_vm, OID_AUTO, prefault_pages, CTLFLAG_RW, &vm_prefault_pages, 0,
2344            "Maximum number of pages to pre-fault");
2345 static int vm_fast_fault = 1;
2346 SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0,
2347            "Burst fault zero-fill regions");
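
/*
 * Both knobs are CTLFLAG_RW, so they can typically be adjusted at runtime
 * with sysctl(8), e.g. "sysctl vm.prefault_pages=16", or persisted in
 * /etc/sysctl.conf.
 */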
2348
2349 /*
2350  * Set PG_NOSYNC if the map entry indicates so, but only if the page
2351  * is not already dirty by other means.  This will prevent passive
2352  * filesystem syncing as well as 'sync' from writing out the page.
2353  */
2354 static void
2355 vm_set_nosync(vm_page_t m, vm_map_entry_t entry)
2356 {
2357         if (entry->eflags & MAP_ENTRY_NOSYNC) {
2358                 if (m->dirty == 0)
2359                         vm_page_flag_set(m, PG_NOSYNC);
2360         } else {
2361                 vm_page_flag_clear(m, PG_NOSYNC);
2362         }
2363 }
2364
2365 static void
2366 vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
2367             int fault_flags)
2368 {
2369         struct lwp *lp;
2370         vm_page_t m;
2371         vm_offset_t addr;
2372         vm_pindex_t index;
2373         vm_pindex_t pindex;
2374         vm_object_t object;
2375         int pprot;
2376         int i;
2377         int noneg;
2378         int nopos;
2379         int maxpages;
2380
2381         /*
2382          * Get stable max count value, disabled if set to 0
2383          */
2384         maxpages = vm_prefault_pages;
2385         cpu_ccfence();
2386         if (maxpages <= 0)
2387                 return;
2388
2389         /*
2390          * We do not currently prefault mappings that use virtual page
2391          * tables.  We do not prefault foreign pmaps.
2392          */
2393         if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
2394                 return;
2395         lp = curthread->td_lwp;
2396         if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace)))
2397                 return;
2398
2399         /*
2400          * Limit pre-fault count to 1024 pages.
2401          */
2402         if (maxpages > 1024)
2403                 maxpages = 1024;
2404
2405         object = entry->object.vm_object;
2406         KKASSERT(object != NULL);
2407         KKASSERT(object == entry->object.vm_object);
2408         vm_object_hold(object);
2409         vm_object_chain_acquire(object, 0);
2410
2411         noneg = 0;
2412         nopos = 0;
2413         for (i = 0; i < maxpages; ++i) {
2414                 vm_object_t lobject;
2415                 vm_object_t nobject;
2416                 int allocated = 0;
2417                 int error;
2418
2419                 /*
2420                  * This can eat a lot of time on a heavily contended
2421                  * machine so yield on the tick if needed.
2422                  */
2423                 if ((i & 7) == 7)
2424                         lwkt_yield();
2425
2426                 /*
2427                  * Calculate the page to pre-fault, stopping the scan in
2428                  * each direction separately if the limit is reached.
2429                  */
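                /*
                 * (i = 0, 1, 2, 3, ... visits addra + 1, addra - 1,
                 * addra + 2, addra - 2, ... pages respectively.)
                 */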
2430                 if (i & 1) {
2431                         if (noneg)
2432                                 continue;
2433                         addr = addra - ((i + 1) >> 1) * PAGE_SIZE;
2434                 } else {
2435                         if (nopos)
2436                                 continue;
2437                         addr = addra + ((i + 2) >> 1) * PAGE_SIZE;
2438                 }
2439                 if (addr < entry->start) {
2440                         noneg = 1;
2441                         if (noneg && nopos)
2442                                 break;
2443                         continue;
2444                 }
2445                 if (addr >= entry->end) {
2446                         nopos = 1;
2447                         if (noneg && nopos)
2448                                 break;
2449                         continue;
2450                 }
2451
2452                 /*
2453                  * Skip pages already mapped, and stop scanning in that
2454                  * direction.  When the scan terminates in both directions
2455                  * we are done.
2456                  */
2457                 if (pmap_prefault_ok(pmap, addr) == 0) {
2458                         if (i & 1)
2459                                 noneg = 1;
2460                         else
2461                                 nopos = 1;
2462                         if (noneg && nopos)
2463                                 break;
2464                         continue;
2465                 }
2466
2467                 /*
2468                  * Follow the VM object chain to obtain the page to be mapped
2469                  * into the pmap.
2470                  *
2471                  * If we reach the terminal object without finding a page
2472                  * and we determine it would be advantageous, then allocate
2473                  * a zero-fill page for the base object.  The base object
2474                  * is guaranteed to be OBJT_DEFAULT for this case.
2475                  *
2476                  * In order to not have to check the pager via *haspage*()
2477                  * we stop if any non-default object is encountered.  e.g.
2478                  * a vnode or swap object would stop the loop.
2479                  */
2480                 index = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2481                 lobject = object;
2482                 pindex = index;
2483                 pprot = prot;
2484
2485                 KKASSERT(lobject == entry->object.vm_object);
2486                 /*vm_object_hold(lobject); implied */
2487
2488                 while ((m = vm_page_lookup_busy_try(lobject, pindex,
2489                                                     TRUE, &error)) == NULL) {
2490                         if (lobject->type != OBJT_DEFAULT)
2491                                 break;
2492                         if (lobject->backing_object == NULL) {
2493                                 if (vm_fast_fault == 0)
2494                                         break;
2495                                 if ((prot & VM_PROT_WRITE) == 0 ||
2496                                     vm_page_count_min(0)) {
2497                                         break;
2498                                 }
2499
2500                                 /*
2501                                  * NOTE: Allocated from base object
2502                                  */
2503                                 m = vm_page_alloc(object, index,
2504                                                   VM_ALLOC_NORMAL |
2505                                                   VM_ALLOC_ZERO |
2506                                                   VM_ALLOC_USE_GD |
2507                                                   VM_ALLOC_NULL_OK);
2508                                 if (m == NULL)
2509                                         break;
2510                                 allocated = 1;
2511                                 pprot = prot;
2512                                 /* lobject = object .. not needed */
2513                                 break;
2514                         }
2515                         if (lobject->backing_object_offset & PAGE_MASK)
2516                                 break;
2517                         nobject = lobject->backing_object;
2518                         vm_object_hold(nobject);
2519                         KKASSERT(nobject == lobject->backing_object);
2520                         pindex += lobject->backing_object_offset >> PAGE_SHIFT;
2521                         if (lobject != object) {
2522                                 vm_object_lock_swap();
2523                                 vm_object_drop(lobject);
2524                         }
2525                         lobject = nobject;
2526                         pprot &= ~VM_PROT_WRITE;
2527                         vm_object_chain_acquire(lobject, 0);
2528                 }
2529
2530                 /*
2531                  * NOTE: A non-NULL (m) will be associated with lobject if
2532                  *       it was found there, otherwise it is probably a
2533                  *       zero-fill page associated with the base object.
2534                  *
2535                  * Give-up if no page is available.
2536                  */
2537                 if (m == NULL) {
2538                         if (lobject != object) {
2539 #if 0
2540                                 if (object->backing_object != lobject)
2541                                         vm_object_hold(object->backing_object);
2542 #endif
2543                                 vm_object_chain_release_all(
2544                                         object->backing_object, lobject);
2545 #if 0
2546                                 if (object->backing_object != lobject)
2547                                         vm_object_drop(object->backing_object);
2548 #endif
2549                                 vm_object_drop(lobject);
2550                         }
2551                         break;
2552                 }
2553
2554                 /*
2555                  * The object must be marked dirty if we are mapping a
2556                  * writable page.  m->object is either lobject or object,
2557                  * both of which are still held.  Do this before we
2558                  * potentially drop the object.
2559                  */
2560                 if (pprot & VM_PROT_WRITE)
2561                         vm_object_set_writeable_dirty(m->object);
2562
2563                 /*
2564                  * Do not conditionalize on PG_RAM.  If pages are present in
2565                  * the VM system we assume optimal caching.  If caching is
2566                  * not optimal the I/O gravy train will be restarted when we
2567                  * hit an unavailable page.  We do not want to try to restart
2568                  * the gravy train now because we really don't know how much
2569                  * of the object has been cached.  The cost for restarting
2570                  * the gravy train should be low (since accesses will likely
2571                  * be I/O bound anyway).
2572                  */
2573                 if (lobject != object) {
2574 #if 0
2575                         if (object->backing_object != lobject)
2576                                 vm_object_hold(object->backing_object);
2577 #endif
2578                         vm_object_chain_release_all(object->backing_object,
2579                                                     lobject);
2580 #if 0
2581                         if (object->backing_object != lobject)
2582                                 vm_object_drop(object->backing_object);
2583 #endif
2584                         vm_object_drop(lobject);
2585                 }
2586
2587                 /*
2588                  * Enter the page into the pmap if appropriate.  If we had
2589                  * allocated the page we have to place it on a queue.  If not
2590                  * we just have to make sure it isn't on the cache queue
2591                  * (pages on the cache queue are not allowed to be mapped).
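                 *
                 * Four cases follow: a freshly allocated zero-fill page, a
                 * page we could not busy (skipped, no wakeup), a fully
                 * valid page that is entered directly, and anything else
                 * which is simply woken up without being mapped.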
2592                  */
2593                 if (allocated) {
2594                         /*
2595                          * The page must be zeroed.
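                         *
                         * If the allocator returned a page already marked
                         * PG_ZERO we just clear the flag and count it as an
                         * optimized zero-fill (v_ozfod); otherwise zero it
                         * here before marking it fully valid.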
2596                          */
2597                         if ((m->flags & PG_ZERO) == 0) {
2598                                 vm_page_zero_fill(m);
2599                         } else {
2600 #ifdef PMAP_DEBUG
2601                                 pmap_page_assertzero(
2602                                                 VM_PAGE_TO_PHYS(m));
2603 #endif
2604                                 vm_page_flag_clear(m, PG_ZERO);
2605                                 mycpu->gd_cnt.v_ozfod++;
2606                         }
2607                         mycpu->gd_cnt.v_zfod++;
2608                         m->valid = VM_PAGE_BITS_ALL;
2609
2610                         /*
2611                          * Handle dirty page case
2612                          */
2613                         if (pprot & VM_PROT_WRITE)
2614                                 vm_set_nosync(m, entry);
2615                         pmap_enter(pmap, addr, m, pprot, 0, entry);
2616                         mycpu->gd_cnt.v_vm_faults++;
2617                         if (curthread->td_lwp)
2618                                 ++curthread->td_lwp->lwp_ru.ru_minflt;
2619                         vm_page_deactivate(m);
2620                         if (pprot & VM_PROT_WRITE) {
2621                                 /*vm_object_set_writeable_dirty(m->object);*/
2622                                 vm_set_nosync(m, entry);
2623                                 if (fault_flags & VM_FAULT_DIRTY) {
2624                                         vm_page_dirty(m);
2625                                         /*XXX*/
2626                                         swap_pager_unswapped(m);
2627                                 }
2628                         }
2629                         vm_page_wakeup(m);
2630                 } else if (error) {
2631                         /* couldn't busy page, no wakeup */
2632                 } else if (
2633                     ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2634                     (m->flags & PG_FICTITIOUS) == 0) {
2635                         /*
2636                          * A fully valid page not undergoing soft I/O can
2637                          * be immediately entered into the pmap.
2638                          */
2639                         if ((m->queue - m->pc) == PQ_CACHE)
2640                                 vm_page_deactivate(m);
2641                         if (pprot & VM_PROT_WRITE) {
2642                                 /*vm_object_set_writeable_dirty(m->object);*/
2643                                 vm_set_nosync(m, entry);
2644                                 if (fault_flags & VM_FAULT_DIRTY) {
2645                                         vm_page_dirty(m);
2646                                         /*XXX*/
2647                                         swap_pager_unswapped(m);
2648                                 }
2649                         }
2652                         pmap_enter(pmap, addr, m, pprot, 0, entry);
2653                         mycpu->gd_cnt.v_vm_faults++;
2654                         if (curthread->td_lwp)
2655                                 ++curthread->td_lwp->lwp_ru.ru_minflt;
2656                         vm_page_wakeup(m);
2657                 } else {
2658                         vm_page_wakeup(m);
2659                 }
2660         }
2661         vm_object_chain_release(object);
2662         vm_object_drop(object);
2663 }
2664
2665 /*
2666  * Object can be held shared
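 *
 * vm_prefault_quick() prefaults pages on either side of addra into the
 * pmap for a terminal object (one with no backing store chain).  It is a
 * simplified prefault path that can run with the object token held only
 * shared.
 *
 * Illustrative call shape only (the actual call sites are elsewhere in
 * this file):
 *
 *	vm_prefault_quick(pmap, vaddr, entry, prot, fault_flags);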
2667  */
2668 static void
2669 vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
2670                   vm_map_entry_t entry, int prot, int fault_flags)
2671 {
2672         struct lwp *lp;
2673         vm_page_t m;
2674         vm_offset_t addr;
2675         vm_pindex_t pindex;
2676         vm_object_t object;
2677         int i;
2678         int noneg;
2679         int nopos;
2680         int maxpages;
2681
2682         /*
2683          * Take a stable snapshot of the max page count; 0 disables prefaulting.
2684          */
2685         maxpages = vm_prefault_pages;
2686         cpu_ccfence();
2687         if (maxpages <= 0)
2688                 return;
2689
2690         /*
2691          * We do not currently prefault mappings that use virtual page
2692          * tables.  We do not prefault foreign pmaps.
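         *
         * Only terminal objects (those with no backing_object chain) are
         * handled by this quick path.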
2693          */
2694         if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
2695                 return;
2696         lp = curthread->td_lwp;
2697         if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace)))
2698                 return;
2699         object = entry->object.vm_object;
2700         if (object->backing_object != NULL)
2701                 return;
2702         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2703
2704         /*
2705          * Limit pre-fault count to 1024 pages.
2706          */
2707         if (maxpages > 1024)
2708                 maxpages = 1024;
2709
2710         noneg = 0;
2711         nopos = 0;
2712         for (i = 0; i < maxpages; ++i) {
2713                 int error;
2714
2715                 /*
2716                  * Calculate the page to pre-fault.  The scan stops
2717                  * independently in each direction once it leaves the map entry.
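                 *
                 * The scan alternates around addra: i = 0, 1, 2, 3, ...
                 * probes addra + 1, addra - 1, addra + 2, addra - 2, ...
                 * pages, so addra itself is never re-faulted here.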
2718                  */
2719                 if (i & 1) {
2720                         if (noneg)
2721                                 continue;
2722                         addr = addra - ((i + 1) >> 1) * PAGE_SIZE;
2723                 } else {
2724                         if (nopos)
2725                                 continue;
2726                         addr = addra + ((i + 2) >> 1) * PAGE_SIZE;
2727                 }
2728                 if (addr < entry->start) {
2729                         noneg = 1;
2730                         if (noneg && nopos)
2731                                 break;
2732                         continue;
2733                 }
2734                 if (addr >= entry->end) {
2735                         nopos = 1;
2736                         if (noneg && nopos)
2737                                 break;
2738                         continue;
2739                 }
2740
2741                 /*
2742                  * Skip pages already mapped, and stop scanning in that
2743                  * direction.  When the scan terminates in both directions
2744                  * we are done.
2745                  */
2746                 if (pmap_prefault_ok(pmap, addr) == 0) {
2747                         if (i & 1)
2748                                 noneg = 1;
2749                         else
2750                                 nopos = 1;
2751                         if (noneg && nopos)
2752                                 break;
2753                         continue;
2754                 }
2755
2756                 /*
2757                  * Follow the VM object chain to obtain the page to be mapped
2758                  * into the pmap.  This version of the prefault code only
2759                  * works with terminal objects.
2760                  *
2761                  * WARNING!  We cannot call swap_pager_unswapped() with a
2762                  *           shared token.
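                 *
                 * The pindex is the page offset of 'addr' within the map
                 * entry plus the entry's offset into the object.  For
                 * example, addr = entry->start + 3 pages with
                 * entry->offset = 2 pages yields pindex 5.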
2763                  */
2764                 pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2765
2766                 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
2767                 if (m == NULL || error)
2768                         continue;
2769
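                /*
                 * Map the page only if it is fully valid, not fictitious,
                 * and would not require swap_pager_unswapped() (see the
                 * WARNING above): a swapped page being dirtied by a write
                 * under VM_FAULT_DIRTY is left for the normal fault path.
                 */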
2770                 if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2771                     (m->flags & PG_FICTITIOUS) == 0 &&
2772                     ((m->flags & PG_SWAPPED) == 0 ||
2773                      (prot & VM_PROT_WRITE) == 0 ||
2774                      (fault_flags & VM_FAULT_DIRTY) == 0)) {
2775                         /*
2776                          * A fully valid page not undergoing soft I/O can
2777                          * be immediately entered into the pmap.
2778                          */
2779                         if ((m->queue - m->pc) == PQ_CACHE)
2780                                 vm_page_deactivate(m);
2781                         if (prot & VM_PROT_WRITE) {
2782                                 vm_object_set_writeable_dirty(m->object);
2783                                 vm_set_nosync(m, entry);
2784                                 if (fault_flags & VM_FAULT_DIRTY) {
2785                                         vm_page_dirty(m);
2786                                         /*XXX*/
2787                                         swap_pager_unswapped(m);
2788                                 }
2789                         }
2790                         pmap_enter(pmap, addr, m, prot, 0, entry);
2791                         mycpu->gd_cnt.v_vm_faults++;
2792                         if (curthread->td_lwp)
2793                                 ++curthread->td_lwp->lwp_ru.ru_minflt;
2794                 }
2795                 vm_page_wakeup(m);
2796         }
2797 }