kernel - Move CPUMASK_LOCK out of the cpumask_t
[dragonfly.git] / sys / platform / pc64 / x86_64 / pmap.c
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * Copyright (c) 1994 John S. Dyson
4  * Copyright (c) 1994 David Greenman
5  * Copyright (c) 2003 Peter Wemm
6  * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
7  * Copyright (c) 2008, 2009 The DragonFly Project.
8  * Copyright (c) 2008, 2009 Jordan Gordeev.
9  * Copyright (c) 2011-2012 Matthew Dillon
10  * All rights reserved.
11  *
12  * This code is derived from software contributed to Berkeley by
13  * the Systems Programming Group of the University of Utah Computer
14  * Science Department and William Jolitz of UUNET Technologies Inc.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  * 3. All advertising materials mentioning features or use of this software
25  *    must display the following acknowledgement:
26  *      This product includes software developed by the University of
27  *      California, Berkeley and its contributors.
28  * 4. Neither the name of the University nor the names of its contributors
29  *    may be used to endorse or promote products derived from this software
30  *    without specific prior written permission.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
33  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
36  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42  * SUCH DAMAGE.
43  */
44 /*
45  * Manage physical address maps for x86-64 systems.
46  */
47
48 #if JG
49 #include "opt_disable_pse.h"
50 #include "opt_pmap.h"
51 #endif
52 #include "opt_msgbuf.h"
53
54 #include <sys/param.h>
55 #include <sys/kernel.h>
56 #include <sys/proc.h>
57 #include <sys/msgbuf.h>
58 #include <sys/vmmeter.h>
59 #include <sys/mman.h>
60 #include <sys/systm.h>
61
62 #include <vm/vm.h>
63 #include <vm/vm_param.h>
64 #include <sys/sysctl.h>
65 #include <sys/lock.h>
66 #include <vm/vm_kern.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_map.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_extern.h>
71 #include <vm/vm_pageout.h>
72 #include <vm/vm_pager.h>
73 #include <vm/vm_zone.h>
74
75 #include <sys/user.h>
76 #include <sys/thread2.h>
77 #include <sys/sysref2.h>
78 #include <sys/spinlock2.h>
79 #include <vm/vm_page2.h>
80
81 #include <machine/cputypes.h>
82 #include <machine/md_var.h>
83 #include <machine/specialreg.h>
84 #include <machine/smp.h>
85 #include <machine_base/apic/apicreg.h>
86 #include <machine/globaldata.h>
87 #include <machine/pmap.h>
88 #include <machine/pmap_inval.h>
89 #include <machine/inttypes.h>
90
91 #include <ddb/ddb.h>
92
93 #define PMAP_KEEP_PDIRS
94 #ifndef PMAP_SHPGPERPROC
95 #define PMAP_SHPGPERPROC 2000
96 #endif
97
98 #if defined(DIAGNOSTIC)
99 #define PMAP_DIAGNOSTIC
100 #endif
101
102 #define MINPV 2048
103
104 /*
105  * pmap debugging will report who owns a pv lock when blocking.
106  */
107 #ifdef PMAP_DEBUG
108
109 #define PMAP_DEBUG_DECL         ,const char *func, int lineno
110 #define PMAP_DEBUG_ARGS         , __func__, __LINE__
111 #define PMAP_DEBUG_COPY         , func, lineno
112
113 #define pv_get(pmap, pindex)            _pv_get(pmap, pindex            \
114                                                         PMAP_DEBUG_ARGS)
115 #define pv_lock(pv)                     _pv_lock(pv                     \
116                                                         PMAP_DEBUG_ARGS)
117 #define pv_hold_try(pv)                 _pv_hold_try(pv                 \
118                                                         PMAP_DEBUG_ARGS)
119 #define pv_alloc(pmap, pindex, isnewp)  _pv_alloc(pmap, pindex, isnewp  \
120                                                         PMAP_DEBUG_ARGS)
121
122 #else
123
124 #define PMAP_DEBUG_DECL
125 #define PMAP_DEBUG_ARGS
126 #define PMAP_DEBUG_COPY
127
128 #define pv_get(pmap, pindex)            _pv_get(pmap, pindex)
129 #define pv_lock(pv)                     _pv_lock(pv)
130 #define pv_hold_try(pv)                 _pv_hold_try(pv)
131 #define pv_alloc(pmap, pindex, isnewp)  _pv_alloc(pmap, pindex, isnewp)
132
133 #endif
134
135 /*
136  * Get PDEs and PTEs for user/kernel address space
137  */
138 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
139
140 #define pmap_pde_v(pmap, pte)           ((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
141 #define pmap_pte_w(pmap, pte)           ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
142 #define pmap_pte_m(pmap, pte)           ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
143 #define pmap_pte_u(pmap, pte)           ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
144 #define pmap_pte_v(pmap, pte)           ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
145
146 /*
147  * Given a map and a machine independent protection code,
148  * convert to a vax protection code.
149  */
150 #define pte_prot(m, p)          \
151         (m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
152 static int protection_codes[PROTECTION_CODES_SIZE];
153
154 struct pmap kernel_pmap;
155 static TAILQ_HEAD(,pmap)        pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
156
157 MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects");
158
159 vm_paddr_t avail_start;         /* PA of first available physical page */
160 vm_paddr_t avail_end;           /* PA of last available physical page */
161 vm_offset_t virtual2_start;     /* cutout free area prior to kernel start */
162 vm_offset_t virtual2_end;
163 vm_offset_t virtual_start;      /* VA of first avail page (after kernel bss) */
164 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
165 vm_offset_t KvaStart;           /* VA start of KVA space */
166 vm_offset_t KvaEnd;             /* VA end of KVA space (non-inclusive) */
167 vm_offset_t KvaSize;            /* max size of kernel virtual address space */
168 static boolean_t pmap_initialized = FALSE;      /* Has pmap_init completed? */
169 //static int pgeflag;           /* PG_G or-in */
170 //static int pseflag;           /* PG_PS or-in */
171 uint64_t PatMsr;
172
173 static int ndmpdp;
174 static vm_paddr_t dmaplimit;
175 static int nkpt;
176 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
177
178 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE];        /* PAT -> PG_ bits */
179 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/    /* PAT -> PG_ bits */
180
181 static uint64_t KPTbase;
182 static uint64_t KPTphys;
183 static uint64_t KPDphys;        /* phys addr of kernel level 2 */
184 static uint64_t KPDbase;        /* phys addr of kernel level 2 @ KERNBASE */
185 uint64_t KPDPphys;      /* phys addr of kernel level 3 */
186 uint64_t KPML4phys;     /* phys addr of kernel level 4 */
187
188 static uint64_t DMPDphys;       /* phys addr of direct mapped level 2 */
189 static uint64_t DMPDPphys;      /* phys addr of direct mapped level 3 */
190
191 /*
192  * Data for the pv entry allocation mechanism
193  */
194 static vm_zone_t pvzone;
195 static struct vm_zone pvzone_store;
196 static struct vm_object pvzone_obj;
197 static int pv_entry_max=0, pv_entry_high_water=0;
198 static int pmap_pagedaemon_waken = 0;
199 static struct pv_entry *pvinit;
200
201 /*
202  * All those kernel PT submaps that BSD is so fond of
203  */
204 pt_entry_t *CMAP1 = NULL, *ptmmap;
205 caddr_t CADDR1 = NULL, ptvmmap = NULL;
206 static pt_entry_t *msgbufmap;
207 struct msgbuf *msgbufp=NULL;
208
209 /*
210  * PMAP default PG_* bits. Needed to be able to add
211  * EPT/NPT pagetable pmap_bits for the VMM module
212  */
213 uint64_t pmap_bits_default[] = {
214                 REGULAR_PMAP,                                   /* TYPE_IDX             0 */
215                 X86_PG_V,                                       /* PG_V_IDX             1 */
216                 X86_PG_RW,                                      /* PG_RW_IDX            2 */
217                 X86_PG_U,                                       /* PG_U_IDX             3 */
218                 X86_PG_A,                                       /* PG_A_IDX             4 */
219                 X86_PG_M,                                       /* PG_M_IDX             5 */
220                 X86_PG_PS,                                      /* PG_PS_IDX            6 */
221                 X86_PG_G,                                       /* PG_G_IDX             7 */
222                 X86_PG_AVAIL1,                                  /* PG_AVAIL1_IDX        8 */
223                 X86_PG_AVAIL2,                                  /* PG_AVAIL2_IDX        9 */
224                 X86_PG_AVAIL3,                                  /* PG_AVAIL3_IDX        10 */
225                 X86_PG_NC_PWT | X86_PG_NC_PCD,                  /* PG_N_IDX     11 */
226 };
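/*
 * Sketch of how the indices above are consumed: code in this file avoids
 * hard-coding X86_PG_* values and instead composes PTEs through the
 * per-pmap table, so a VMM pmap can substitute EPT/NPT bit definitions.
 * A typical construction (see e.g. pmap_kenter() below) looks like:
 *
 *      npte = pa | kernel_pmap.pmap_bits[PG_RW_IDX] |
 *                  kernel_pmap.pmap_bits[PG_V_IDX];
 */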
227 /*
228  * Crashdump maps.
229  */
230 static pt_entry_t *pt_crashdumpmap;
231 static caddr_t crashdumpmap;
232
233 #ifdef PMAP_DEBUG2
234 static int pmap_enter_debug = 0;
235 SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
236     &pmap_enter_debug, 0, "Debug pmap_enter's");
237 #endif
238 static int pmap_yield_count = 64;
239 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
240     &pmap_yield_count, 0, "Yield during init_pt/release");
241 static int pmap_mmu_optimize = 0;
242 SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
243     &pmap_mmu_optimize, 0, "Share page table pages when possible");
244
245 #define DISABLE_PSE
246
247 /* Standard user access functions */
248 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
249     size_t *lencopied);
250 extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
251 extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
252 extern int std_fubyte (const void *base);
253 extern int std_subyte (void *base, int byte);
254 extern long std_fuword (const void *base);
255 extern int std_suword (void *base, long word);
256 extern int std_suword32 (void *base, int word);
257
258 static void pv_hold(pv_entry_t pv);
259 static int _pv_hold_try(pv_entry_t pv
260                                 PMAP_DEBUG_DECL);
261 static void pv_drop(pv_entry_t pv);
262 static void _pv_lock(pv_entry_t pv
263                                 PMAP_DEBUG_DECL);
264 static void pv_unlock(pv_entry_t pv);
265 static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
266                                 PMAP_DEBUG_DECL);
267 static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex
268                                 PMAP_DEBUG_DECL);
269 static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp);
270 static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex);
271 static void pv_put(pv_entry_t pv);
272 static void pv_free(pv_entry_t pv);
273 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
274 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
275                       pv_entry_t *pvpp);
276 static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex,
277                       pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va);
278 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
279                       struct pmap_inval_info *info);
280 static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
281 static int pmap_release_pv( struct pmap_inval_info *info,
282                       pv_entry_t pv, pv_entry_t pvp);
283
284 struct pmap_scan_info;
285 static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
286                       pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
287                       vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
288 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
289                       pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
290                       vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
291
292 static void i386_protection_init (void);
293 static void create_pagetables(vm_paddr_t *firstaddr);
294 static void pmap_remove_all (vm_page_t m);
295 static boolean_t pmap_testbit (vm_page_t m, int bit);
296
297 static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
298 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
299
300 static void pmap_pinit_defaults(struct pmap *pmap);
301
302 static unsigned pdir4mb;
303
304 static int
305 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
306 {
307         if (pv1->pv_pindex < pv2->pv_pindex)
308                 return(-1);
309         if (pv1->pv_pindex > pv2->pv_pindex)
310                 return(1);
311         return(0);
312 }
313
314 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
315              pv_entry_compare, vm_pindex_t, pv_pindex);
316
317 static __inline
318 void
319 pmap_page_stats_adding(vm_page_t m)
320 {
321         globaldata_t gd = mycpu;
322
323         if (TAILQ_EMPTY(&m->md.pv_list)) {
324                 ++gd->gd_vmtotal.t_arm;
325         } else if (TAILQ_FIRST(&m->md.pv_list) ==
326                    TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
327                 ++gd->gd_vmtotal.t_armshr;
328                 ++gd->gd_vmtotal.t_avmshr;
329         } else {
330                 ++gd->gd_vmtotal.t_avmshr;
331         }
332 }
333
334 static __inline
335 void
336 pmap_page_stats_deleting(vm_page_t m)
337 {
338         globaldata_t gd = mycpu;
339
340         if (TAILQ_EMPTY(&m->md.pv_list)) {
341                 --gd->gd_vmtotal.t_arm;
342         } else if (TAILQ_FIRST(&m->md.pv_list) ==
343                    TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
344                 --gd->gd_vmtotal.t_armshr;
345                 --gd->gd_vmtotal.t_avmshr;
346         } else {
347                 --gd->gd_vmtotal.t_avmshr;
348         }
349 }
350
351 /*
352  * Move the kernel virtual free pointer to the next
353  * 2MB.  This is used to help improve performance
354  * by using a large (2MB) page for much of the kernel
355  * (.text, .data, .bss)
356  */
357 static
358 vm_offset_t
359 pmap_kmem_choose(vm_offset_t addr)
360 {
361         vm_offset_t newaddr = addr;
362
363         newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
364         return newaddr;
365 }
366
367 /*
368  * pmap_pte_quick:
369  *
370  *      Super fast pmap_pte routine best used when scanning the pv lists.
371  *      This eliminates many coarse-grained invltlb calls.  Note that many of
372  *      the pv list scans are across different pmaps and it is very wasteful
373  *      to do an entire invltlb when checking a single mapping.
374  */
375 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);
376
377 static
378 pt_entry_t *
379 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
380 {
381         return pmap_pte(pmap, va);
382 }
383
384 /*
385  * Returns the pindex of a page table entry (representing a terminal page).
386  * There are NUPTE_TOTAL page table entries possible (a huge number)
387  *
388  * x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
389  * We want to properly translate negative KVAs.
390  */
391 static __inline
392 vm_pindex_t
393 pmap_pte_pindex(vm_offset_t va)
394 {
395         return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
396 }
397
398 /*
399  * Returns the pindex of a page table.
400  */
401 static __inline
402 vm_pindex_t
403 pmap_pt_pindex(vm_offset_t va)
404 {
405         return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
406 }
407
408 /*
409  * Returns the pindex of a page directory.
410  */
411 static __inline
412 vm_pindex_t
413 pmap_pd_pindex(vm_offset_t va)
414 {
415         return (NUPTE_TOTAL + NUPT_TOTAL +
416                 ((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
417 }
418
419 static __inline
420 vm_pindex_t
421 pmap_pdp_pindex(vm_offset_t va)
422 {
423         return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
424                 ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
425 }
426
427 static __inline
428 vm_pindex_t
429 pmap_pml4_pindex(void)
430 {
431         return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
432 }
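/*
 * Rough layout of the flat PV pindex space established by the helpers
 * above (taken directly from their return expressions):
 *
 *      pmap_pte_pindex():  [0, NUPTE_TOTAL)                       terminal PTEs
 *      pmap_pt_pindex():   NUPTE_TOTAL + ...                      page tables
 *      pmap_pd_pindex():   NUPTE_TOTAL + NUPT_TOTAL + ...         page dirs
 *      pmap_pdp_pindex():  NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + ...  PDPs
 *      pmap_pml4_pindex(): NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL
 */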
433
434 /*
435  * Return various clipped indexes for a given VA
436  *
437  * Returns the index of a pte in a page table, representing a terminal
438  * page.
439  */
440 static __inline
441 vm_pindex_t
442 pmap_pte_index(vm_offset_t va)
443 {
444         return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
445 }
446
447 /*
448  * Returns the index of a pt in a page directory, representing a page
449  * table.
450  */
451 static __inline
452 vm_pindex_t
453 pmap_pt_index(vm_offset_t va)
454 {
455         return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
456 }
457
458 /*
459  * Returns the index of a pd in a page directory page, representing a page
460  * directory.
461  */
462 static __inline
463 vm_pindex_t
464 pmap_pd_index(vm_offset_t va)
465 {
466         return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
467 }
468
469 /*
470  * Returns the index of a pdp in the pml4 table, representing a page
471  * directory page.
472  */
473 static __inline
474 vm_pindex_t
475 pmap_pdp_index(vm_offset_t va)
476 {
477         return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
478 }
479
480 /*
481  * Generic procedure to index a pte from a pt, pd, or pdp.
482  *
483  * NOTE: Normally passed pindex as pmap_xx_index().  pmap_xx_pindex() is NOT
484  *       a page table page index but is instead a PV lookup index.
485  */
486 static
487 void *
488 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
489 {
490         pt_entry_t *pte;
491
492         pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
493         return(&pte[pindex]);
494 }
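/*
 * Usage sketch: pv_pte_lookup() is given one of the clipped pmap_*_index()
 * values (NOT a pmap_*_pindex() PV-lookup index).  For example, to find the
 * PTE for 'va' within a held page-table PV, the pattern used by
 * pmap_extract() below is roughly:
 *
 *      pt_pv = pv_find(pmap, pmap_pt_pindex(va));
 *      ptep  = pv_pte_lookup(pt_pv, pmap_pte_index(va));
 */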
495
496 /*
497  * Return pointer to PDP slot in the PML4
498  */
499 static __inline
500 pml4_entry_t *
501 pmap_pdp(pmap_t pmap, vm_offset_t va)
502 {
503         return (&pmap->pm_pml4[pmap_pdp_index(va)]);
504 }
505
506 /*
507  * Return pointer to PD slot in the PDP given a pointer to the PDP
508  */
509 static __inline
510 pdp_entry_t *
511 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va)
512 {
513         pdp_entry_t *pd;
514
515         pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME);
516         return (&pd[pmap_pd_index(va)]);
517 }
518
519 /*
520  * Return pointer to PD slot in the PDP.
521  */
522 static __inline
523 pdp_entry_t *
524 pmap_pd(pmap_t pmap, vm_offset_t va)
525 {
526         pml4_entry_t *pdp;
527
528         pdp = pmap_pdp(pmap, va);
529         if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0)
530                 return NULL;
531         return (pmap_pdp_to_pd(*pdp, va));
532 }
533
534 /*
535  * Return pointer to PT slot in the PD given a pointer to the PD
536  */
537 static __inline
538 pd_entry_t *
539 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va)
540 {
541         pd_entry_t *pt;
542
543         pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME);
544         return (&pt[pmap_pt_index(va)]);
545 }
546
547 /*
548  * Return pointer to PT slot in the PD
549  *
550  * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs,
551  *                   so we cannot lookup the PD via the PDP.  Instead we
552  *                   must look it up via the pmap.
553  */
554 static __inline
555 pd_entry_t *
556 pmap_pt(pmap_t pmap, vm_offset_t va)
557 {
558         pdp_entry_t *pd;
559         pv_entry_t pv;
560         vm_pindex_t pd_pindex;
561
562         if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
563                 pd_pindex = pmap_pd_pindex(va);
564                 spin_lock(&pmap->pm_spin);
565                 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex);
566                 spin_unlock(&pmap->pm_spin);
567                 if (pv == NULL || pv->pv_m == NULL)
568                         return NULL;
569                 return (pmap_pd_to_pt(VM_PAGE_TO_PHYS(pv->pv_m), va));
570         } else {
571                 pd = pmap_pd(pmap, va);
572                 if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0)
573                          return NULL;
574                 return (pmap_pd_to_pt(*pd, va));
575         }
576 }
577
578 /*
579  * Return pointer to PTE slot in the PT given a pointer to the PT
580  */
581 static __inline
582 pt_entry_t *
583 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va)
584 {
585         pt_entry_t *pte;
586
587         pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME);
588         return (&pte[pmap_pte_index(va)]);
589 }
590
591 /*
592  * Return pointer to PTE slot in the PT
593  */
594 static __inline
595 pt_entry_t *
596 pmap_pte(pmap_t pmap, vm_offset_t va)
597 {
598         pd_entry_t *pt;
599
600         pt = pmap_pt(pmap, va);
601         if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0)
602                  return NULL;
603         if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0)
604                 return ((pt_entry_t *)pt);
605         return (pmap_pt_to_pte(*pt, va));
606 }
607
608 /*
609  * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
610  * the PT layer.  This will speed up core pmap operations considerably.
611  *
612  * NOTE: The pmap spinlock does not need to be held but the passed-in pv
613  *       must be in a known associated state (typically by being locked when
614  *       the pmap spinlock isn't held).  We allow the race for that case.
615  */
616 static __inline
617 void
618 pv_cache(pv_entry_t pv, vm_pindex_t pindex)
619 {
620         if (pindex >= pmap_pt_pindex(0) && pindex <= pmap_pd_pindex(0))
621                 pv->pv_pmap->pm_pvhint = pv;
622 }
623
624
625 /*
626  * Return address of PT slot in PD (KVM only)
627  *
628  * Cannot be used for user page tables because it might interfere with
629  * the shared page-table-page optimization (pmap_mmu_optimize).
630  */
631 static __inline
632 pd_entry_t *
633 vtopt(vm_offset_t va)
634 {
635         uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
636                                   NPML4EPGSHIFT)) - 1);
637
638         return (PDmap + ((va >> PDRSHIFT) & mask));
639 }
640
641 /*
642  * KVM - return address of PTE slot in PT
643  */
644 static __inline
645 pt_entry_t *
646 vtopte(vm_offset_t va)
647 {
648         uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
649                                   NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
650
651         return (PTmap + ((va >> PAGE_SHIFT) & mask));
652 }
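/*
 * Note on the above: both vtopt() and vtopte() depend on the recursive
 * PML4 slot installed by create_pagetables() (the PML4 maps itself at
 * PML4PML4I), which is what turns PDmap/PTmap into linear windows onto
 * the live kernel page tables.  E.g. '*vtopte(va)' is the PTE currently
 * mapping the kernel VA 'va'.  These helpers are for KVM only; user
 * pmaps must be walked via pmap_pte() or the PV lookups instead.
 */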
653
654 static uint64_t
655 allocpages(vm_paddr_t *firstaddr, long n)
656 {
657         uint64_t ret;
658
659         ret = *firstaddr;
660         bzero((void *)ret, n * PAGE_SIZE);
661         *firstaddr += n * PAGE_SIZE;
662         return (ret);
663 }
664
665 static
666 void
667 create_pagetables(vm_paddr_t *firstaddr)
668 {
669         long i;         /* must be 64 bits */
670         long nkpt_base;
671         long nkpt_phys;
672         int j;
673
674         /*
675          * We are running (mostly) V=P at this point
676          *
677          * Calculate NKPT - number of kernel page tables.  We have to
678          * accommodate preallocation of the vm_page_array, dump bitmap,
679          * MSGBUF_SIZE, and other stuff.  Be generous.
680          *
681          * Maxmem is in pages.
682          *
683          * ndmpdp is the number of 1GB pages we wish to map.
684          */
685         ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
686         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
687                 ndmpdp = 4;
688         KKASSERT(ndmpdp <= NKPDPE * NPDEPG);
689
690         /*
691          * Starting at the beginning of kvm (not KERNBASE).
692          */
693         nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
694         nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
695         nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E +
696                        ndmpdp) + 511) / 512;
697         nkpt_phys += 128;
698
699         /*
700          * Starting at KERNBASE - map 2G worth of page table pages.
701          * KERNBASE is offset -2G from the end of kvm.
702          */
703         nkpt_base = (NPDPEPG - KPDPI) * NPTEPG; /* typically 2 x 512 */
704
705         /*
706          * Allocate pages
707          */
708         KPTbase = allocpages(firstaddr, nkpt_base);
709         KPTphys = allocpages(firstaddr, nkpt_phys);
710         KPML4phys = allocpages(firstaddr, 1);
711         KPDPphys = allocpages(firstaddr, NKPML4E);
712         KPDphys = allocpages(firstaddr, NKPDPE);
713
714         /*
715          * Calculate the page directory base for KERNBASE,
716          * that is where we start populating the page table pages.
717          * Basically this is the end - 2.
718          */
719         KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);
720
721         DMPDPphys = allocpages(firstaddr, NDMPML4E);
722         if ((amd_feature & AMDID_PAGE1GB) == 0)
723                 DMPDphys = allocpages(firstaddr, ndmpdp);
724         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
725
726         /*
727          * Fill in the underlying page table pages for the area around
728          * KERNBASE.  This remaps low physical memory to KERNBASE.
729          *
730          * Read-only from zero to physfree
731          * XXX not fully used, underneath 2M pages
732          */
733         for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
734                 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
735                 ((pt_entry_t *)KPTbase)[i] |=
736                     pmap_bits_default[PG_RW_IDX] |
737                     pmap_bits_default[PG_V_IDX] |
738                     pmap_bits_default[PG_G_IDX];
739         }
740
741         /*
742          * Now map the initial kernel page tables.  One block of page
743          * tables is placed at the beginning of kernel virtual memory,
744          * and another block is placed at KERNBASE to map the kernel binary,
745          * data, bss, and initial pre-allocations.
746          */
747         for (i = 0; i < nkpt_base; i++) {
748                 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
749                 ((pd_entry_t *)KPDbase)[i] |=
750                     pmap_bits_default[PG_RW_IDX] |
751                     pmap_bits_default[PG_V_IDX];
752         }
753         for (i = 0; i < nkpt_phys; i++) {
754                 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
755                 ((pd_entry_t *)KPDphys)[i] |=
756                     pmap_bits_default[PG_RW_IDX] |
757                     pmap_bits_default[PG_V_IDX];
758         }
759
760         /*
761          * Map from zero to end of allocations using 2M pages as an
762          * optimization.  This will bypass some of the KPTBase pages
763          * above in the KERNBASE area.
764          */
765         for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
766                 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
767                 ((pd_entry_t *)KPDbase)[i] |=
768                     pmap_bits_default[PG_RW_IDX] |
769                     pmap_bits_default[PG_V_IDX] |
770                     pmap_bits_default[PG_PS_IDX] |
771                     pmap_bits_default[PG_G_IDX];
772         }
773
774         /*
775          * And connect up the PD to the PDP.  The kernel pmap is expected
776          * to pre-populate all of its PDs.  See NKPDPE in vmparam.h.
777          */
778         for (i = 0; i < NKPDPE; i++) {
779                 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
780                                 KPDphys + (i << PAGE_SHIFT);
781                 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
782                     pmap_bits_default[PG_RW_IDX] |
783                     pmap_bits_default[PG_V_IDX] |
784                     pmap_bits_default[PG_U_IDX];
785         }
786
787         /*
788          * Now set up the direct map space using either 2MB or 1GB pages.
789          * Preset PG_M and PG_A because demotion expects it.
790          *
791          * When filling in entries in the PD pages make sure any excess
792          * entries are set to zero, as we allocated enough PD pages.
793          */
794         if ((amd_feature & AMDID_PAGE1GB) == 0) {
795                 for (i = 0; i < NPDEPG * ndmpdp; i++) {
796                         ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
797                         ((pd_entry_t *)DMPDphys)[i] |=
798                             pmap_bits_default[PG_RW_IDX] |
799                             pmap_bits_default[PG_V_IDX] |
800                             pmap_bits_default[PG_PS_IDX] |
801                             pmap_bits_default[PG_G_IDX] |
802                             pmap_bits_default[PG_M_IDX] |
803                             pmap_bits_default[PG_A_IDX];
804                 }
805
806                 /*
807                  * And the direct map space's PDP
808                  */
809                 for (i = 0; i < ndmpdp; i++) {
810                         ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
811                                                         (i << PAGE_SHIFT);
812                         ((pdp_entry_t *)DMPDPphys)[i] |=
813                             pmap_bits_default[PG_RW_IDX] |
814                             pmap_bits_default[PG_V_IDX] |
815                             pmap_bits_default[PG_U_IDX];
816                 }
817         } else {
818                 for (i = 0; i < ndmpdp; i++) {
819                         ((pdp_entry_t *)DMPDPphys)[i] =
820                                                 (vm_paddr_t)i << PDPSHIFT;
821                         ((pdp_entry_t *)DMPDPphys)[i] |=
822                             pmap_bits_default[PG_RW_IDX] |
823                             pmap_bits_default[PG_V_IDX] |
824                             pmap_bits_default[PG_PS_IDX] |
825                             pmap_bits_default[PG_G_IDX] |
826                             pmap_bits_default[PG_M_IDX] |
827                             pmap_bits_default[PG_A_IDX];
828                 }
829         }
830
831         /* And recursively map PML4 to itself in order to get PTmap */
832         ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
833         ((pdp_entry_t *)KPML4phys)[PML4PML4I] |=
834             pmap_bits_default[PG_RW_IDX] |
835             pmap_bits_default[PG_V_IDX] |
836             pmap_bits_default[PG_U_IDX];
837
838         /*
839          * Connect the Direct Map slots up to the PML4
840          */
841         for (j = 0; j < NDMPML4E; ++j) {
842                 ((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
843                     (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
844                     pmap_bits_default[PG_RW_IDX] |
845                     pmap_bits_default[PG_V_IDX] |
846                     pmap_bits_default[PG_U_IDX];
847         }
848
849         /*
850          * Connect the KVA slot up to the PML4
851          */
852         ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
853         ((pdp_entry_t *)KPML4phys)[KPML4I] |=
854             pmap_bits_default[PG_RW_IDX] |
855             pmap_bits_default[PG_V_IDX] |
856             pmap_bits_default[PG_U_IDX];
857 }
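/*
 * For reference, the bootstrap PML4 built above ends up with (sketch):
 *
 *      KPML4phys[PML4PML4I]    -> KPML4phys itself   (recursive slot, PTmap)
 *      KPML4phys[DMPML4I + j]  -> DMPDPphys pages    (direct map, PHYS_TO_DMAP)
 *      KPML4phys[KPML4I]       -> KPDPphys           (kernel virtual space)
 */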
858
859 /*
860  *      Bootstrap the system enough to run with virtual memory.
861  *
862  *      On the i386 this is called after mapping has already been enabled
863  *      and just syncs the pmap module with what has already been done.
864  *      [We can't call it easily with mapping off since the kernel is not
865  *      mapped with PA == VA, hence we would have to relocate every address
866  *      from the linked base (virtual) address "KERNBASE" to the actual
867  *      (physical) address starting relative to 0]
868  */
869 void
870 pmap_bootstrap(vm_paddr_t *firstaddr)
871 {
872         vm_offset_t va;
873         pt_entry_t *pte;
874
875         KvaStart = VM_MIN_KERNEL_ADDRESS;
876         KvaEnd = VM_MAX_KERNEL_ADDRESS;
877         KvaSize = KvaEnd - KvaStart;
878
879         avail_start = *firstaddr;
880
881         /*
882          * Create an initial set of page tables to run the kernel in.
883          */
884         create_pagetables(firstaddr);
885
886         virtual2_start = KvaStart;
887         virtual2_end = PTOV_OFFSET;
888
889         virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
890         virtual_start = pmap_kmem_choose(virtual_start);
891
892         virtual_end = VM_MAX_KERNEL_ADDRESS;
893
894         /* XXX do %cr0 as well */
895         load_cr4(rcr4() | CR4_PGE | CR4_PSE);
896         load_cr3(KPML4phys);
897
898         /*
899          * Initialize protection array.
900          */
901         i386_protection_init();
902
903         /*
904          * The kernel's pmap is statically allocated so we don't have to use
905          * pmap_create, which is unlikely to work correctly at this part of
906          * the boot sequence (XXX and which no longer exists).
907          */
908         kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
909         kernel_pmap.pm_count = 1;
910         kernel_pmap.pm_active = (cpumask_t)-1;
911         RB_INIT(&kernel_pmap.pm_pvroot);
912         spin_init(&kernel_pmap.pm_spin);
913         lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
914
915         /*
916          * Reserve some special page table entries/VA space for temporary
917          * mapping of pages.
918          */
919 #define SYSMAP(c, p, v, n)      \
920         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
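        /*
         * For reference, SYSMAP(caddr_t, CMAP1, CADDR1, 1) below expands
         * (roughly) to:
         *
         *      CADDR1 = (caddr_t)va; va += 1 * PAGE_SIZE;
         *      CMAP1 = pte; pte += 1;
         *
         * i.e. it carves pages of VA out of virtual_start and remembers
         * the corresponding kernel PTE slot(s).
         */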
921
922         va = virtual_start;
923         pte = vtopte(va);
924
925         /*
926          * CMAP1/CMAP2 are used for zeroing and copying pages.
927          */
928         SYSMAP(caddr_t, CMAP1, CADDR1, 1)
929
930         /*
931          * Crashdump maps.
932          */
933         SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
934
935         /*
936          * ptvmmap is used for reading arbitrary physical pages via
937          * /dev/mem.
938          */
939         SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
940
941         /*
942          * msgbufp is used to map the system message buffer.
943          * XXX msgbufmap is not used.
944          */
945         SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
946                atop(round_page(MSGBUF_SIZE)))
947
948         virtual_start = va;
949
950         *CMAP1 = 0;
951
952         /*
953          * PG_G is terribly broken on SMP because we IPI invltlb's in some
954  * cases rather than invlpg.  Actually, I don't even know why it
955          * works under UP because self-referential page table mappings
956          */
957 //      pgeflag = 0;
958
959 /*
960  * Initialize the 4MB page size flag
961  */
962 //      pseflag = 0;
963 /*
964  * The 4MB page version of the initial
965  * kernel page mapping.
966  */
967         pdir4mb = 0;
968
969 #if !defined(DISABLE_PSE)
970         if (cpu_feature & CPUID_PSE) {
971                 pt_entry_t ptditmp;
972                 /*
973                  * Note that we have enabled PSE mode
974                  */
975 //              pseflag = kernel_pmap.pmap_bits[PG_PS_IDX];
976                 ptditmp = *(PTmap + x86_64_btop(KERNBASE));
977                 ptditmp &= ~(NBPDR - 1);
978                 ptditmp |= pmap_bits_default[PG_V_IDX] |
979                     pmap_bits_default[PG_RW_IDX] |
980                     pmap_bits_default[PG_PS_IDX] |
981                     pmap_bits_default[PG_U_IDX];
982 //                  pgeflag;
983                 pdir4mb = ptditmp;
984         }
985 #endif
986         cpu_invltlb();
987
988         /* Initialize the PAT MSR */
989         pmap_init_pat();
990
991         pmap_pinit_defaults(&kernel_pmap);
992 }
993
994 /*
995  * Setup the PAT MSR.
996  */
997 void
998 pmap_init_pat(void)
999 {
1000         uint64_t pat_msr;
1001         u_long cr0, cr4;
1002
1003         /*
1004          * Default values mapping PATi,PCD,PWT bits at system reset.
1005          * The default values effectively ignore the PATi bit by
1006          * repeating the encodings for 0-3 in 4-7, and map the PCD
1007          * and PWT bit combinations to the expected PAT types.
1008          */
1009         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |        /* 000 */
1010                   PAT_VALUE(1, PAT_WRITE_THROUGH) |     /* 001 */
1011                   PAT_VALUE(2, PAT_UNCACHED) |          /* 010 */
1012                   PAT_VALUE(3, PAT_UNCACHEABLE) |       /* 011 */
1013                   PAT_VALUE(4, PAT_WRITE_BACK) |        /* 100 */
1014                   PAT_VALUE(5, PAT_WRITE_THROUGH) |     /* 101 */
1015                   PAT_VALUE(6, PAT_UNCACHED) |          /* 110 */
1016                   PAT_VALUE(7, PAT_UNCACHEABLE);        /* 111 */
1017         pat_pte_index[PAT_WRITE_BACK]   = 0;
1018         pat_pte_index[PAT_WRITE_THROUGH]= 0         | X86_PG_NC_PWT;
1019         pat_pte_index[PAT_UNCACHED]     = X86_PG_NC_PCD;
1020         pat_pte_index[PAT_UNCACHEABLE]  = X86_PG_NC_PCD | X86_PG_NC_PWT;
1021         pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE];
1022         pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE];
1023
1024         if (cpu_feature & CPUID_PAT) {
1025                 /*
1026                  * If we support the PAT then set-up entries for
1027                  * WRITE_PROTECTED and WRITE_COMBINING using bit patterns
1028                  * 4 and 5.
1029                  */
1030                 pat_msr = (pat_msr & ~PAT_MASK(4)) |
1031                           PAT_VALUE(4, PAT_WRITE_PROTECTED);
1032                 pat_msr = (pat_msr & ~PAT_MASK(5)) |
1033                           PAT_VALUE(5, PAT_WRITE_COMBINING);
1034                 pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | 0;
1035                 pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PWT;
1036
1037                 /*
1038                  * Then enable the PAT
1039                  */
1040
1041                 /* Disable PGE. */
1042                 cr4 = rcr4();
1043                 load_cr4(cr4 & ~CR4_PGE);
1044
1045                 /* Disable caches (CD = 1, NW = 0). */
1046                 cr0 = rcr0();
1047                 load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1048
1049                 /* Flushes caches and TLBs. */
1050                 wbinvd();
1051                 cpu_invltlb();
1052
1053                 /* Update PAT and index table. */
1054                 wrmsr(MSR_PAT, pat_msr);
1055
1056                 /* Flush caches and TLBs again. */
1057                 wbinvd();
1058                 cpu_invltlb();
1059
1060                 /* Restore caches and PGE. */
1061                 load_cr0(cr0);
1062                 load_cr4(cr4);
1063                 PatMsr = pat_msr;
1064         }
1065 }
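/*
 * Sketch of how the PAT setup above is consumed: pmap_pinit_defaults()
 * copies pat_pte_index[] into pmap->pmap_cache_bits[], and mapping code
 * then ORs the entry for a page's pat_mode into the new PTE, e.g. in
 * pmap_qenter():
 *
 *      *pte = VM_PAGE_TO_PHYS(*m) |
 *             kernel_pmap.pmap_bits[PG_RW_IDX] |
 *             kernel_pmap.pmap_bits[PG_V_IDX] |
 *             kernel_pmap.pmap_cache_bits[(*m)->pat_mode];
 */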
1066
1067 /*
1068  * Set 4mb pdir for mp startup
1069  */
1070 void
1071 pmap_set_opt(void)
1072 {
1073         if (cpu_feature & CPUID_PSE) {
1074                 load_cr4(rcr4() | CR4_PSE);
1075                 if (pdir4mb && mycpu->gd_cpuid == 0) {  /* only on BSP */
1076                         cpu_invltlb();
1077                 }
1078         }
1079 }
1080
1081 /*
1082  *      Initialize the pmap module.
1083  *      Called by vm_init, to initialize any structures that the pmap
1084  *      system needs to map virtual memory.
1085  *      pmap_init has been enhanced to support, in a fairly consistent
1086  *      way, discontiguous physical memory.
1087  */
1088 void
1089 pmap_init(void)
1090 {
1091         int i;
1092         int initial_pvs;
1093
1094         /*
1095          * Allocate memory for random pmap data structures.  Includes the
1096          * pv_head_table.
1097          */
1098
1099         for (i = 0; i < vm_page_array_size; i++) {
1100                 vm_page_t m;
1101
1102                 m = &vm_page_array[i];
1103                 TAILQ_INIT(&m->md.pv_list);
1104         }
1105
1106         /*
1107          * init the pv free list
1108          */
1109         initial_pvs = vm_page_array_size;
1110         if (initial_pvs < MINPV)
1111                 initial_pvs = MINPV;
1112         pvzone = &pvzone_store;
1113         pvinit = (void *)kmem_alloc(&kernel_map,
1114                                     initial_pvs * sizeof (struct pv_entry));
1115         zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
1116                   pvinit, initial_pvs);
1117
1118         /*
1119          * Now it is safe to enable pv_table recording.
1120          */
1121         pmap_initialized = TRUE;
1122 }
1123
1124 /*
1125  * Initialize the address space (zone) for the pv_entries.  Set a
1126  * high water mark so that the system can recover from excessive
1127  * numbers of pv entries.
1128  */
1129 void
1130 pmap_init2(void)
1131 {
1132         int shpgperproc = PMAP_SHPGPERPROC;
1133         int entry_max;
1134
1135         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
1136         pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
1137         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
1138         pv_entry_high_water = 9 * (pv_entry_max / 10);
1139
1140         /*
1141          * Subtract out pages already installed in the zone (hack)
1142          */
1143         entry_max = pv_entry_max - vm_page_array_size;
1144         if (entry_max <= 0)
1145                 entry_max = 1;
1146
1147         zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1);
1148 }
1149
1150 /*
1151  * Typically used to initialize a fictitious page by vm/device_pager.c
1152  */
1153 void
1154 pmap_page_init(struct vm_page *m)
1155 {
1156         vm_page_init(m);
1157         TAILQ_INIT(&m->md.pv_list);
1158 }
1159
1160 /***************************************************
1161  * Low level helper routines.....
1162  ***************************************************/
1163
1164 /*
1165  * this routine defines the region(s) of memory that should
1166  * not be tested for the modified bit.
1167  */
1168 static __inline
1169 int
1170 pmap_track_modified(vm_pindex_t pindex)
1171 {
1172         vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT;
1173         if ((va < clean_sva) || (va >= clean_eva)) 
1174                 return 1;
1175         else
1176                 return 0;
1177 }
1178
1179 /*
1180  * Extract the physical page address associated with the map/VA pair.
1181  * The page must be wired for this to work reliably.
1182  *
1183  * XXX for the moment we're using pv_find() instead of pv_get(), as
1184  *     callers might be expecting non-blocking operation.
1185  */
1186 vm_paddr_t 
1187 pmap_extract(pmap_t pmap, vm_offset_t va)
1188 {
1189         vm_paddr_t rtval;
1190         pv_entry_t pt_pv;
1191         pt_entry_t *ptep;
1192
1193         rtval = 0;
1194         if (va >= VM_MAX_USER_ADDRESS) {
1195                 /*
1196                  * Kernel page directories might be direct-mapped and
1197                  * there is typically no PV tracking of pte's
1198                  */
1199                 pd_entry_t *pt;
1200
1201                 pt = pmap_pt(pmap, va);
1202                 if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
1203                         if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
1204                                 rtval = *pt & PG_PS_FRAME;
1205                                 rtval |= va & PDRMASK;
1206                         } else {
1207                                 ptep = pmap_pt_to_pte(*pt, va);
1208                                 if (*pt & pmap->pmap_bits[PG_V_IDX]) {
1209                                         rtval = *ptep & PG_FRAME;
1210                                         rtval |= va & PAGE_MASK;
1211                                 }
1212                         }
1213                 }
1214         } else {
1215                 /*
1216                  * User pages currently do not direct-map the page directory
1217                  * and some pages might not use managed PVs.  But all PT's
1218                  * will have a PV.
1219                  */
1220                 pt_pv = pv_find(pmap, pmap_pt_pindex(va));
1221                 if (pt_pv) {
1222                         ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
1223                         if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
1224                                 rtval = *ptep & PG_FRAME;
1225                                 rtval |= va & PAGE_MASK;
1226                         }
1227                         pv_drop(pt_pv);
1228                 }
1229         }
1230         return rtval;
1231 }
1232
1233 /*
1234  * Similar to extract but checks protections, SMP-friendly short-cut for
1235  * vm_fault_page[_quick]().  Can return NULL to cause the caller to
1236  * fall-through to the real fault code.
1237  *
1238  * The returned page, if not NULL, is held (and not busied).
1239  */
1240 vm_page_t
1241 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1242 {
1243         if (pmap && va < VM_MAX_USER_ADDRESS) {
1244                 pv_entry_t pt_pv;
1245                 pv_entry_t pte_pv;
1246                 pt_entry_t *ptep;
1247                 pt_entry_t req;
1248                 vm_page_t m;
1249                 int error;
1250
1251                 req = pmap->pmap_bits[PG_V_IDX] |
1252                       pmap->pmap_bits[PG_U_IDX];
1253                 if (prot & VM_PROT_WRITE)
1254                         req |= pmap->pmap_bits[PG_RW_IDX];
1255
1256                 pt_pv = pv_find(pmap, pmap_pt_pindex(va));
1257                 if (pt_pv == NULL)
1258                         return (NULL);
1259                 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
1260                 if ((*ptep & req) != req) {
1261                         pv_drop(pt_pv);
1262                         return (NULL);
1263                 }
1264                 pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), &error);
1265                 if (pte_pv && error == 0) {
1266                         m = pte_pv->pv_m;
1267                         vm_page_hold(m);
1268                         if (prot & VM_PROT_WRITE)
1269                                 vm_page_dirty(m);
1270                         pv_put(pte_pv);
1271                 } else if (pte_pv) {
1272                         pv_drop(pte_pv);
1273                         m = NULL;
1274                 } else {
1275                         m = NULL;
1276                 }
1277                 pv_drop(pt_pv);
1278                 return(m);
1279         } else {
1280                 return(NULL);
1281         }
1282 }
1283
1284 /*
1285  * Extract the physical page address associated with a kernel virtual address.
1286  */
1287 vm_paddr_t
1288 pmap_kextract(vm_offset_t va)
1289 {
1290         pd_entry_t pt;          /* pt entry in pd */
1291         vm_paddr_t pa;
1292
1293         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1294                 pa = DMAP_TO_PHYS(va);
1295         } else {
1296                 pt = *vtopt(va);
1297                 if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) {
1298                         pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
1299                 } else {
1300                         /*
1301                          * Beware of a concurrent promotion that changes the
1302                          * PDE at this point!  For example, vtopte() must not
1303                          * be used to access the PTE because it would use the
1304                          * new PDE.  It is, however, safe to use the old PDE
1305                          * because the page table page is preserved by the
1306                          * promotion.
1307                          */
1308                         pa = *pmap_pt_to_pte(pt, va);
1309                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
1310                 }
1311         }
1312         return pa;
1313 }
1314
1315 /***************************************************
1316  * Low level mapping routines.....
1317  ***************************************************/
1318
1319 /*
1320  * Routine: pmap_kenter
1321  * Function:
1322  *      Add a wired page to the KVA
1323  *      NOTE! note that in order for the mapping to take effect -- you
1324  *      NOTE! In order for the mapping to take effect you should do an
1325  *      invltlb after doing the pmap_kenter().
1326 void 
1327 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1328 {
1329         pt_entry_t *pte;
1330         pt_entry_t npte;
1331         pmap_inval_info info;
1332
1333         pmap_inval_init(&info);                         /* XXX remove */
1334         npte = pa |
1335             kernel_pmap.pmap_bits[PG_RW_IDX] |
1336             kernel_pmap.pmap_bits[PG_V_IDX];
1337 //          pgeflag;
1338         pte = vtopte(va);
1339         pmap_inval_interlock(&info, &kernel_pmap, va);  /* XXX remove */
1340         *pte = npte;
1341         pmap_inval_deinterlock(&info, &kernel_pmap);    /* XXX remove */
1342         pmap_inval_done(&info);                         /* XXX remove */
1343 }
1344
1345 /*
1346  * Routine: pmap_kenter_quick
1347  * Function:
1348  *      Similar to pmap_kenter(), except we only invalidate the
1349  *      mapping on the current CPU.
1350  */
1351 void
1352 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
1353 {
1354         pt_entry_t *pte;
1355         pt_entry_t npte;
1356
1357         npte = pa |
1358             kernel_pmap.pmap_bits[PG_RW_IDX] |
1359             kernel_pmap.pmap_bits[PG_V_IDX];
1360 //          pgeflag;
1361         pte = vtopte(va);
1362         *pte = npte;
1363         cpu_invlpg((void *)va);
1364 }
1365
1366 void
1367 pmap_kenter_sync(vm_offset_t va)
1368 {
1369         pmap_inval_info info;
1370
1371         pmap_inval_init(&info);
1372         pmap_inval_interlock(&info, &kernel_pmap, va);
1373         pmap_inval_deinterlock(&info, &kernel_pmap);
1374         pmap_inval_done(&info);
1375 }
1376
1377 void
1378 pmap_kenter_sync_quick(vm_offset_t va)
1379 {
1380         cpu_invlpg((void *)va);
1381 }
1382
1383 /*
1384  * remove a page from the kernel pagetables
1385  */
1386 void
1387 pmap_kremove(vm_offset_t va)
1388 {
1389         pt_entry_t *pte;
1390         pmap_inval_info info;
1391
1392         pmap_inval_init(&info);
1393         pte = vtopte(va);
1394         pmap_inval_interlock(&info, &kernel_pmap, va);
1395         (void)pte_load_clear(pte);
1396         pmap_inval_deinterlock(&info, &kernel_pmap);
1397         pmap_inval_done(&info);
1398 }
1399
1400 void
1401 pmap_kremove_quick(vm_offset_t va)
1402 {
1403         pt_entry_t *pte;
1404         pte = vtopte(va);
1405         (void)pte_load_clear(pte);
1406         cpu_invlpg((void *)va);
1407 }
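/*
 * Usage sketch for the kenter/kremove pairs above: a temporary kernel
 * mapping of a physical page can be made and torn down with, e.g.
 *
 *      pmap_kenter_quick(va, pa);
 *      ... access the page through 'va' ...
 *      pmap_kremove_quick(va);
 *
 * The *_quick variants only invalidate the local cpu's TLB; use
 * pmap_kenter()/pmap_kremove() when other cpus may have the VA cached.
 */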
1408
1409 /*
1410  * XXX these need to be recoded.  They are not used in any critical path.
1411  */
1412 void
1413 pmap_kmodify_rw(vm_offset_t va)
1414 {
1415         atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]);
1416         cpu_invlpg((void *)va);
1417 }
1418
1419 /* NOT USED
1420 void
1421 pmap_kmodify_nc(vm_offset_t va)
1422 {
1423         atomic_set_long(vtopte(va), PG_N);
1424         cpu_invlpg((void *)va);
1425 }
1426 */
1427
1428 /*
1429  * Used to map a range of physical addresses into kernel virtual
1430  * address space during the low level boot, typically to map the
1431  * dump bitmap, message buffer, and vm_page_array.
1432  *
1433  * These mappings are typically made at some point after the end of the
1434  * kernel text+data.
1435  *
1436  * We could return PHYS_TO_DMAP(start) here and not allocate any
1437  * via (*virtp), but then kmem from userland and kernel dumps won't
1438  * have access to the related pointers.
1439  */
1440 vm_offset_t
1441 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
1442 {
1443         vm_offset_t va;
1444         vm_offset_t va_start;
1445
1446         /*return PHYS_TO_DMAP(start);*/
1447
1448         va_start = *virtp;
1449         va = va_start;
1450
1451         while (start < end) {
1452                 pmap_kenter_quick(va, start);
1453                 va += PAGE_SIZE;
1454                 start += PAGE_SIZE;
1455         }
1456         *virtp = va;
1457         return va_start;
1458 }
1459
1460 #define PMAP_CLFLUSH_THRESHOLD  (2 * 1024 * 1024)
1461
1462 /*
1463  * Remove the specified set of pages from the data and instruction caches.
1464  *
1465  * In contrast to pmap_invalidate_cache_range(), this function does not
1466  * rely on the CPU's self-snoop feature, because it is intended for use
1467  * when moving pages into a different cache domain.
1468  */
1469 void
1470 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1471 {
1472         vm_offset_t daddr, eva;
1473         int i;
1474
1475         if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1476             (cpu_feature & CPUID_CLFSH) == 0)
1477                 wbinvd();
1478         else {
1479                 cpu_mfence();
1480                 for (i = 0; i < count; i++) {
1481                         daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
1482                         eva = daddr + PAGE_SIZE;
1483                         for (; daddr < eva; daddr += cpu_clflush_line_size)
1484                                 clflush(daddr);
1485                 }
1486                 cpu_mfence();
1487         }
1488 }
1489
1490 void
1491 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1492 {
1493         KASSERT((sva & PAGE_MASK) == 0,
1494             ("pmap_invalidate_cache_range: sva not page-aligned"));
1495         KASSERT((eva & PAGE_MASK) == 0,
1496             ("pmap_invalidate_cache_range: eva not page-aligned"));
1497
1498         if (cpu_feature & CPUID_SS) {
1499                 ; /* If "Self Snoop" is supported, do nothing. */
1500         } else {
1501                 /* Globally invalidate caches */
1502                 cpu_wbinvd_on_all_cpus();
1503         }
1504 }
1505 void
1506 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1507 {
1508         smp_invlpg_range(pmap->pm_active, sva, eva);
1509 }
1510
1511 /*
1512  * Add a list of wired pages to the kva.
1513  * This routine is only used for temporary
1514  * kernel mappings that do not need to have
1515  * page modification or references recorded.
1516  * Note that old mappings are simply written
1517  * over.  The page *must* be wired.
1518  */
1519 void
1520 pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
1521 {
1522         vm_offset_t end_va;
1523
1524         end_va = va + count * PAGE_SIZE;
1525                 
1526         while (va < end_va) {
1527                 pt_entry_t *pte;
1528
1529                 pte = vtopte(va);
1530                 *pte = VM_PAGE_TO_PHYS(*m) |
1531                     kernel_pmap.pmap_bits[PG_RW_IDX] |
1532                     kernel_pmap.pmap_bits[PG_V_IDX] |
1533                     kernel_pmap.pmap_cache_bits[(*m)->pat_mode];
1534 //              pgeflag;
1535                 cpu_invlpg((void *)va);
1536                 va += PAGE_SIZE;
1537                 m++;
1538         }
1539         smp_invltlb();
1540 }
1541
1542 /*
1543  * This routine jerks page mappings from the
1544  * kernel -- it is meant only for temporary mappings.
1545  *
1546  * MPSAFE, INTERRUPT SAFE (cluster callback)
1547  */
1548 void
1549 pmap_qremove(vm_offset_t va, int count)
1550 {
1551         vm_offset_t end_va;
1552
1553         end_va = va + count * PAGE_SIZE;
1554
1555         while (va < end_va) {
1556                 pt_entry_t *pte;
1557
1558                 pte = vtopte(va);
1559                 (void)pte_load_clear(pte);
1560                 cpu_invlpg((void *)va);
1561                 va += PAGE_SIZE;
1562         }
1563         smp_invltlb();
1564 }
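/*
 * Usage sketch: pmap_qenter()/pmap_qremove() are the array forms of the
 * above, e.g. to temporarily window 'count' wired pages starting at 'va':
 *
 *      pmap_qenter(va, mp, count);     (mp is a vm_page_t array)
 *      ... use the mapping ...
 *      pmap_qremove(va, count);
 *
 * Both end with a full smp_invltlb(), so they are heavier-weight than the
 * single-page *_quick routines.
 */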
1565
1566 /*
1567  * Create a new thread and optionally associate it with a (new) process.
1568  * NOTE! the new thread's cpu may not equal the current cpu.
1569  */
1570 void
1571 pmap_init_thread(thread_t td)
1572 {
1573         /* enforce pcb placement & alignment */
1574         td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
1575         td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF);
1576         td->td_savefpu = &td->td_pcb->pcb_save;
1577         td->td_sp = (char *)td->td_pcb; /* no -16 */
1578 }
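/*
 * Layout sketch (inferred from the code above): the pcb is carved out of
 * the top of the kernel stack and aligned down to 16 bytes; td_sp starts
 * out pointing at it, so the usable stack grows down from the pcb:
 *
 *	td_kstack ......... td_sp == td_pcb [struct pcb] td_kstack + td_kstack_size
 */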
1579
1580 /*
1581  * This routine directly affects the fork perf for a process.
1582  */
1583 void
1584 pmap_init_proc(struct proc *p)
1585 {
1586 }
1587
1588 static void
1589 pmap_pinit_defaults(struct pmap *pmap)
1590 {
1591         bcopy(pmap_bits_default, pmap->pmap_bits,
1592               sizeof(pmap_bits_default));
1593         bcopy(protection_codes, pmap->protection_codes,
1594               sizeof(protection_codes));
1595         bcopy(pat_pte_index, pmap->pmap_cache_bits,
1596               sizeof(pat_pte_index));
1597         pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT;
1598         pmap->copyinstr = std_copyinstr;
1599         pmap->copyin = std_copyin;
1600         pmap->copyout = std_copyout;
1601         pmap->fubyte = std_fubyte;
1602         pmap->subyte = std_subyte;
1603         pmap->fuword = std_fuword;
1604         pmap->suword = std_suword;
1605         pmap->suword32 = std_suword32;
1606 }
1607 /*
1608  * Initialize pmap0/vmspace0.  This pmap is not added to pmap_list because
1609  * it, and IdlePTD, represent the template used to update all other pmaps.
1610  *
1611  * On architectures where the kernel pmap is not integrated into the user
1612  * process pmap, this pmap represents the process pmap, not the kernel pmap.
1613  * kernel_pmap should be used to access the kernel pmap directly.
1614  */
1615 void
1616 pmap_pinit0(struct pmap *pmap)
1617 {
1618         pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
1619         pmap->pm_count = 1;
1620         pmap->pm_active = 0;
1621         pmap->pm_pvhint = NULL;
1622         RB_INIT(&pmap->pm_pvroot);
1623         spin_init(&pmap->pm_spin);
1624         lwkt_token_init(&pmap->pm_token, "pmap_tok");
1625         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1626         pmap_pinit_defaults(pmap);
1627 }
1628
1629 /*
1630  * Initialize a preallocated and zeroed pmap structure,
1631  * such as one in a vmspace structure.
1632  */
1633 static void
1634 pmap_pinit_simple(struct pmap *pmap)
1635 {
1636         /*
1637          * Misc initialization
1638          */
1639         pmap->pm_count = 1;
1640         pmap->pm_active = 0;
1641         pmap->pm_pvhint = NULL;
1642         pmap->pm_flags = PMAP_FLAG_SIMPLE;
1643
1644         pmap_pinit_defaults(pmap);
1645
1646         /*
1647          * Don't blow up locks/tokens on re-use (XXX fix/use drop code
1648          * for this).
1649          */
1650         if (pmap->pm_pmlpv == NULL) {
1651                 RB_INIT(&pmap->pm_pvroot);
1652                 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1653                 spin_init(&pmap->pm_spin);
1654                 lwkt_token_init(&pmap->pm_token, "pmap_tok");
1655         }
1656 }
1657
1658 void
1659 pmap_pinit(struct pmap *pmap)
1660 {
1661         pv_entry_t pv;
1662         int j;
1663
1664         if (pmap->pm_pmlpv) {
1665                 if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) {
1666                         pmap_puninit(pmap);
1667                 }
1668         }
1669
1670         pmap_pinit_simple(pmap);
1671         pmap->pm_flags &= ~PMAP_FLAG_SIMPLE;
1672
1673         /*
1674          * No need to allocate page table space yet but we do need a valid
1675          * page directory table.
1676          */
1677         if (pmap->pm_pml4 == NULL) {
1678                 pmap->pm_pml4 =
1679                     (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
1680         }
1681
1682         /*
1683          * Allocate the page directory page, which wires it even though
1684          * it isn't being entered into some higher level page table (it
1685          * being the highest level).  If one is already cached we don't
1686          * have to do anything.
1687          */
1688         if ((pv = pmap->pm_pmlpv) == NULL) {
1689                 pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
1690                 pmap->pm_pmlpv = pv;
1691                 pmap_kenter((vm_offset_t)pmap->pm_pml4,
1692                             VM_PAGE_TO_PHYS(pv->pv_m));
1693                 pv_put(pv);
1694
1695                 /*
1696                  * Install DMAP and KMAP.
1697                  */
1698                 for (j = 0; j < NDMPML4E; ++j) {
1699                         pmap->pm_pml4[DMPML4I + j] =
1700                             (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
1701                             pmap->pmap_bits[PG_RW_IDX] |
1702                             pmap->pmap_bits[PG_V_IDX] |
1703                             pmap->pmap_bits[PG_U_IDX];
1704                 }
1705                 pmap->pm_pml4[KPML4I] = KPDPphys |
1706                     pmap->pmap_bits[PG_RW_IDX] |
1707                     pmap->pmap_bits[PG_V_IDX] |
1708                     pmap->pmap_bits[PG_U_IDX];
1709
1710                 /*
1711                  * install self-referential address mapping entry
1712                  */
1713                 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
1714                     pmap->pmap_bits[PG_V_IDX] |
1715                     pmap->pmap_bits[PG_RW_IDX] |
1716                     pmap->pmap_bits[PG_A_IDX] |
1717                     pmap->pmap_bits[PG_M_IDX];
1718         } else {
1719                 KKASSERT(pv->pv_m->flags & PG_MAPPED);
1720                 KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
1721         }
1722         KKASSERT(pmap->pm_pml4[255] == 0);
1723         KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv);
1724         KKASSERT(pv->pv_entry.rbe_left == NULL);
1725         KKASSERT(pv->pv_entry.rbe_right == NULL);
1726 }
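/*
 * PML4 slots populated above (descriptive summary): DMPML4I through
 * DMPML4I + NDMPML4E - 1 map the DMAP via DMPDPphys, KPML4I maps the
 * kernel via KPDPphys, and PML4PML4I points back at the pml4 page
 * itself, providing the recursive self-mapping.
 */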
1727
1728 /*
1729  * Clean up a pmap structure so it can be physically freed.  This routine
1730  * is called by the vmspace dtor function.  A great deal of pmap data is
1731  * left passively mapped to improve vmspace management so we have a bit
1732  * of cleanup work to do here.
1733  */
1734 void
1735 pmap_puninit(pmap_t pmap)
1736 {
1737         pv_entry_t pv;
1738         vm_page_t p;
1739
1740         KKASSERT(pmap->pm_active == 0);
1741         if ((pv = pmap->pm_pmlpv) != NULL) {
1742                 if (pv_hold_try(pv) == 0)
1743                         pv_lock(pv);
1744                 KKASSERT(pv == pmap->pm_pmlpv);
1745                 p = pmap_remove_pv_page(pv);
1746                 pv_free(pv);
1747                 pmap_kremove((vm_offset_t)pmap->pm_pml4);
1748                 vm_page_busy_wait(p, FALSE, "pgpun");
1749                 KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED));
1750                 vm_page_unwire(p, 0);
1751                 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
1752
1753                 /*
1754                  * XXX eventually clean out PML4 static entries and
1755                  * use vm_page_free_zero()
1756                  */
1757                 vm_page_free(p);
1758                 pmap->pm_pmlpv = NULL;
1759         }
1760         if (pmap->pm_pml4) {
1761                 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
1762                 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
1763                 pmap->pm_pml4 = NULL;
1764         }
1765         KKASSERT(pmap->pm_stats.resident_count == 0);
1766         KKASSERT(pmap->pm_stats.wired_count == 0);
1767 }
1768
1769 /*
1770  * Wire in kernel global address entries.  To avoid a race condition
1771  * between pmap initialization and pmap_growkernel, this procedure
1772  * adds the pmap to the master list (which growkernel scans to update),
1773  * then copies the template.
1774  */
1775 void
1776 pmap_pinit2(struct pmap *pmap)
1777 {
1778         spin_lock(&pmap_spin);
1779         TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
1780         spin_unlock(&pmap_spin);
1781 }
1782
1783 /*
1784  * This routine is called when various levels in the page table need to
1785  * be populated.  This routine cannot fail.
1786  *
1787  * This function returns two locked pv_entry's, one representing the
1788  * requested pv and one representing the requested pv's parent pv.  If
1789  * the pv did not previously exist it will be mapped into its parent
1790  * and wired, otherwise no additional wire count will be added.
1791  */
1792 static
1793 pv_entry_t
1794 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
1795 {
1796         pt_entry_t *ptep;
1797         pv_entry_t pv;
1798         pv_entry_t pvp;
1799         vm_pindex_t pt_pindex;
1800         vm_page_t m;
1801         int isnew;
1802         int ispt;
1803
1804         /*
1805          * If the pv already exists and we aren't being asked for the
1806          * parent page table page we can just return it.  A locked+held pv
1807          * is returned.  The pv will also have a second hold related to the
1808          * pmap association that we don't have to worry about.
1809          */
1810         ispt = 0;
1811         pv = pv_alloc(pmap, ptepindex, &isnew);
1812         if (isnew == 0 && pvpp == NULL)
1813                 return(pv);
1814
1815         /*
1816          * Special case terminal PVs.  These are not page table pages so
1817          * no vm_page is allocated (the caller supplied the vm_page).  If
1818  * pvpp is non-NULL we are also being asked to return the pt_pv
1819          * for this pv.
1820          *
1821          * Note that pt_pv's are only returned for user VAs. We assert that
1822          * a pt_pv is not being requested for kernel VAs.
1823          */
1824         if (ptepindex < pmap_pt_pindex(0)) {
1825                 if (ptepindex >= NUPTE_USER)
1826                         KKASSERT(pvpp == NULL);
1827                 else
1828                         KKASSERT(pvpp != NULL);
1829                 if (pvpp) {
1830                         pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
1831                         pvp = pmap_allocpte(pmap, pt_pindex, NULL);
1832                         if (isnew)
1833                                 vm_page_wire_quick(pvp->pv_m);
1834                         *pvpp = pvp;
1835                 } else {
1836                         pvp = NULL;
1837                 }
1838                 return(pv);
1839         }
1840
1841         /*
1842          * Non-terminal PVs allocate a VM page to represent the page table,
1843          * so we have to resolve pvp and calculate ptepindex for the pvp
1844          * and then for the page table entry index in the pvp for
1845          * fall-through.
1846          */
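	/*
	 * Pindex namespace sketch (informal, inferred from the arithmetic
	 * in this function): terminal PTE pv's occupy [0, NUPTE_TOTAL),
	 * the PT page pv's the next NUPT_TOTAL indices, the PD page pv's
	 * the next NUPD_TOTAL, then the PDP page pv's, and finally
	 * pmap_pml4_pindex() for the single PML4 pv.  For example the PT
	 * pv covering terminal pindex N lives at
	 * NUPTE_TOTAL + (N >> NPTEPGSHIFT).
	 */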
1847         if (ptepindex < pmap_pd_pindex(0)) {
1848                 /*
1849                  * pv is PT, pvp is PD
1850                  */
1851                 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
1852                 ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
1853                 pvp = pmap_allocpte(pmap, ptepindex, NULL);
1854                 if (!isnew)
1855                         goto notnew;
1856
1857                 /*
1858                  * PT index in PD
1859                  */
1860                 ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
1861                 ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
1862                 ispt = 1;
1863         } else if (ptepindex < pmap_pdp_pindex(0)) {
1864                 /*
1865                  * pv is PD, pvp is PDP
1866                  *
1867                  * SIMPLE PMAP NOTE: Simple pmaps do not allocate above
1868                  *                   the PD.
1869                  */
1870                 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
1871                 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
1872
1873                 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
1874                         KKASSERT(pvpp == NULL);
1875                         pvp = NULL;
1876                 } else {
1877                         pvp = pmap_allocpte(pmap, ptepindex, NULL);
1878                 }
1879                 if (!isnew)
1880                         goto notnew;
1881
1882                 /*
1883                  * PD index in PDP
1884                  */
1885                 ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
1886                 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
1887         } else if (ptepindex < pmap_pml4_pindex()) {
1888                 /*
1889                  * pv is PDP, pvp is the root pml4 table
1890                  */
1891                 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
1892                 if (!isnew)
1893                         goto notnew;
1894
1895                 /*
1896                  * PDP index in PML4
1897                  */
1898                 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
1899                 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
1900         } else {
1901                 /*
1902                  * pv represents the top-level PML4, there is no parent.
1903                  */
1904                 pvp = NULL;
1905                 if (!isnew)
1906                         goto notnew;
1907         }
1908
1909         /*
1910          * This code is only reached if isnew is TRUE and this is not a
1911          * terminal PV.  We need to allocate a vm_page for the page table
1912          * at this level and enter it into the parent page table.
1913          *
1914          * page table pages are marked PG_WRITEABLE and PG_MAPPED.
1915          */
1916         for (;;) {
1917                 m = vm_page_alloc(NULL, pv->pv_pindex,
1918                                   VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
1919                                   VM_ALLOC_INTERRUPT);
1920                 if (m)
1921                         break;
1922                 vm_wait(0);
1923         }
1924         vm_page_spin_lock(m);
1925         pmap_page_stats_adding(m);
1926         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1927         pv->pv_m = m;
1928         vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
1929         vm_page_spin_unlock(m);
1930         vm_page_unmanage(m);    /* m must be spinunlocked */
1931
1932         if ((m->flags & PG_ZERO) == 0) {
1933                 pmap_zero_page(VM_PAGE_TO_PHYS(m));
1934         }
1935 #ifdef PMAP_DEBUG
1936         else {
1937                 pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
1938         }
1939 #endif
1940         m->valid = VM_PAGE_BITS_ALL;
1941         vm_page_flag_clear(m, PG_ZERO);
1942         vm_page_wire(m);        /* wire for mapping in parent */
1943
1944         /*
1945          * Wire the page into pvp, bump the wire-count for pvp's page table
1946          * page.  Bump the resident_count for the pmap.  There is no pvp
1947          * for the top level, address the pm_pml4[] array directly.
1948          *
1949          * If the caller wants the parent we return it, otherwise
1950          * we just put it away.
1951          *
1952          * No interlock is needed for pte 0 -> non-zero.
1953          *
1954          * In the situation where *ptep is valid we might have an unmanaged
1955          * page table page shared from another page table which we need to
1956          * unshare before installing our private page table page.
1957          */
1958         if (pvp) {
1959                 ptep = pv_pte_lookup(pvp, ptepindex);
1960                 if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
1961                         pt_entry_t pte;
1962                         pmap_inval_info info;
1963
1964                         if (ispt == 0) {
1965                                 panic("pmap_allocpte: unexpected pte %p/%d",
1966                                       pvp, (int)ptepindex);
1967                         }
1968                         pmap_inval_init(&info);
1969                         pmap_inval_interlock(&info, pmap, (vm_offset_t)-1);
1970                         pte = pte_load_clear(ptep);
1971                         pmap_inval_deinterlock(&info, pmap);
1972                         pmap_inval_done(&info);
1973                         if (vm_page_unwire_quick(
1974                                         PHYS_TO_VM_PAGE(pte & PG_FRAME))) {
1975                                 panic("pmap_allocpte: shared pgtable "
1976                                       "pg bad wirecount");
1977                         }
1978                         atomic_add_long(&pmap->pm_stats.resident_count, -1);
1979                 } else {
1980                         vm_page_wire_quick(pvp->pv_m);
1981                 }
1982                 *ptep = VM_PAGE_TO_PHYS(m) |
1983                     (pmap->pmap_bits[PG_U_IDX] |
1984                     pmap->pmap_bits[PG_RW_IDX] |
1985                     pmap->pmap_bits[PG_V_IDX] |
1986                     pmap->pmap_bits[PG_A_IDX] |
1987                     pmap->pmap_bits[PG_M_IDX]);
1988         }
1989         vm_page_wakeup(m);
1990 notnew:
1991         if (pvpp)
1992                 *pvpp = pvp;
1993         else if (pvp)
1994                 pv_put(pvp);
1995         return (pv);
1996 }
1997
1998 /*
1999  * This version of pmap_allocpte() checks for possible segment optimizations
2000  * that would allow page-table sharing.  It can be called for terminal
2001  * page or page table page ptepindex's.
2002  *
2003  * The function is called with page table page ptepindex's for fictitious
2004  * and unmanaged terminal pages.  That is, we don't want to allocate a
2005  * terminal pv, we just want the pt_pv.  pvpp is usually passed as NULL
2006  * for this case.
2007  *
2008  * This function can return a pv and *pvpp associated with the passed in pmap
2009  * OR a pv and *pvpp associated with the shared pmap.  In the latter case
2010  * an unmanaged page table page will be entered into the passed-in pmap.
2011  */
2012 static
2013 pv_entry_t
2014 pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp,
2015                   vm_map_entry_t entry, vm_offset_t va)
2016 {
2017         struct pmap_inval_info info;
2018         vm_object_t object;
2019         pmap_t obpmap;
2020         pmap_t *obpmapp;
2021         vm_offset_t b;
2022         pv_entry_t pte_pv;      /* in original or shared pmap */
2023         pv_entry_t pt_pv;       /* in original or shared pmap */
2024         pv_entry_t proc_pd_pv;  /* in original pmap */
2025         pv_entry_t proc_pt_pv;  /* in original pmap */
2026         pv_entry_t xpv;         /* PT in shared pmap */
2027         pd_entry_t *pt;         /* PT entry in PD of original pmap */
2028         pd_entry_t opte;        /* contents of *pt */
2029         pd_entry_t npte;        /* contents of *pt */
2030         vm_page_t m;
2031
2032 retry:
2033         /*
2034          * Basic tests, require a non-NULL vm_map_entry, require proper
2035          * alignment and type for the vm_map_entry, require that the
2036          * underlying object already be allocated.
2037          *
2038          * We allow almost any type of object to use this optimization.
2039          * The object itself does NOT have to be sized to a multiple of the
2040          * segment size, but the memory mapping does.
2041          *
2042          * XXX don't handle devices currently, because VM_PAGE_TO_PHYS()
2043          *     won't work as expected.
2044          */
2045         if (entry == NULL ||
2046             pmap_mmu_optimize == 0 ||                   /* not enabled */
2047             ptepindex >= pmap_pd_pindex(0) ||           /* not terminal or pt */
2048             entry->inheritance != VM_INHERIT_SHARE ||   /* not shared */
2049             entry->maptype != VM_MAPTYPE_NORMAL ||      /* weird map type */
2050             entry->object.vm_object == NULL ||          /* needs VM object */
2051             entry->object.vm_object->type == OBJT_DEVICE ||     /* ick */
2052             entry->object.vm_object->type == OBJT_MGTDEVICE ||  /* ick */
2053             (entry->offset & SEG_MASK) ||               /* must be aligned */
2054             (entry->start & SEG_MASK)) {
2055                 return(pmap_allocpte(pmap, ptepindex, pvpp));
2056         }
2057
2058         /*
2059          * Make sure the full segment can be represented.
2060          */
2061         b = va & ~(vm_offset_t)SEG_MASK;
2062         if (b < entry->start || b + SEG_SIZE > entry->end)
2063                 return(pmap_allocpte(pmap, ptepindex, pvpp));
2064
2065         /*
2066  * If the full segment can be represented, dive into the VM object's
2067          * shared pmap, allocating as required.
2068          */
2069         object = entry->object.vm_object;
2070
2071         if (entry->protection & VM_PROT_WRITE)
2072                 obpmapp = &object->md.pmap_rw;
2073         else
2074                 obpmapp = &object->md.pmap_ro;
2075
2076 #ifdef PMAP_DEBUG2
2077         if (pmap_enter_debug > 0) {
2078                 --pmap_enter_debug;
2079                 kprintf("pmap_allocpte_seg: va=%jx prot %08x o=%p "
2080                         "obpmapp %p %p\n",
2081                         va, entry->protection, object,
2082                         obpmapp, *obpmapp);
2083                 kprintf("pmap_allocpte_seg: entry %p %jx-%jx\n",
2084                         entry, entry->start, entry->end);
2085         }
2086 #endif
2087
2088         /*
2089          * We allocate what appears to be a normal pmap but because portions
2090          * of this pmap are shared with other unrelated pmaps we have to
2091          * set pm_active to point to all cpus.
2092          *
2093          * XXX Currently using pmap_spin to interlock the update, can't use
2094          *     vm_object_hold/drop because the token might already be held
2095          *     shared OR exclusive and we don't know.
2096          */
2097         while ((obpmap = *obpmapp) == NULL) {
2098                 obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO);
2099                 pmap_pinit_simple(obpmap);
2100                 pmap_pinit2(obpmap);
2101                 spin_lock(&pmap_spin);
2102                 if (*obpmapp != NULL) {
2103                         /*
2104                          * Handle race
2105                          */
2106                         spin_unlock(&pmap_spin);
2107                         pmap_release(obpmap);
2108                         pmap_puninit(obpmap);
2109                         kfree(obpmap, M_OBJPMAP);
2110                         obpmap = *obpmapp; /* safety */
2111                 } else {
2112                         obpmap->pm_active = smp_active_mask;
2113                         *obpmapp = obpmap;
2114                         spin_unlock(&pmap_spin);
2115                 }
2116         }
2117
2118         /*
2119          * Layering is: PTE, PT, PD, PDP, PML4.  We have to return the
2120          * pte/pt using the shared pmap from the object but also adjust
2121          * the process pmap's page table page as a side effect.
2122          */
2123
2124         /*
2125          * Resolve the terminal PTE and PT in the shared pmap.  This is what
2126          * we will return.  This is true if ptepindex represents a terminal
2127          * page, otherwise pte_pv is actually the PT and pt_pv is actually
2128          * the PD.
2129          */
2130         pt_pv = NULL;
2131         pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv);
2132         if (ptepindex >= pmap_pt_pindex(0))
2133                 xpv = pte_pv;
2134         else
2135                 xpv = pt_pv;
2136
2137         /*
2138          * Resolve the PD in the process pmap so we can properly share the
2139          * page table page.  Lock order is bottom-up (leaf first)!
2140          *
2141          * NOTE: proc_pt_pv can be NULL.
2142          */
2143         proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b));
2144         proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL);
2145 #ifdef PMAP_DEBUG2
2146         if (pmap_enter_debug > 0) {
2147                 --pmap_enter_debug;
2148                 kprintf("proc_pt_pv %p (wc %d) pd_pv %p va=%jx\n",
2149                         proc_pt_pv,
2150                         (proc_pt_pv ? proc_pt_pv->pv_m->wire_count : -1),
2151                         proc_pd_pv,
2152                         va);
2153         }
2154 #endif
2155
2156         /*
2157          * xpv is the page table page pv from the shared object
2158          * (for convenience), from above.
2159          *
2160          * Calculate the pte value for the PT to load into the process PD.
2161          * If we have to change it we must properly dispose of the previous
2162          * entry.
2163          */
2164         pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b));
2165         npte = VM_PAGE_TO_PHYS(xpv->pv_m) |
2166             (pmap->pmap_bits[PG_U_IDX] |
2167             pmap->pmap_bits[PG_RW_IDX] |
2168             pmap->pmap_bits[PG_V_IDX] |
2169             pmap->pmap_bits[PG_A_IDX] |
2170             pmap->pmap_bits[PG_M_IDX]);
2171
2172         /*
2173          * Dispose of previous page table page if it was local to the
2174          * process pmap.  If the old pt is not empty we cannot dispose of it
2175          * until we clean it out.  This case should not arise very often so
2176          * it is not optimized.
2177          */
2178         if (proc_pt_pv) {
2179                 if (proc_pt_pv->pv_m->wire_count != 1) {
2180                         pv_put(proc_pd_pv);
2181                         pv_put(proc_pt_pv);
2182                         pv_put(pt_pv);
2183                         pv_put(pte_pv);
2184                         pmap_remove(pmap,
2185                                     va & ~(vm_offset_t)SEG_MASK,
2186                                     (va + SEG_SIZE) & ~(vm_offset_t)SEG_MASK);
2187                         goto retry;
2188                 }
2189
2190                 /*
2191                  * The release call will indirectly clean out *pt
2192                  */
2193                 pmap_inval_init(&info);
2194                 pmap_release_pv(&info, proc_pt_pv, proc_pd_pv);
2195                 pmap_inval_done(&info);
2196                 proc_pt_pv = NULL;
2197                 /* relookup */
2198                 pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b));
2199         }
2200
2201         /*
2202          * Handle remaining cases.
2203          */
2204         if (*pt == 0) {
2205                 *pt = npte;
2206                 vm_page_wire_quick(xpv->pv_m);
2207                 vm_page_wire_quick(proc_pd_pv->pv_m);
2208                 atomic_add_long(&pmap->pm_stats.resident_count, 1);
2209         } else if (*pt != npte) {
2210                 pmap_inval_init(&info);
2211                 pmap_inval_interlock(&info, pmap, (vm_offset_t)-1);
2212
2213                 opte = pte_load_clear(pt);
2214                 KKASSERT(opte && opte != npte);
2215
2216                 *pt = npte;
2217                 vm_page_wire_quick(xpv->pv_m);  /* pgtable pg that is npte */
2218
2219                 /*
2220                  * Clean up opte, bump the wire_count for the process
2221                  * PD page representing the new entry if it was
2222                  * previously empty.
2223                  *
2224                  * If the entry was not previously empty and we have
2225                  * a PT in the proc pmap then opte must match that
2226                  * pt.  The proc pt must be retired (this is done
2227                  * later on in this procedure).
2228                  *
2229                  * NOTE: replacing valid pte, wire_count on proc_pd_pv
2230                  * stays the same.
2231                  */
2232                 KKASSERT(opte & pmap->pmap_bits[PG_V_IDX]);
2233                 m = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2234                 if (vm_page_unwire_quick(m)) {
2235                         panic("pmap_allocpte_seg: "
2236                               "bad wire count %p",
2237                               m);
2238                 }
2239
2240                 pmap_inval_deinterlock(&info, pmap);
2241                 pmap_inval_done(&info);
2242         }
2243
2244         /*
2245          * Done.  Any pre-existing process page table was replaced and
2246          * destroyed above; release the remaining pv references here.
2247          */
2248         if (proc_pd_pv)
2249                 pv_put(proc_pd_pv);
2250         if (pvpp)
2251                 *pvpp = pt_pv;
2252         else
2253                 pv_put(pt_pv);
2254
2255         return (pte_pv);
2256 }
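/*
 * Net-effect sketch (informal, derived from the code above): on success
 * the process PD entry for the SEG_SIZE-aligned range points at a PT
 * page owned by the object's shared pmap, so every process mapping the
 * same object range with compatible protection shares one set of leaf
 * page tables:
 *
 *	process A PD entry --+
 *	                     +--> shared object-pmap PT page --> PTEs
 *	process B PD entry --+
 */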
2257
2258 /*
2259  * Release any resources held by the given physical map.
2260  *
2261  * Called when a pmap initialized by pmap_pinit is being released.  Should
2262  * only be called if the map contains no valid mappings.
2263  *
2264  * Caller must hold pmap->pm_token
2265  */
2266 struct pmap_release_info {
2267         pmap_t  pmap;
2268         int     retry;
2269 };
2270
2271 static int pmap_release_callback(pv_entry_t pv, void *data);
2272
2273 void
2274 pmap_release(struct pmap *pmap)
2275 {
2276         struct pmap_release_info info;
2277
2278         KASSERT(pmap->pm_active == 0,
2279                 ("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
2280
2281         spin_lock(&pmap_spin);
2282         TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
2283         spin_unlock(&pmap_spin);
2284
2285         /*
2286          * Pull pv's off the RB tree in order from low to high and release
2287          * each page.
2288          */
2289         info.pmap = pmap;
2290         do {
2291                 info.retry = 0;
2292                 spin_lock(&pmap->pm_spin);
2293                 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL,
2294                         pmap_release_callback, &info);
2295                 spin_unlock(&pmap->pm_spin);
2296         } while (info.retry);
2297
2298
2299         /*
2300          * One resident page (the pml4 page) should remain.
2301          * No wired pages should remain.
2302          */
2303         KKASSERT(pmap->pm_stats.resident_count ==
2304                  ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1));
2305
2306         KKASSERT(pmap->pm_stats.wired_count == 0);
2307 }
2308
2309 static int
2310 pmap_release_callback(pv_entry_t pv, void *data)
2311 {
2312         struct pmap_release_info *info = data;
2313         pmap_t pmap = info->pmap;
2314         int r;
2315
2316         if (pv_hold_try(pv)) {
2317                 spin_unlock(&pmap->pm_spin);
2318         } else {
2319                 spin_unlock(&pmap->pm_spin);
2320                 pv_lock(pv);
2321         }
2322         if (pv->pv_pmap != pmap) {
2323                 pv_put(pv);
2324                 spin_lock(&pmap->pm_spin);
2325                 info->retry = 1;
2326                 return(-1);
2327         }
2328         r = pmap_release_pv(NULL, pv, NULL);
2329         spin_lock(&pmap->pm_spin);
2330         return(r);
2331 }
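/*
 * Retry protocol note (descriptive): the callback may have to drop the
 * pmap spinlock to lock the pv; if the pv was meanwhile reassociated
 * (pv->pv_pmap != pmap) it sets info->retry and aborts the RB_SCAN with
 * -1, causing pmap_release() to restart the scan from the top.
 */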
2332
2333 /*
2334  * Called with held (i.e. also locked) pv.  This function will dispose of
2335  * the lock along with the pv.
2336  *
2337  * If the caller already holds the locked parent page table for pv it
2338  * must pass it as pvp, allowing us to avoid a deadlock, else it can
2339  * pass NULL for pvp.
2340  */
2341 static int
2342 pmap_release_pv(struct pmap_inval_info *info, pv_entry_t pv, pv_entry_t pvp)
2343 {
2344         vm_page_t p;
2345
2346         /*
2347          * The pmap is currently not spinlocked, pv is held+locked.
2348          * Remove the pv's page from its parent's page table.  The
2349          * parent's page table page's wire_count will be decremented.
2350          *
2351          * This will clean out the pte at any level of the page table.
2352          * If info is not NULL the appropriate invlpg/invltlb/smp
2353          * invalidation will be made.
2354          */
2355         pmap_remove_pv_pte(pv, pvp, info);
2356
2357         /*
2358          * Terminal pvs are unhooked from their vm_pages.  Because
2359          * terminal pages aren't page table pages they aren't wired
2360          * by us, so we have to be sure not to unwire them either.
2361          */
2362         if (pv->pv_pindex < pmap_pt_pindex(0)) {
2363                 pmap_remove_pv_page(pv);
2364                 goto skip;
2365         }
2366
2367         /*
2368          * We leave the top-level page table page cached, wired, and
2369          * mapped in the pmap until the dtor function (pmap_puninit())
2370          * gets called.
2371          *
2372          * Since we are leaving the top-level pv intact we need
2373          * to break out of what would otherwise be an infinite loop.
2374          */
2375         if (pv->pv_pindex == pmap_pml4_pindex()) {
2376                 pv_put(pv);
2377                 return(-1);
2378         }
2379
2380         /*
2381          * For page table pages (other than the top-level page),
2382  * remove and free the vm_page.  The representative mapping
2383          * removed above by pmap_remove_pv_pte() did not undo the
2384          * last wire_count so we have to do that as well.
2385          */
2386         p = pmap_remove_pv_page(pv);
2387         vm_page_busy_wait(p, FALSE, "pmaprl");
2388         if (p->wire_count != 1) {
2389                 kprintf("p->wire_count was %016lx %d\n",
2390                         pv->pv_pindex, p->wire_count);
2391         }
2392         KKASSERT(p->wire_count == 1);
2393         KKASSERT(p->flags & PG_UNMANAGED);
2394
2395         vm_page_unwire(p, 0);
2396         KKASSERT(p->wire_count == 0);
2397
2398         /*
2399          * Theoretically this page, if not the pml4 page, should contain
2400  * all-zeros.  But it's just too dangerous to mark it PG_ZERO.  Free
2401          * normally.
2402          */
2403         vm_page_free(p);
2404 skip:
2405         pv_free(pv);
2406         return 0;
2407 }
2408
2409 /*
2410  * This function will remove the pte associated with a pv from its parent.
2411  * Terminal pv's are supported.  The removal will be interlocked if info
2412  * is non-NULL.  The caller must dispose of pv instead of just unlocking
2413  * it.
2414  *
2415  * The wire count will be dropped on the parent page table.  The wire
2416  * count on the page being removed (pv->pv_m) from the parent page table
2417  * is NOT touched.  Note that terminal pages will not have any additional
2418  * wire counts while page table pages will have at least one representing
2419  * the mapping, plus others representing sub-mappings.
2420  *
2421  * NOTE: Cannot be called on kernel page table pages; only on KVM
2422  *       terminal pages and on user page table and terminal pages.
2423  *
2424  * The pv must be locked.
2425  *
2426  * XXX must lock parent pv's if they exist to remove pte XXX
2427  */
2428 static
2429 void
2430 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
2431 {
2432         vm_pindex_t ptepindex = pv->pv_pindex;
2433         pmap_t pmap = pv->pv_pmap;
2434         vm_page_t p;
2435         int gotpvp = 0;
2436
2437         KKASSERT(pmap);
2438
2439         if (ptepindex == pmap_pml4_pindex()) {
2440                 /*
2441                  * We are the top level pml4 table, there is no parent.
2442                  */
2443                 p = pmap->pm_pmlpv->pv_m;
2444         } else if (ptepindex >= pmap_pdp_pindex(0)) {
2445                 /*
2446                  * Remove a PDP page from the pml4e.  This can only occur
2447                  * with user page tables.  We do not have to lock the
2448                  * pml4 PV so just ignore pvp.
2449                  */
2450                 vm_pindex_t pml4_pindex;
2451                 vm_pindex_t pdp_index;
2452                 pml4_entry_t *pdp;
2453
2454                 pdp_index = ptepindex - pmap_pdp_pindex(0);
2455                 if (pvp == NULL) {
2456                         pml4_pindex = pmap_pml4_pindex();
2457                         pvp = pv_get(pv->pv_pmap, pml4_pindex);
2458                         KKASSERT(pvp);
2459                         gotpvp = 1;
2460                 }
2461                 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
2462                 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0);
2463                 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2464                 if (info) {
2465                         pmap_inval_interlock(info, pmap, (vm_offset_t)-1);
2466                         pte_load_clear(pdp);
2467                         pmap_inval_deinterlock(info, pmap);
2468                 } else {
2469                         *pdp = 0;
2470                 }
2471         } else if (ptepindex >= pmap_pd_pindex(0)) {
2472                 /*
2473                  * Remove a PD page from the pdp
2474                  *
2475          * SIMPLE PMAP NOTE: Non-existent pvp's are ok in the case
2476                  *                   of a simple pmap because it stops at
2477                  *                   the PD page.
2478                  */
2479                 vm_pindex_t pdp_pindex;
2480                 vm_pindex_t pd_index;
2481                 pdp_entry_t *pd;
2482
2483                 pd_index = ptepindex - pmap_pd_pindex(0);
2484
2485                 if (pvp == NULL) {
2486                         pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
2487                                      (pd_index >> NPML4EPGSHIFT);
2488                         pvp = pv_get(pv->pv_pmap, pdp_pindex);
2489                         if (pvp)
2490                                 gotpvp = 1;
2491                 }
2492                 if (pvp) {
2493                         pd = pv_pte_lookup(pvp, pd_index &
2494                                                 ((1ul << NPDPEPGSHIFT) - 1));
2495                         KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0);
2496                         p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
2497                         if (info) {
2498                                 pmap_inval_interlock(info, pmap,
2499                                                      (vm_offset_t)-1);
2500                                 pte_load_clear(pd);
2501                                 pmap_inval_deinterlock(info, pmap);
2502                         } else {
2503                                 *pd = 0;
2504                         }
2505                 } else {
2506                         KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE);
2507                         p = pv->pv_m;           /* degenerate test later */
2508                 }
2509         } else if (ptepindex >= pmap_pt_pindex(0)) {
2510                 /*
2511                  *  Remove a PT page from the pd
2512                  */
2513                 vm_pindex_t pd_pindex;
2514                 vm_pindex_t pt_index;
2515                 pd_entry_t *pt;
2516
2517                 pt_index = ptepindex - pmap_pt_pindex(0);
2518
2519                 if (pvp == NULL) {
2520                         pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
2521                                     (pt_index >> NPDPEPGSHIFT);
2522                         pvp = pv_get(pv->pv_pmap, pd_pindex);
2523                         KKASSERT(pvp);
2524                         gotpvp = 1;
2525                 }
2526                 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
2527                 KKASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0);
2528                 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
2529                 if (info) {
2530                         pmap_inval_interlock(info, pmap, (vm_offset_t)-1);
2531                         pte_load_clear(pt);
2532                         pmap_inval_deinterlock(info, pmap);
2533                 } else {
2534                         *pt = 0;
2535                 }
2536         } else {
2537                 /*
2538                  * Remove a PTE from the PT page
2539                  *
2540                  * NOTE: pv's must be locked bottom-up to avoid deadlocking.
2541                  *       pv is a pte_pv so we can safely lock pt_pv.
2542                  *
2543                  * NOTE: FICTITIOUS pages may have multiple physical mappings
2544                  *       so PHYS_TO_VM_PAGE() will not necessarily work for
2545                  *       terminal ptes.
2546                  */
2547                 vm_pindex_t pt_pindex;
2548                 pt_entry_t *ptep;
2549                 pt_entry_t pte;
2550                 vm_offset_t va;
2551
2552                 pt_pindex = ptepindex >> NPTEPGSHIFT;
2553                 va = (vm_offset_t)ptepindex << PAGE_SHIFT;
2554
2555                 if (ptepindex >= NUPTE_USER) {
2556                         ptep = vtopte(ptepindex << PAGE_SHIFT);
2557                         KKASSERT(pvp == NULL);
2558                 } else {
2559                         if (pvp == NULL) {
2560                                 pt_pindex = NUPTE_TOTAL +
2561                                             (ptepindex >> NPDPEPGSHIFT);
2562                                 pvp = pv_get(pv->pv_pmap, pt_pindex);
2563                                 KKASSERT(pvp);
2564                                 gotpvp = 1;
2565                         }
2566                         ptep = pv_pte_lookup(pvp, ptepindex &
2567                                                   ((1ul << NPDPEPGSHIFT) - 1));
2568                 }
2569
2570                 if (info)
2571                         pmap_inval_interlock(info, pmap, va);
2572                 pte = pte_load_clear(ptep);
2573                 if (info)
2574                         pmap_inval_deinterlock(info, pmap);
2575                 else
2576                         cpu_invlpg((void *)va);
2577
2578                 /*
2579                  * Now update the vm_page_t
2580                  */
2581                 if ((pte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) !=
2582                     (pmap->pmap_bits[PG_MANAGED_IDX]|pmap->pmap_bits[PG_V_IDX])) {
2583                         kprintf("remove_pte badpte %016lx %016lx %d\n",
2584                                 pte, pv->pv_pindex,
2585                                 pv->pv_pindex < pmap_pt_pindex(0));
2586                 }
2587                 /* PHYS_TO_VM_PAGE() will not work for FICTITIOUS pages */
2588                 /*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/
2589                 if (pte & pmap->pmap_bits[PG_DEVICE_IDX])
2590                         p = pv->pv_m;
2591                 else
2592                         p = PHYS_TO_VM_PAGE(pte & PG_FRAME);
2593                 /* p = pv->pv_m; */
2594
2595                 if (pte & pmap->pmap_bits[PG_M_IDX]) {
2596                         if (pmap_track_modified(ptepindex))
2597                                 vm_page_dirty(p);
2598                 }
2599                 if (pte & pmap->pmap_bits[PG_A_IDX]) {
2600                         vm_page_flag_set(p, PG_REFERENCED);
2601                 }
2602                 if (pte & pmap->pmap_bits[PG_W_IDX])
2603                         atomic_add_long(&pmap->pm_stats.wired_count, -1);
2604                 if (pte & pmap->pmap_bits[PG_G_IDX])
2605                         cpu_invlpg((void *)va);
2606         }
2607
2608         /*
2609          * Unwire the parent page table page.  The wire_count cannot go below
2610          * 1 here because the parent page table page is itself still mapped.
2611          *
2612          * XXX remove the assertions later.
2613          */
2614         KKASSERT(pv->pv_m == p);
2615         if (pvp && vm_page_unwire_quick(pvp->pv_m))
2616                 panic("pmap_remove_pv_pte: Insufficient wire_count");
2617
2618         if (gotpvp)
2619                 pv_put(pvp);
2620 }
2621
2622 /*
2623  * Remove the vm_page association to a pv.  The pv must be locked.
2624  */
2625 static
2626 vm_page_t
2627 pmap_remove_pv_page(pv_entry_t pv)
2628 {
2629         vm_page_t m;
2630
2631         m = pv->pv_m;
2632         KKASSERT(m);
2633         vm_page_spin_lock(m);
2634         pv->pv_m = NULL;
2635         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2636         pmap_page_stats_deleting(m);
2637         /*
2638         if (m->object)
2639                 atomic_add_int(&m->object->agg_pv_list_count, -1);
2640         */
2641         if (TAILQ_EMPTY(&m->md.pv_list))
2642                 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2643         vm_page_spin_unlock(m);
2644         return(m);
2645 }
2646
2647 /*
2648  * Grow the number of kernel page table entries, if needed.
2649  *
2650  * This routine is always called to validate any address space
2651  * beyond KERNBASE (for kldloads).  kernel_vm_end only governs the address
2652  * space below KERNBASE.
2653  */
2654 void
2655 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
2656 {
2657         vm_paddr_t paddr;
2658         vm_offset_t ptppaddr;
2659         vm_page_t nkpg;
2660         pd_entry_t *pt, newpt;
2661         pdp_entry_t newpd;
2662         int update_kernel_vm_end;
2663
2664         /*
2665          * bootstrap kernel_vm_end on first real VM use
2666          */
2667         if (kernel_vm_end == 0) {
2668                 kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
2669                 nkpt = 0;
2670                 while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) {
2671                         kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
2672                                         ~(PAGE_SIZE * NPTEPG - 1);
2673                         nkpt++;
2674                         if (kernel_vm_end - 1 >= kernel_map.max_offset) {
2675                                 kernel_vm_end = kernel_map.max_offset;
2676                                 break;                       
2677                         }
2678                 }
2679         }
2680
2681         /*
2682          * Fill in the gaps.  kernel_vm_end is only adjusted for ranges
2683          * below KERNBASE.  Ranges above KERNBASE are kldloaded and we
2684          * do not want to force-fill 128G worth of page tables.
2685          */
2686         if (kstart < KERNBASE) {
2687                 if (kstart > kernel_vm_end)
2688                         kstart = kernel_vm_end;
2689                 KKASSERT(kend <= KERNBASE);
2690                 update_kernel_vm_end = 1;
2691         } else {
2692                 update_kernel_vm_end = 0;
2693         }
2694
2695         kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG);
2696         kend = roundup2(kend, PAGE_SIZE * NPTEPG);
2697
2698         if (kend - 1 >= kernel_map.max_offset)
2699                 kend = kernel_map.max_offset;
2700
2701         while (kstart < kend) {
2702                 pt = pmap_pt(&kernel_pmap, kstart);
2703                 if (pt == NULL) {
2704                         /* We need a new PDP entry */
2705                         nkpg = vm_page_alloc(NULL, nkpt,
2706                                              VM_ALLOC_NORMAL |
2707                                              VM_ALLOC_SYSTEM |
2708                                              VM_ALLOC_INTERRUPT);
2709                         if (nkpg == NULL) {
2710                                 panic("pmap_growkernel: no memory to grow "
2711                                       "kernel");
2712                         }
2713                         paddr = VM_PAGE_TO_PHYS(nkpg);
2714                         if ((nkpg->flags & PG_ZERO) == 0)
2715                                 pmap_zero_page(paddr);
2716                         vm_page_flag_clear(nkpg, PG_ZERO);
2717                         newpd = (pdp_entry_t)
2718                             (paddr |
2719                             kernel_pmap.pmap_bits[PG_V_IDX] |
2720                             kernel_pmap.pmap_bits[PG_RW_IDX] |
2721                             kernel_pmap.pmap_bits[PG_A_IDX] |
2722                             kernel_pmap.pmap_bits[PG_M_IDX]);
2723                         *pmap_pd(&kernel_pmap, kstart) = newpd;
2724                         nkpt++;
2725                         continue; /* try again */
2726                 }
2727                 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) {
2728                         kstart = (kstart + PAGE_SIZE * NPTEPG) &
2729                                  ~(PAGE_SIZE * NPTEPG - 1);
2730                         if (kstart - 1 >= kernel_map.max_offset) {
2731                                 kstart = kernel_map.max_offset;
2732                                 break;                       
2733                         }
2734                         continue;
2735                 }
2736
2737                 /*
2738                  * This index is bogus, but out of the way
2739                  */
2740                 nkpg = vm_page_alloc(NULL, nkpt,
2741                                      VM_ALLOC_NORMAL |
2742                                      VM_ALLOC_SYSTEM |
2743                                      VM_ALLOC_INTERRUPT);
2744                 if (nkpg == NULL)
2745                         panic("pmap_growkernel: no memory to grow kernel");
2746
2747                 vm_page_wire(nkpg);
2748                 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2749                 pmap_zero_page(ptppaddr);
2750                 vm_page_flag_clear(nkpg, PG_ZERO);
2751                 newpt = (pd_entry_t) (ptppaddr |
2752                     kernel_pmap.pmap_bits[PG_V_IDX] |
2753                     kernel_pmap.pmap_bits[PG_RW_IDX] |
2754                     kernel_pmap.pmap_bits[PG_A_IDX] |
2755                     kernel_pmap.pmap_bits[PG_M_IDX]);
2756                 *pmap_pt(&kernel_pmap, kstart) = newpt;
2757                 nkpt++;
2758
2759                 kstart = (kstart + PAGE_SIZE * NPTEPG) &
2760                           ~(PAGE_SIZE * NPTEPG - 1);
2761
2762                 if (kstart - 1 >= kernel_map.max_offset) {
2763                         kstart = kernel_map.max_offset;
2764                         break;                       
2765                 }
2766         }
2767
2768         /*
2769          * Only update kernel_vm_end for areas below KERNBASE.
2770          */
2771         if (update_kernel_vm_end && kernel_vm_end < kstart)
2772                 kernel_vm_end = kstart;
2773 }
2774
2775 /*
2776  *      Add a reference to the specified pmap.
2777  */
2778 void
2779 pmap_reference(pmap_t pmap)
2780 {
2781         if (pmap != NULL) {
2782                 lwkt_gettoken(&pmap->pm_token);
2783                 ++pmap->pm_count;
2784                 lwkt_reltoken(&pmap->pm_token);
2785         }
2786 }
2787
2788 /***************************************************
2789  * page management routines.
2790  ***************************************************/
2791
2792 /*
2793  * Hold a pv without locking it
2794  */
2795 static void
2796 pv_hold(pv_entry_t pv)
2797 {
2798         atomic_add_int(&pv->pv_hold, 1);
2799 }
2800
2801 /*
2802  * Hold a pv_entry, preventing its destruction.  TRUE is returned if the pv
2803  * was successfully locked, FALSE if it wasn't.  The caller must dispose of
2804  * the pv properly.
2805  *
2806  * Either the pmap->pm_spin or the related vm_page_spin (if traversing a
2807  * pv list via its page) must be held by the caller.
2808  */
2809 static int
2810 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL)
2811 {
2812         u_int count;
2813
2814         /*
2815          * Critical path shortcut expects pv to already have one ref
2816          * (for the pv->pv_pmap).
2817          */
2818         if (atomic_cmpset_int(&pv->pv_hold, 1, PV_HOLD_LOCKED | 2)) {
2819 #ifdef PMAP_DEBUG
2820                 pv->pv_func = func;
2821                 pv->pv_line = lineno;
2822 #endif
2823                 return TRUE;
2824         }
2825
2826         for (;;) {
2827                 count = pv->pv_hold;
2828                 cpu_ccfence();
2829                 if ((count & PV_HOLD_LOCKED) == 0) {
2830                         if (atomic_cmpset_int(&pv->pv_hold, count,
2831                                               (count + 1) | PV_HOLD_LOCKED)) {
2832 #ifdef PMAP_DEBUG
2833                                 pv->pv_func = func;
2834                                 pv->pv_line = lineno;
2835 #endif
2836                                 return TRUE;
2837                         }
2838                 } else {
2839                         if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
2840                                 return FALSE;
2841                 }
2842                 /* retry */
2843         }
2844 }
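/*
 * Encoding sketch (inferred from the code above): pv_hold packs a lock
 * flag (PV_HOLD_LOCKED) together with a hold count (PV_HOLD_MASK).  The
 * fast path cmpset from 1 (just the pmap association's hold) to
 * (PV_HOLD_LOCKED | 2) locks the pv and adds the caller's hold in one
 * atomic op; the slow path keeps retrying, and if another locker holds
 * PV_HOLD_LOCKED it settles for bumping the hold count and returns FALSE.
 */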
2845
2846 /*
2847  * Drop a previously held pv_entry which could not be locked, allowing its
2848  * destruction.
2849  *
2850  * Must not be called with a spinlock held as we might zfree() the pv if it
2851  * is no longer associated with a pmap and this was the last hold count.
2852  */
2853 static void
2854 pv_drop(pv_entry_t pv)
2855 {
2856         u_int count;
2857
2858         for (;;) {
2859                 count = pv->pv_hold;
2860                 cpu_ccfence();
2861                 KKASSERT((count & PV_HOLD_MASK) > 0);
2862                 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) !=
2863                          (PV_HOLD_LOCKED | 1));
2864                 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) {
2865                         if ((count & PV_HOLD_MASK) == 1) {
2866 #ifdef PMAP_DEBUG2
2867                                 if (pmap_enter_debug > 0) {
2868                                         --pmap_enter_debug;
2869                                         kprintf("pv_drop: free pv %p\n", pv);
2870                                 }
2871 #endif
2872                                 KKASSERT(count == 1);
2873                                 KKASSERT(pv->pv_pmap == NULL);
2874                                 zfree(pvzone, pv);
2875                         }
2876                         return;
2877                 }
2878                 /* retry */
2879         }
2880 }
2881
2882 /*
2883  * Find or allocate the requested PV entry, returning a locked, held pv.
2884  *
2885  * If (*isnew) is non-zero, the returned pv will have two hold counts, one
2886  * for the caller and one representing the pmap and vm_page association.
2887  *
2888  * If (*isnew) is zero, the returned pv will have only one hold count.
2889  *
2890  * Since both associations can only be adjusted while the pv is locked,
2891  * together they represent just one additional hold.
2892  */
2893 static
2894 pv_entry_t
2895 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
2896 {
2897         pv_entry_t pv;
2898         pv_entry_t pnew = NULL;
2899
2900         spin_lock(&pmap->pm_spin);
2901         for (;;) {
2902                 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) {
2903                         pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
2904                                                         pindex);
2905                 }
2906                 if (pv == NULL) {
2907                         if (pnew == NULL) {
2908                                 spin_unlock(&pmap->pm_spin);
2909                                 pnew = zalloc(pvzone);
2910                                 spin_lock(&pmap->pm_spin);
2911                                 continue;
2912                         }
2913                         pnew->pv_pmap = pmap;
2914                         pnew->pv_pindex = pindex;
2915                         pnew->pv_hold = PV_HOLD_LOCKED | 2;
2916 #ifdef PMAP_DEBUG
2917                         pnew->pv_func = func;
2918                         pnew->pv_line = lineno;
2919 #endif
2920                         pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew);
2921                         ++pmap->pm_generation;
2922                         atomic_add_long(&pmap->pm_stats.resident_count, 1);
2923                         spin_unlock(&pmap->pm_spin);
2924                         *isnew = 1;
2925                         return(pnew);
2926                 }
2927                 if (pnew) {
2928                         spin_unlock(&pmap->pm_spin);
2929                         zfree(pvzone, pnew);
2930                         pnew = NULL;
2931                         spin_lock(&pmap->pm_spin);
2932                         continue;
2933                 }
2934                 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
2935                         spin_unlock(&pmap->pm_spin);
2936                 } else {
2937                         spin_unlock(&pmap->pm_spin);
2938                         _pv_lock(pv PMAP_DEBUG_COPY);
2939                 }
2940                 if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) {
2941                         *isnew = 0;
2942                         return(pv);
2943                 }
2944                 pv_put(pv);
2945                 spin_lock(&pmap->pm_spin);
2946         }
2947 }
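
/*
 * Editorial sketch (not part of the original source): a typical
 * _pv_alloc() caller, assuming the usual non-debug pv_alloc() wrapper
 * macro.  The returned pv is locked and held; when *isnew is set the
 * caller also owns the hold representing the new pmap/vm_page
 * association and must complete (or undo) that association before
 * releasing the pv.
 *
 *        int isnew;
 *        pv_entry_t pv;
 *
 *        pv = pv_alloc(pmap, pindex, &isnew);
 *        if (isnew) {
 *                ... establish the pmap/vm_page association ...
 *        }
 *        pv_put(pv);                release the caller's lock and hold
 */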
2948
2949 /*
2950  * Find the requested PV entry, returning a locked+held pv or NULL
2951  */
2952 static
2953 pv_entry_t
2954 _pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL)
2955 {
2956         pv_entry_t pv;
2957
2958         spin_lock(&pmap->pm_spin);
2959         for (;;) {
2960                 /*
2961                  * Shortcut cache
2962                  */
2963                 if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) {
2964                         pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
2965                                                         pindex);
2966                 }
2967                 if (pv == NULL) {
2968                         spin_unlock(&pmap->pm_spin);
2969                         return NULL;
2970                 }
2971                 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
2972                         spin_unlock(&pmap->pm_spin);
2973                 } else {
2974                         spin_unlock(&pmap->pm_spin);
2975                         _pv_lock(pv PMAP_DEBUG_COPY);
2976                 }
2977                 if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) {
2978                         pv_cache(pv, pindex);
2979                         return(pv);
2980                 }
2981                 pv_put(pv);
2982                 spin_lock(&pmap->pm_spin);
2983         }
2984 }
2985
2986 /*
2987  * Lookup, hold, and attempt to lock (pmap,pindex).
2988  *
2989  * If the entry does not exist NULL is returned and *errorp is set to 0
2990  *
2991  * If the entry exists and could be successfully locked it is returned and
2992  * errorp is set to 0.
2993  *
2994  * If the entry exists but could NOT be successfully locked it is returned
2995  * held and *errorp is set to 1.
2996  */
2997 static
2998 pv_entry_t
2999 pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp)
3000 {
3001         pv_entry_t pv;
3002
3003         spin_lock_shared(&pmap->pm_spin);
3004         if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
3005                 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
3006         if (pv == NULL) {
3007                 spin_unlock_shared(&pmap->pm_spin);
3008                 *errorp = 0;
3009                 return NULL;
3010         }
3011         if (pv_hold_try(pv)) {
3012                 pv_cache(pv, pindex);
3013                 spin_unlock_shared(&pmap->pm_spin);
3014                 *errorp = 0;
3015                 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex);
3016                 return(pv);     /* lock succeeded */
3017         }
3018         spin_unlock_shared(&pmap->pm_spin);
3019         *errorp = 1;
3020         return (pv);            /* lock failed */
3021 }
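
/*
 * Editorial sketch (not part of the original source): the intended
 * pv_get_try() pattern, mirroring its use in pmap_scan_callback()
 * below.  On error the pv is returned held but not locked; the caller
 * first drops any locks that would invert the pte/pt/pd lock order and
 * only then blocks in pv_lock().
 *
 *        int error;
 *        pv_entry_t pv;
 *
 *        pv = pv_get_try(pmap, pindex, &error);
 *        if (error) {
 *                ... release conflicting pv locks ...
 *                pv_lock(pv);                now safe to block
 *                ... revalidate, or pv_put(pv) and retry ...
 *        }
 */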
3022
3023 /*
3024  * Find the requested PV entry, returning a held pv or NULL
3025  */
3026 static
3027 pv_entry_t
3028 pv_find(pmap_t pmap, vm_pindex_t pindex)
3029 {
3030         pv_entry_t pv;
3031
3032         spin_lock_shared(&pmap->pm_spin);
3033
3034         if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
3035                 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
3036         if (pv == NULL) {
3037                 spin_unlock_shared(&pmap->pm_spin);
3038                 return NULL;
3039         }
3040         pv_hold(pv);
3041         pv_cache(pv, pindex);
3042         spin_unlock_shared(&pmap->pm_spin);
3043         return(pv);
3044 }
3045
3046 /*
3047  * Lock a held pv, keeping the hold count
3048  */
3049 static
3050 void
3051 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL)
3052 {
3053         u_int count;
3054
3055         for (;;) {
3056                 count = pv->pv_hold;
3057                 cpu_ccfence();
3058                 if ((count & PV_HOLD_LOCKED) == 0) {
3059                         if (atomic_cmpset_int(&pv->pv_hold, count,
3060                                               count | PV_HOLD_LOCKED)) {
3061 #ifdef PMAP_DEBUG
3062                                 pv->pv_func = func;
3063                                 pv->pv_line = lineno;
3064 #endif
3065                                 return;
3066                         }
3067                         continue;
3068                 }
3069                 tsleep_interlock(pv, 0);
3070                 if (atomic_cmpset_int(&pv->pv_hold, count,
3071                                       count | PV_HOLD_WAITING)) {
3072 #ifdef PMAP_DEBUG
3073                         kprintf("pv waiting on %s:%d\n",
3074                                         pv->pv_func, pv->pv_line);
3075 #endif
3076                         tsleep(pv, PINTERLOCKED, "pvwait", hz);
3077                 }
3078                 /* retry */
3079         }
3080 }
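
/*
 * Editorial note: _pv_lock() and pv_unlock() form a standard
 * tsleep_interlock()/wakeup() pair.  The waiter publishes its intent by
 * setting PV_HOLD_WAITING inside the interlock window, so a racing
 * pv_unlock() either observes the flag and issues wakeup(pv), or the
 * waiter's cmpset fails and it retries without sleeping.
 */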
3081
3082 /*
3083  * Unlock a held and locked pv, keeping the hold count.
3084  */
3085 static
3086 void
3087 pv_unlock(pv_entry_t pv)
3088 {
3089         u_int count;
3090
3091         for (;;) {
3092                 count = pv->pv_hold;
3093                 cpu_ccfence();
3094                 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >=
3095                          (PV_HOLD_LOCKED | 1));
3096                 if (atomic_cmpset_int(&pv->pv_hold, count,
3097                                       count &
3098                                       ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) {
3099                         if (count & PV_HOLD_WAITING)
3100                                 wakeup(pv);
3101                         break;
3102                 }
3103         }
3104 }
3105
3106 /*
3107  * Unlock and drop a pv.  If the pv is no longer associated with a pmap
3108  * and the hold count drops to zero we will free it.
3109  *
3110  * Caller should not hold any spin locks.  We are protected from hold races
3111  * by virtue of holds occurring only with a pmap_spin or vm_page_spin
3112  * lock held.  A pv cannot be located otherwise.
3113  */
3114 static
3115 void
3116 pv_put(pv_entry_t pv)
3117 {
3118 #ifdef PMAP_DEBUG2
3119         if (pmap_enter_debug > 0) {
3120                 --pmap_enter_debug;
3121                 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold);
3122         }
3123 #endif
3124
3125         /*
3126          * Fast - shortcut most common condition
3127          */
3128         if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1))
3129                 return;
3130
3131         /*
3132          * Slow
3133          */
3134         pv_unlock(pv);
3135         pv_drop(pv);
3136 }
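
/*
 * Editorial note: the fast path in pv_put() works because pv_hold packs
 * the hold count (PV_HOLD_MASK) alongside the PV_HOLD_LOCKED and
 * PV_HOLD_WAITING flag bits.  (PV_HOLD_LOCKED | 2) is the common
 * "locked, two holds, no waiters" state, so a single cmpset to plain 1
 * releases the lock and drops one hold at the same time; any waiter or
 * extra hold makes the cmpset fail and the slow
 * pv_unlock()/pv_drop() path handles the wakeup.
 */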
3137
3138 /*
3139  * Remove the pmap association from a pv (pv_m must already have been
3140  * removed), then unlock and drop the pv.  Any pte operations must have already been
3141  * completed.  This call may result in a last-drop which will physically free
3142  * the pv.
3143  *
3144  * Removing the pmap association entails an additional drop.
3145  *
3146  * pv must be exclusively locked on call and will be disposed of on return.
3147  */
3148 static
3149 void
3150 pv_free(pv_entry_t pv)
3151 {
3152         pmap_t pmap;
3153
3154         KKASSERT(pv->pv_m == NULL);
3155         KKASSERT((pv->pv_hold & PV_HOLD_MASK) >= 2);
3156         if ((pmap = pv->pv_pmap) != NULL) {
3157                 spin_lock(&pmap->pm_spin);
3158                 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
3159                 ++pmap->pm_generation;
3160                 if (pmap->pm_pvhint == pv)
3161                         pmap->pm_pvhint = NULL;
3162                 atomic_add_long(&pmap->pm_stats.resident_count, -1);
3163                 pv->pv_pmap = NULL;
3164                 pv->pv_pindex = 0;
3165                 spin_unlock(&pmap->pm_spin);
3166
3167                 /*
3168                  * Try to shortcut three atomic ops, otherwise fall through
3169                  * and do it normally.  Drop two refs and the lock all in
3170                  * one go.
3171                  */
3172                 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) {
3173 #ifdef PMAP_DEBUG2
3174                         if (pmap_enter_debug > 0) {
3175                                 --pmap_enter_debug;
3176                                 kprintf("pv_free: free pv %p\n", pv);
3177                         }
3178 #endif
3179                         zfree(pvzone, pv);
3180                         return;
3181                 }
3182                 pv_drop(pv);    /* ref for pv_pmap */
3183         }
3184         pv_put(pv);
3185 }
3186
3187 /*
3188  * This routine is very drastic, but can save the system
3189  * in a pinch.
3190  */
3191 void
3192 pmap_collect(void)
3193 {
3194         int i;
3195         vm_page_t m;
3196         static int warningdone=0;
3197
3198         if (pmap_pagedaemon_waken == 0)
3199                 return;
3200         pmap_pagedaemon_waken = 0;
3201         if (warningdone < 5) {
3202                 kprintf("pmap_collect: collecting pv entries -- "
3203                         "suggest increasing PMAP_SHPGPERPROC\n");
3204                 warningdone++;
3205         }
3206
3207         for (i = 0; i < vm_page_array_size; i++) {
3208                 m = &vm_page_array[i];
3209                 if (m->wire_count || m->hold_count)
3210                         continue;
3211                 if (vm_page_busy_try(m, TRUE) == 0) {
3212                         if (m->wire_count == 0 && m->hold_count == 0) {
3213                                 pmap_remove_all(m);
3214                         }
3215                         vm_page_wakeup(m);
3216                 }
3217         }
3218 }
3219
3220 /*
3221  * Scan the pmap for active page table entries and issue a callback.
3222  * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
3223  * its parent page table.
3224  *
3225  * pte_pv will be NULL if the page or page table is unmanaged.
3226  * pt_pv will point to the page table page containing the pte for the page.
3227  *
3228  * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
3229  *       we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
3230  *       process pmap's PD and page to the callback function.  This can be
3231  *       confusing because the pt_pv is really a pd_pv, and the target page
3232  *       table page is simply aliased by the pmap and not owned by it.
3233  *
3234  * It is assumed that the start and end are properly rounded to the page size.
3235  *
3236  * It is assumed that PD pages and above are managed and thus in the RB tree,
3237  * allowing us to use RB_SCAN from the PD pages down for ranged scans.
3238  */
3239 struct pmap_scan_info {
3240         struct pmap *pmap;
3241         vm_offset_t sva;
3242         vm_offset_t eva;
3243         vm_pindex_t sva_pd_pindex;
3244         vm_pindex_t eva_pd_pindex;
3245         void (*func)(pmap_t, struct pmap_scan_info *,
3246                      pv_entry_t, pv_entry_t, int, vm_offset_t,
3247                      pt_entry_t *, void *);
3248         void *arg;
3249         int doinval;
3250         struct pmap_inval_info inval;
3251 };
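
/*
 * Editorial sketch (not part of the original source): callers
 * stack-allocate a pmap_scan_info, fill in the range, callback and
 * doinval fields, and hand it to pmap_scan().  pmap_remove() and
 * pmap_protect() below are the in-tree examples; my_callback here is
 * hypothetical.
 *
 *        struct pmap_scan_info info;
 *
 *        info.pmap = pmap;
 *        info.sva = sva;
 *        info.eva = eva;
 *        info.func = my_callback;
 *        info.arg = NULL;
 *        info.doinval = 1;
 *        pmap_scan(&info);
 */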
3252
3253 static int pmap_scan_cmp(pv_entry_t pv, void *data);
3254 static int pmap_scan_callback(pv_entry_t pv, void *data);
3255
3256 static void
3257 pmap_scan(struct pmap_scan_info *info)
3258 {
3259         struct pmap *pmap = info->pmap;
3260         pv_entry_t pd_pv;       /* A page directory PV */
3261         pv_entry_t pt_pv;       /* A page table PV */
3262         pv_entry_t pte_pv;      /* A page table entry PV */
3263         pt_entry_t *ptep;
3264         pt_entry_t oldpte;
3265         struct pv_entry dummy_pv;
3266         int generation;
3267
3268         if (pmap == NULL)
3269                 return;
3270
3271         /*
3272          * Hold the token for stability; if the pmap is empty we have nothing
3273          * to do.
3274          */
3275         lwkt_gettoken(&pmap->pm_token);
3276 #if 0
3277         if (pmap->pm_stats.resident_count == 0) {
3278                 lwkt_reltoken(&pmap->pm_token);
3279                 return;
3280         }
3281 #endif
3282
3283         pmap_inval_init(&info->inval);
3284
3285 again:
3286         /*
3287          * Special handling for scanning one page, which is a very common
3288          * operation (it is?).
3289          *
3290          * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4
3291          */
3292         if (info->sva + PAGE_SIZE == info->eva) {
3293                 generation = pmap->pm_generation;
3294                 if (info->sva >= VM_MAX_USER_ADDRESS) {
3295                         /*
3296                          * Kernel mappings do not track wire counts on
3297                          * page table pages and only maintain pd_pv and
3298                          * pte_pv levels so pmap_scan() works.
3299                          */
3300                         pt_pv = NULL;
3301                         pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva));
3302                         ptep = vtopte(info->sva);
3303                 } else {
3304                         /*
3305                          * User pages which are unmanaged will not have a
3306                          * pte_pv.  User page table pages which are unmanaged
3307                          * (shared from elsewhere) will also not have a pt_pv.
3308                          * The func() callback will pass both pte_pv and pt_pv
3309                          * as NULL in that case.
3310                          */
3311                         pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva));
3312                         pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva));
3313                         if (pt_pv == NULL) {
3314                                 KKASSERT(pte_pv == NULL);
3315                                 pd_pv = pv_get(pmap, pmap_pd_pindex(info->sva));
3316                                 if (pd_pv) {
3317                                         ptep = pv_pte_lookup(pd_pv,
3318                                                     pmap_pt_index(info->sva));
3319                                         if (*ptep) {
3320                                                 info->func(pmap, info,
3321                                                      NULL, pd_pv, 1,
3322                                                      info->sva, ptep,
3323                                                      info->arg);
3324                                         }
3325                                         pv_put(pd_pv);
3326                                 }
3327                                 goto fast_skip;
3328                         }
3329                         ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva));
3330                 }
3331
3332                 /*
3333                  * NOTE: *ptep can't be ripped out from under us if we hold
3334                  *       pte_pv locked, but bits can change.  However, there is
3335                  *       a race where another thread may be inserting pte_pv
3336                  *       and setting *ptep just after our pte_pv lookup fails.
3337                  *
3338                  *       In this situation we can end up with a NULL pte_pv
3339                  *       but find that we have a managed *ptep.  We explicitly
3340                  *       check for this race.
3341                  */
3342                 oldpte = *ptep;
3343                 cpu_ccfence();
3344                 if (oldpte == 0) {
3345                         /*
3346                          * Unlike the pv_find() case below we actually
3347                          * acquired a locked pv in this case so any
3348                          * race should have been resolved.  It is expected
3349                          * to not exist.
3350                          */
3351                         KKASSERT(pte_pv == NULL);
3352                 } else if (pte_pv) {
3353                         KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] |
3354                                            pmap->pmap_bits[PG_V_IDX])) ==
3355                                 (pmap->pmap_bits[PG_MANAGED_IDX] |
3356                                  pmap->pmap_bits[PG_V_IDX]),
3357                             ("badA *ptep %016lx/%016lx sva %016lx pte_pv %p"
3358                              "generation %d/%d",
3359                             *ptep, oldpte, info->sva, pte_pv,
3360                             generation, pmap->pm_generation));
3361                         info->func(pmap, info, pte_pv, pt_pv, 0,
3362                                    info->sva, ptep, info->arg);
3363                 } else {
3364                         /*
3365                          * Check for insertion race
3366                          */
3367                         if ((oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) &&
3368                             pt_pv) {
3369                                 pte_pv = pv_find(pmap,
3370                                                  pmap_pte_pindex(info->sva));
3371                                 if (pte_pv) {
3372                                         pv_drop(pte_pv);
3373                                         pv_put(pt_pv);
3374                                         kprintf("pmap_scan: RACE1 "
3375                                                 "%016jx, %016lx\n",
3376                                                 info->sva, oldpte);
3377                                         goto again;
3378                                 }
3379                         }
3380
3381                         /*
3382                          * Didn't race
3383                          */
3384                         KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] |
3385                                            pmap->pmap_bits[PG_V_IDX])) ==
3386                             pmap->pmap_bits[PG_V_IDX],
3387                             ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL"
3388                              "generation %d/%d",
3389                             *ptep, oldpte, info->sva,
3390                             generation, pmap->pm_generation));
3391                         info->func(pmap, info, NULL, pt_pv, 0,
3392                             info->sva, ptep, info->arg);
3393                 }
3394                 if (pt_pv)
3395                         pv_put(pt_pv);
3396 fast_skip:
3397                 pmap_inval_done(&info->inval);
3398                 lwkt_reltoken(&pmap->pm_token);
3399                 return;
3400         }
3401
3402         /*
3403          * Nominal scan case, RB_SCAN() for PD pages and iterate from
3404          * there.
3405          */
3406         info->sva_pd_pindex = pmap_pd_pindex(info->sva);
3407         info->eva_pd_pindex = pmap_pd_pindex(info->eva + NBPDP - 1);
3408
3409         if (info->sva >= VM_MAX_USER_ADDRESS) {
3410                 /*
3411                  * The kernel does not currently maintain any pv_entry's for
3412                  * higher-level page tables.
3413                  */
3414                 bzero(&dummy_pv, sizeof(dummy_pv));
3415                 dummy_pv.pv_pindex = info->sva_pd_pindex;
3416                 spin_lock(&pmap->pm_spin);
3417                 while (dummy_pv.pv_pindex < info->eva_pd_pindex) {
3418                         pmap_scan_callback(&dummy_pv, info);
3419                         ++dummy_pv.pv_pindex;
3420                 }
3421                 spin_unlock(&pmap->pm_spin);
3422         } else {
3423                 /*
3424                  * User page tables maintain local PML4, PDP, and PD
3425                  * pv_entry's at the very least.  PT pv's might be
3426                  * unmanaged and thus not exist.  PTE pv's might be
3427                  * unmanaged and thus not exist.
3428                  */
3429                 spin_lock(&pmap->pm_spin);
3430                 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot,
3431                         pmap_scan_cmp, pmap_scan_callback, info);
3432                 spin_unlock(&pmap->pm_spin);
3433         }
3434         pmap_inval_done(&info->inval);
3435         lwkt_reltoken(&pmap->pm_token);
3436 }
3437
3438 /*
3439  * WARNING! pmap->pm_spin held
3440  */
3441 static int
3442 pmap_scan_cmp(pv_entry_t pv, void *data)
3443 {
3444         struct pmap_scan_info *info = data;
3445         if (pv->pv_pindex < info->sva_pd_pindex)
3446                 return(-1);
3447         if (pv->pv_pindex >= info->eva_pd_pindex)
3448                 return(1);
3449         return(0);
3450 }
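
/*
 * Editorial note: pmap_scan_cmp() is the range filter handed to
 * pv_entry_rb_tree_RB_SCAN() by pmap_scan().  A negative return prunes
 * pv's below sva_pd_pindex, a positive return prunes pv's at or above
 * eva_pd_pindex, and zero causes pmap_scan_callback() to be invoked
 * for that pv.
 */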
3451
3452 /*
3453  * WARNING! pmap->pm_spin held
3454  */
3455 static int
3456 pmap_scan_callback(pv_entry_t pv, void *data)
3457 {
3458         struct pmap_scan_info *info = data;
3459         struct pmap *pmap = info->pmap;
3460         pv_entry_t pd_pv;       /* A page directory PV */
3461         pv_entry_t pt_pv;       /* A page table PV */
3462         pv_entry_t pte_pv;      /* A page table entry PV */
3463         pt_entry_t *ptep;
3464         pt_entry_t oldpte;
3465         vm_offset_t sva;
3466         vm_offset_t eva;
3467         vm_offset_t va_next;
3468         vm_pindex_t pd_pindex;
3469         int error;
3470         int generation;
3471
3472         /*
3473          * Pull the PD pindex from the pv before releasing the spinlock.
3474          *
3475          * WARNING: pv is faked for kernel pmap scans.
3476          */
3477         pd_pindex = pv->pv_pindex;
3478         spin_unlock(&pmap->pm_spin);
3479         pv = NULL;      /* invalid after spinlock unlocked */
3480
3481         /*
3482          * Calculate the page range within the PD.  SIMPLE pmaps are
3483          * direct-mapped for the entire 2^64 address space.  Normal pmaps
3484          * reflect the user and kernel address space which requires
3485  * canonicalization with regard to converting pd_pindex's back
3486          * into addresses.
3487          */
3488         sva = (pd_pindex - NUPTE_TOTAL - NUPT_TOTAL) << PDPSHIFT;
3489         if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 &&
3490             (sva & PML4_SIGNMASK)) {
3491                 sva |= PML4_SIGNMASK;
3492         }
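        /*
         * Editorial example, assuming the usual x86-64 canonical address
         * rules: a raw sva computed above with bit 47 set, e.g.
         * 0x0000800000000000, has PML4_SIGNMASK OR'd in to yield the
         * canonical kernel address 0xffff800000000000.
         */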
3493         eva = sva + NBPDP;      /* can overflow */
3494         if (sva < info->sva)
3495                 sva = info->sva;
3496         if (eva < info->sva || eva > info->eva)
3497                 eva = info->eva;
3498
3499         /*
3500          * NOTE: kernel mappings do not track page table pages, only
3501          *       terminal pages.
3502          *
3503          * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4.
3504          *       However, for the scan to be efficient we try to
3505          *       cache items top-down.
3506          */
3507         pd_pv = NULL;
3508         pt_pv = NULL;
3509
3510         for (; sva < eva; sva = va_next) {
3511                 if (sva >= VM_MAX_USER_ADDRESS) {
3512                         if (pt_pv) {
3513                                 pv_put(pt_pv);
3514                                 pt_pv = NULL;
3515                         }
3516                         goto kernel_skip;
3517                 }
3518
3519                 /*
3520                  * PD cache (degenerate case if we skip).  It is possible
3521                  * for the PD to not exist due to races.  This is ok.
3522                  */
3523                 if (pd_pv == NULL) {
3524                         pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
3525                 } else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) {
3526                         pv_put(pd_pv);
3527                         pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
3528                 }
3529                 if (pd_pv == NULL) {
3530                         va_next = (sva + NBPDP) & ~PDPMASK;
3531                         if (va_next < sva)
3532                                 va_next = eva;
3533                         continue;
3534                 }
3535
3536                 /*
3537                  * PT cache
3538                  */
3539                 if (pt_pv == NULL) {
3540                         if (pd_pv) {
3541                                 pv_put(pd_pv);
3542                                 pd_pv = NULL;
3543                         }
3544                         pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
3545                 } else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) {
3546                         if (pd_pv) {
3547                                 pv_put(pd_pv);
3548                                 pd_pv = NULL;
3549                         }
3550                         pv_put(pt_pv);
3551                         pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
3552                 }
3553
3554                 /*
3555                  * If pt_pv is NULL we either have a shared page table
3556                  * page and must issue a callback specific to that case,
3557                  * or there is no page table page.
3558                  *
3559                  * Either way we can skip the page table page.
3560                  */
3561                 if (pt_pv == NULL) {
3562                         /*
3563                          * Possible unmanaged (shared from another pmap)
3564                          * page table page.
3565                          */
3566                         if (pd_pv == NULL)
3567                                 pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
3568                         KKASSERT(pd_pv != NULL);
3569                         ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva));
3570                         if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
3571                                 info->func(pmap, info, NULL, pd_pv, 1,
3572                                            sva, ptep, info->arg);
3573                         }
3574
3575                         /*
3576                          * Done, move to next page table page.
3577                          */
3578                         va_next = (sva + NBPDR) & ~PDRMASK;
3579                         if (va_next < sva)
3580                                 va_next = eva;
3581                         continue;
3582                 }
3583
3584                 /*
3585                  * From this point in the loop testing pt_pv for non-NULL
3586                  * means we are in UVM, else if it is NULL we are in KVM.
3587                  *
3588                  * Limit our scan to either the end of the va represented
3589                  * by the current page table page, or to the end of the
3590                  * range being removed.
3591                  */
3592 kernel_skip:
3593                 va_next = (sva + NBPDR) & ~PDRMASK;
3594                 if (va_next < sva)
3595                         va_next = eva;
3596                 if (va_next > eva)
3597                         va_next = eva;
3598
3599                 /*
3600                  * Scan the page table for pages.  Some pages may not be
3601                  * managed (might not have a pv_entry).
3602                  *
3603                  * There is no page table management for kernel pages so
3604                  * pt_pv will be NULL in that case, but otherwise pt_pv
3605                  * is non-NULL, locked, and referenced.
3606                  */
3607
3608                 /*
3609                  * At this point a non-NULL pt_pv means a UVA, and a NULL
3610                  * pt_pv means a KVA.
3611                  */
3612                 if (pt_pv)
3613                         ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
3614                 else
3615                         ptep = vtopte(sva);
3616
3617                 while (sva < va_next) {
3618                         /*
3619                          * Acquire the related pte_pv, if any.  If *ptep == 0
3620                          * the related pte_pv should not exist, but if *ptep
3621                          * is not zero the pte_pv may or may not exist (e.g.
3622                          * will not exist for an unmanaged page).
3623                          *
3624                          * However a multitude of races are possible here.
3625                          *
3626                          * In addition, the (pt_pv, pte_pv) lock order is
3627                          * backwards, so we have to be careful in acquiring
3628                          * a properly locked pte_pv.
3629                          */
3630                         generation = pmap->pm_generation;
3631                         if (pt_pv) {
3632                                 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva),
3633                                                     &error);
3634                                 if (error) {
3635                                         if (pd_pv) {
3636                                                 pv_put(pd_pv);
3637                                                 pd_pv = NULL;
3638                                         }
3639                                         pv_put(pt_pv);   /* must be non-NULL */
3640                                         pt_pv = NULL;
3641                                         pv_lock(pte_pv); /* safe to block now */
3642                                         pv_put(pte_pv);
3643                                         pte_pv = NULL;
3644                                         pt_pv = pv_get(pmap,
3645                                                        pmap_pt_pindex(sva));
3646                                         /*
3647                                          * pt_pv reloaded, need new ptep
3648                                          */
3649                                         KKASSERT(pt_pv != NULL);
3650                                         ptep = pv_pte_lookup(pt_pv,
3651                                                         pmap_pte_index(sva));
3652                                         continue;
3653                                 }
3654                         } else {
3655                                 pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
3656                         }
3657
3658                         /*
3659                          * Ok, if *ptep == 0 we had better NOT have a pte_pv.
3660                          */
3661                         oldpte = *ptep;
3662                         if (oldpte == 0) {
3663                                 if (pte_pv) {
3664                                         kprintf("Unexpected non-NULL pte_pv "
3665                                                 "%p pt_pv %p "
3666                                                 "*ptep = %016lx/%016lx\n",
3667                                                 pte_pv, pt_pv, *ptep, oldpte);
3668                                         panic("Unexpected non-NULL pte_pv");
3669                                 }
3670                                 sva += PAGE_SIZE;
3671                                 ++ptep;
3672                                 continue;
3673                         }
3674
3675                         /*
3676                          * Ready for the callback.  The locked pte_pv (if any)
3677                          * is consumed by the callback.  pte_pv will exist if
3678                          * the page is managed, and will not exist if it
3679                          * isn't.
3680                          */
3681                         if (pte_pv) {
3682                                 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) ==
3683                                     (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX]),
3684                                     ("badC *ptep %016lx/%016lx sva %016lx "
3685                                     "pte_pv %p pm_generation %d/%d",
3686                                     *ptep, oldpte, sva, pte_pv,
3687                                     generation, pmap->pm_generation));
3688                                 info->func(pmap, info, pte_pv, pt_pv, 0,
3689                                     sva, ptep, info->arg);
3690                         } else {
3691                                 /*
3692                                  * Check for insertion race.  Since there is no
3693                                  * pte_pv to guard us it is possible for us
3694                                  * to race another thread doing an insertion.
3695                                  * Our lookup misses the pte_pv but our *ptep
3696                                  * check sees the inserted pte.
3697                                  *
3698                                  * XXX panic case seems to occur within a
3699                                  * vm_fork() of /bin/sh, which frankly
3700                                  * shouldn't happen since no other threads
3701                                  * should be inserting to our pmap in that
3702                                  * situation.  Removal is possible, but an
3703                                  * insertion should not happen.
3704                                  */
3705                                 if ((oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) &&
3706                                     pt_pv) {
3707                                         pte_pv = pv_find(pmap,
3708                                                          pmap_pte_pindex(sva));
3709                                         if (pte_pv) {
3710                                                 pv_drop(pte_pv);
3711                                                 kprintf("pmap_scan: RACE2 "
3712                                                         "%016jx, %016lx\n",
3713                                                         sva, oldpte);
3714                                                 continue;
3715                                         }
3716                                 }
3717
3718                                 /*
3719                                  * Didn't race
3720                                  */
3721                                 KASSERT((oldpte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) ==
3722                                     pmap->pmap_bits[PG_V_IDX],
3723                                     ("badD *ptep %016lx/%016lx sva %016lx "
3724                                     "pte_pv NULL pm_generation %d/%d",
3725                                      *ptep, oldpte, sva,
3726                                      generation, pmap->pm_generation));
3727                                 info->func(pmap, info, NULL, pt_pv, 0,
3728                                     sva, ptep, info->arg);
3729                         }
3730                         pte_pv = NULL;
3731                         sva += PAGE_SIZE;
3732                         ++ptep;
3733                 }
3734                 lwkt_yield();
3735         }
3736         if (pd_pv) {
3737                 pv_put(pd_pv);
3738                 pd_pv = NULL;
3739         }
3740         if (pt_pv) {
3741                 pv_put(pt_pv);
3742                 pt_pv = NULL;
3743         }
3744         lwkt_yield();
3745
3746         /*
3747          * Relock before returning.
3748          */
3749         spin_lock(&pmap->pm_spin);
3750         return (0);
3751 }
3752
3753 void
3754 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
3755 {
3756         struct pmap_scan_info info;
3757
3758         info.pmap = pmap;
3759         info.sva = sva;
3760         info.eva = eva;
3761         info.func = pmap_remove_callback;
3762         info.arg = NULL;
3763         info.doinval = 1;       /* normal remove requires pmap inval */
3764         pmap_scan(&info);
3765 }
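
/*
 * Editorial usage note: removing a single page, e.g.
 *
 *        pmap_remove(pmap, va, va + PAGE_SIZE);
 *
 * takes the single-page fast path in pmap_scan() (sva + PAGE_SIZE ==
 * eva) rather than the RB_SCAN-based ranged scan.
 */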
3766
3767 static void
3768 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
3769 {
3770         struct pmap_scan_info info;
3771
3772         info.pmap = pmap;
3773         info.sva = sva;
3774         info.eva = eva;
3775         info.func = pmap_remove_callback;
3776         info.arg = NULL;
3777         info.doinval = 0;       /* skip pmap inval; caller handles invalidation */
3778         pmap_scan(&info);
3779 }
3780
3781 static void
3782 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
3783                      pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
3784                      vm_offset_t va, pt_entry_t *ptep, void *arg __unused)
3785 {
3786         pt_entry_t pte;
3787
3788         if (pte_pv) {
3789                 /*
3790                  * This will also drop pt_pv's wire_count. Note that
3791                  * terminal pages are not wired based on mmu presence.
3792                  */
3793                 if (info->doinval)
3794                         pmap_remove_pv_pte(pte_pv, pt_pv, &info->inval);
3795                 else
3796                         pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
3797                 pmap_remove_pv_page(pte_pv);
3798                 pv_free(pte_pv);
3799         } else if (sharept == 0) {
3800                 /*
3801                  * Unmanaged page table (pt, pd, or pdp. Not pte).
3802                  *
3803                  * pt_pv's wire_count is still bumped by unmanaged pages
3804                  * so we must decrement it manually.
3805                  *
3806                  * We have to unwire the target page table page.
3807                  *
3808                  * It is unclear how we can invalidate a segment so we
3809                  * invalidate -1, which invalidates the entire TLB.
3810                  */
3811                 if (info->doinval)
3812                         pmap_inval_interlock(&info->inval, pmap, -1);
3813                 pte = pte_load_clear(ptep);
3814                 if (info->doinval)
3815                         pmap_inval_deinterlock(&info->inval, pmap);
3816                 if (pte & pmap->pmap_bits[PG_W_IDX])
3817                         atomic_add_long(&pmap->pm_stats.wired_count, -1);
3818                 atomic_add_long(&pmap->pm_stats.resident_count, -1);
3819                 if (vm_page_unwire_quick(pt_pv->pv_m))
3820                         panic("pmap_remove: insufficient wirecount");
3821         } else {
3822                 /*
3823                  * Unmanaged page table (pt, pd, or pdp. Not pte) for
3824                  * a shared page table.
3825                  *
3826                  * pt_pv is actually the pd_pv for our pmap (not the shared
3827                  * object pmap).
3828                  *
3829                  * We have to unwire the target page table page and we
3830                  * have to unwire our page directory page.
3831                  *
3832                  * It is unclear how we can invalidate a segment so we
3833                  * invalidate -1, which invalidates the entire TLB.
3834                  */
3835                 if (info->doinval)
3836                         pmap_inval_interlock(&info->inval, pmap, -1);
3837                 pte = pte_load_clear(ptep);
3838                 if (info->doinval)
3839                         pmap_inval_deinterlock(&info->inval, pmap);
3840                 atomic_add_long(&pmap->pm_stats.resident_count, -1);
3841                 KKASSERT((pte & pmap->pmap_bits[PG_DEVICE_IDX]) == 0);
3842                 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
3843                         panic("pmap_remove: shared pgtable1 bad wirecount");
3844                 if (vm_page_unwire_quick(pt_pv->pv_m))
3845                         panic("pmap_remove: shared pgtable2 bad wirecount");
3846         }
3847 }
3848
3849 /*
3850  * Removes this physical page from all physical maps in which it resides.
3851  * Reflects back modify bits to the pager.
3852  *
3853  * This routine may not be called from an interrupt.
3854  */
3855 static
3856 void
3857 pmap_remove_all(vm_page_t m)
3858 {
3859         struct pmap_inval_info info;
3860         pv_entry_t pv;
3861
3862         if (!pmap_initialized /* || (m->flags & PG_FICTITIOUS)*/)
3863                 return;
3864
3865         pmap_inval_init(&info);
3866         vm_page_spin_lock(m);
3867         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3868                 KKASSERT(pv->pv_m == m);
3869                 if (pv_hold_try(pv)) {
3870                         vm_page_spin_unlock(m);
3871                 } else {
3872                         vm_page_spin_unlock(m);
3873                         pv_lock(pv);
3874                 }
3875                 if (pv->pv_m != m) {
3876                         pv_put(pv);
3877                         vm_page_spin_lock(m);
3878                         continue;
3879                 }
3880
3881                 /*
3882                  * Holding no spinlocks, pv is locked.
3883                  */
3884                 pmap_remove_pv_pte(pv, NULL, &info);
3885                 pmap_remove_pv_page(pv);
3886                 pv_free(pv);
3887                 vm_page_spin_lock(m);
3888         }
3889         KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
3890         vm_page_spin_unlock(m);
3891         pmap_inval_done(&info);
3892 }
3893
3894 /*
3895  * Set the physical protection on the specified range of this map
3896  * as requested.  This function is typically only used for debug watchpoints
3897  * and COW pages.
3898  *
3899  * This function may not be called from an interrupt if the map is
3900  * not the kernel_pmap.
3901  *
3902  * NOTE!  For shared page table pages we just unmap the page.
3903  */
3904 void
3905 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3906 {
3907         struct pmap_scan_info info;
3908         /* JG review for NX */
3909
3910         if (pmap == NULL)
3911                 return;
3912         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
3913                 pmap_remove(pmap, sva, eva);
3914                 return;
3915         }
3916         if (prot & VM_PROT_WRITE)
3917                 return;
3918         info.pmap = pmap;
3919         info.sva = sva;
3920         info.eva = eva;
3921         info.func = pmap_protect_callback;
3922         info.arg = &prot;
3923         info.doinval = 1;
3924         pmap_scan(&info);
3925 }
3926
3927 static
3928 void
3929 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
3930                       pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
3931                       vm_offset_t va, pt_entry_t *ptep, void *arg __unused)
3932 {
3933         pt_entry_t pbits;
3934         pt_entry_t cbits;
3935         pt_entry_t pte;
3936         vm_page_t m;
3937
3938         /*
3939          * XXX non-optimal.
3940          */
3941         pmap_inval_interlock(&info->inval, pmap, va);
3942 again:
3943         pbits = *ptep;
3944         cbits = pbits;
3945         if (pte_pv) {
3946                 m = NULL;
3947                 if (pbits & pmap->pmap_bits[PG_A_IDX]) {
3948                         if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) {
3949                                 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3950                                 KKASSERT(m == pte_pv->pv_m);
3951                                 vm_page_flag_set(m, PG_REFERENCED);
3952                         }
3953                         cbits &= ~pmap->pmap_bits[PG_A_IDX];
3954                 }
3955                 if (pbits & pmap->pmap_bits[PG_M_IDX]) {
3956                         if (pmap_track_modified(pte_pv->pv_pindex)) {
3957                                 if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) {
3958                                         if (m == NULL) {
3959                                                 m = PHYS_TO_VM_PAGE(pbits &
3960                                                                     PG_FRAME);
3961                                         }
3962                                         vm_page_dirty(m);
3963                                 }
3964                                 cbits &= ~pmap->pmap_bits[PG_M_IDX];
3965                         }
3966                 }
3967         } else if (sharept) {
3968                 /*
3969                  * Unmanaged page table, pt_pv is actually the pd_pv
3970                  * for our pmap (not the object's shared pmap).
3971                  *
3972                  * When asked to protect something in a shared page table
3973                  * page we just unmap the page table page.  We have to
3974                  * invalidate the tlb in this situation.
3975                  *
3976                  * XXX Warning, shared page tables will not be used for
3977                  * OBJT_DEVICE or OBJT_MGTDEVICE (PG_FICTITIOUS) mappings
3978                  * so PHYS_TO_VM_PAGE() should be safe here.
3979                  */
3980                 pte = pte_load_clear(ptep);
3981                 pmap_inval_invltlb(&info->inval);
3982                 if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
3983                         panic("pmap_protect: pgtable1 pg bad wirecount");
3984                 if (vm_page_unwire_quick(pt_pv->pv_m))
3985                         panic("pmap_protect: pgtable2 pg bad wirecount");
3986                 ptep = NULL;
3987         }
3988         /* else unmanaged page, adjust bits, no wire changes */
3989
3990         if (ptep) {
3991                 cbits &= ~pmap->pmap_bits[PG_RW_IDX];
3992 #ifdef PMAP_DEBUG2
3993                 if (pmap_enter_debug > 0) {
3994                         --pmap_enter_debug;
3995                         kprintf("pmap_protect va=%lx ptep=%p pte_pv=%p "
3996                                 "pt_pv=%p cbits=%08lx\n",
3997                                 va, ptep, pte_pv,
3998                                 pt_pv, cbits
3999                         );
4000                 }
4001 #endif
4002                 if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
4003                         goto again;
4004                 }
4005         }
4006         pmap_inval_deinterlock(&info->inval, pmap);
4007         if (pte_pv)
4008                 pv_put(pte_pv);
4009 }
4010
4011 /*
4012  * Insert the vm_page (m) at the virtual address (va), replacing any prior
4013  * mapping at that address.  Set protection and wiring as requested.
4014  *
4015  * If entry is non-NULL we check to see if the SEG_SIZE optimization is
4016  * possible.  If it is we enter the page into the appropriate shared pmap
4017  * hanging off the related VM object instead of the passed pmap, then we
4018  * share the page table page from the VM object's pmap into the current pmap.
4019  *
4020  * NOTE: This routine MUST insert the page into the pmap now, it cannot
4021  *       lazy-evaluate.
4022  */
4023 void
4024 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4025            boolean_t wired, vm_map_entry_t entry)
4026 {
4027         pmap_inval_info info;
4028         pv_entry_t pt_pv;       /* page table */
4029         pv_entry_t pte_pv;      /* page table entry */
4030         pt_entry_t *ptep;
4031         vm_paddr_t opa;
4032         pt_entry_t origpte, newpte;
4033         vm_paddr_t pa;
4034
4035         if (pmap == NULL)
4036                 return;
4037         va = trunc_page(va);
4038 #ifdef PMAP_DIAGNOSTIC
4039         if (va >= KvaEnd)
4040                 panic("pmap_enter: toobig");
4041         if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
4042                 panic("pmap_enter: invalid to pmap_enter page table "
4043                       "pages (va: 0x%lx)", va);
4044 #endif
4045         if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
4046                 kprintf("Warning: pmap_enter called on UVA with "
4047                         "kernel_pmap\n");
4048 #ifdef DDB
4049                 db_print_backtrace();
4050 #endif
4051         }
4052         if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
4053                 kprintf("Warning: pmap_enter called on KVA without "
4054                         "kernel_pmap\n");
4055 #ifdef DDB
4056                 db_print_backtrace();
4057 #endif
4058         }
4059
4060         /*
4061          * Get locked PV entries for our new page table entry (pte_pv)
4062          * and for its parent page table (pt_pv).  We need the parent
4063          * so we can resolve the location of the ptep.
4064          *
4065          * Only hardware MMU actions can modify the ptep out from
4066          * under us.
4067          *
4068          * if (m) is fictitious or unmanaged we do not create a managing
4069          * pte_pv for it.  Any pre-existing page's management state must
4070          * match (avoiding code complexity).
4071          *
4072          * If the pmap is still being initialized we assume existing
4073          * page tables.
4074          *
4075          * Kernel mappings do not track page table pages (i.e. pt_pv).
4076          */
4077         if (pmap_initialized == FALSE) {
4078                 pte_pv = NULL;
4079                 pt_pv = NULL;
4080                 ptep = vtopte(va);
4081                 origpte = *ptep;
4082         } else if (m->flags & (/*PG_FICTITIOUS |*/ PG_UNMANAGED)) { /* XXX */
4083                 pte_pv = NULL;
4084                 if (va >= VM_MAX_USER_ADDRESS) {
4085                         pt_pv = NULL;
4086                         ptep = vtopte(va);
4087                 } else {
4088                         pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va),
4089                                                   NULL, entry, va);
4090                         ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
4091                 }
4092                 origpte = *ptep;
4093                 cpu_ccfence();
4094                 KKASSERT(origpte == 0 ||
4095                          (origpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0);
4096         } else {
4097                 if (va >= VM_MAX_USER_ADDRESS) {
4098                         /*
4099                          * Kernel map, pv_entry-tracked.
4100                          */
4101                         pt_pv = NULL;
4102                         pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL);
4103                         ptep = vtopte(va);
4104                 } else {
4105                         /*
4106                          * User map
4107                          */
4108                         pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va),
4109                                                    &pt_pv, entry, va);
4110                         ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
4111                 }
4112                 origpte = *ptep;
4113                 cpu_ccfence();
4114                 KKASSERT(origpte == 0 ||
4115                          (origpte & pmap->pmap_bits[PG_MANAGED_IDX]));
4116         }
4117
4118         pa = VM_PAGE_TO_PHYS(m);
4119         opa = origpte & PG_FRAME;
4120
4121         newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) |
4122                  pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]);
4123         if (wired)
4124                 newpte |= pmap->pmap_bits[PG_W_IDX];
4125         if (va < VM_MAX_USER_ADDRESS)
4126                 newpte |= pmap->pmap_bits[PG_U_IDX];
4127         if (pte_pv)
4128                 newpte |= pmap->pmap_bits[PG_MANAGED_IDX];
4129 //      if (pmap == &kernel_pmap)
4130 //              newpte |= pgeflag;
4131         newpte |= pmap->pmap_cache_bits[m->pat_mode];
4132         if (m->flags & PG_FICTITIOUS)
4133                 newpte |= pmap->pmap_bits[PG_DEVICE_IDX];
4134
4135         /*
4136          * It is possible for multiple faults to occur in threaded
4137          * environments, the existing pte might be correct.
4138          */
4139         if (((origpte ^ newpte) & ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] |
4140             pmap->pmap_bits[PG_A_IDX])) == 0)
4141                 goto done;
4142
4143         if ((prot & VM_PROT_NOSYNC) == 0)
4144                 pmap_inval_init(&info);
4145
4146         /*
4147          * Ok, either the address changed or the protection or wiring
4148          * changed.
4149          *
4150          * Clear the current entry, interlocking the removal.  For managed
4151          * pte's this will also flush the modified state to the vm_page.
4152          * Atomic ops are mandatory in order to ensure that PG_M events are
4153          * not lost during any transition.
4154          *
4155          * WARNING: The caller has busied the new page but not the original
4156          *          vm_page which we are trying to replace.  Because we hold
4157          *          the pte_pv lock, but have not busied the page, PG bits
4158          *          can be cleared out from under us.
4159          */
4160         if (opa) {
4161                 if (pte_pv) {
4162                         /*
4163                          * pmap_remove_pv_pte() unwires pt_pv and assumes
4164                          * we will free pte_pv, but since we are reusing
4165                          * pte_pv we want to retain the wire count.
4166                          *
4167                          * pt_pv won't exist for a kernel page (managed or
4168                          * otherwise).
4169                          */
4170                         if (pt_pv)
4171                                 vm_page_wire_quick(pt_pv->pv_m);
4172                         if (prot & VM_PROT_NOSYNC)
4173                                 pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
4174                         else
4175                                 pmap_remove_pv_pte(pte_pv, pt_pv, &info);
4176                         if (pte_pv->pv_m)
4177                                 pmap_remove_pv_page(pte_pv);
4178                 } else if (prot & VM_PROT_NOSYNC) {
4179                         /*
4180                          * Unmanaged page, NOSYNC (no mmu sync) requested.
4181                          *
4182                          * Leave wire count on PT page intact.
4183                          */
4184                         (void)pte_load_clear(ptep);
4185                         cpu_invlpg((void *)va);
4186                         atomic_add_long(&pmap->pm_stats.resident_count, -1);
4187                 } else {
4188                         /*
4189                          * Unmanaged page, normal enter.
4190                          *
4191                          * Leave wire count on PT page intact.
4192                          */
4193                         pmap_inval_interlock(&info, pmap, va);
4194                         (void)pte_load_clear(ptep);
4195                         pmap_inval_deinterlock(&info, pmap);
4196                         atomic_add_long(&pmap->pm_stats.resident_count, -1);
4197                 }
4198                 KKASSERT(*ptep == 0);
4199         }
4200
4201 #ifdef PMAP_DEBUG2
4202         if (pmap_enter_debug > 0) {
4203                 --pmap_enter_debug;
4204                 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p"
4205                         " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n",
4206                         va, m,
4207                         origpte, newpte, ptep,
4208                         pte_pv, pt_pv, opa, prot);
4209         }
4210 #endif
4211
4212         if (pte_pv) {
4213                 /*
4214                  * Enter on the PV list if part of our managed memory.
4215                  * Wiring of the PT page is already handled.
4216                  */
4217                 KKASSERT(pte_pv->pv_m == NULL);
4218                 vm_page_spin_lock(m);
4219                 pte_pv->pv_m = m;
4220                 pmap_page_stats_adding(m);
4221                 TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
4222                 vm_page_flag_set(m, PG_MAPPED);
4223                 vm_page_spin_unlock(m);
4224         } else if (pt_pv && opa == 0) {
4225                 /*
4226                  * We have to adjust the wire count on the PT page ourselves
4227                  * for unmanaged entries.  If opa was non-zero we retained
4228                  * the existing wire count from the removal.
4229                  */
4230                 vm_page_wire_quick(pt_pv->pv_m);
4231         }
4232
4233         /*
4234          * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks.
4235          *
4236          * User VMAs do not because those will be zero->non-zero, so no
4237          * stale entries to worry about at this point.
4238          *
4239          * For KVM there appear to still be issues.  Theoretically we
4240          * should be able to scrap the interlocks entirely but we
4241          * get crashes.
4242          */
4243         if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
4244                 pmap_inval_interlock(&info, pmap, va);
4245
4246         /*
4247          * Set the pte
4248          */
4249         *(volatile pt_entry_t *)ptep = newpte;
4250
4251         if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
4252                 pmap_inval_deinterlock(&info, pmap);
4253         else if (pt_pv == NULL)
4254                 cpu_invlpg((void *)va);
4255
4256         if (wired) {
4257                 if (pte_pv) {
4258                         atomic_add_long(&pte_pv->pv_pmap->pm_stats.wired_count,
4259                                         1);
4260                 } else {
4261                         atomic_add_long(&pmap->pm_stats.wired_count, 1);
4262                 }
4263         }
4264         if (newpte & pmap->pmap_bits[PG_RW_IDX])
4265                 vm_page_flag_set(m, PG_WRITEABLE);
4266
4267         /*
4268          * Unmanaged pages need manual resident_count tracking.
4269          */
4270         if (pte_pv == NULL && pt_pv)
4271                 atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1);
4272
4273         /*
4274          * Cleanup
4275          */
4276         if ((prot & VM_PROT_NOSYNC) == 0 || pte_pv == NULL)
4277                 pmap_inval_done(&info);
4278 done:
4279         KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 ||
4280                  (m->flags & PG_MAPPED));
4281
4282         /*
4283          * Cleanup the pv entry, allowing other accessors.
4284          */
4285         if (pte_pv)
4286                 pv_put(pte_pv);
4287         if (pt_pv)
4288                 pv_put(pt_pv);
4289 }
4290
4291 /*
4292  * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
4293  * This code also assumes that the pmap has no pre-existing entry for this
4294  * VA.
4295  *
4296  * This code currently may only be used on user pmaps, not kernel_pmap.
4297  */
4298 void
4299 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
4300 {
4301         pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL);
4302 }
4303
4304 /*
4305  * Make a temporary mapping for a physical address.  This is only intended
4306  * to be used for panic dumps.
4307  *
4308  * The caller is responsible for calling smp_invltlb().
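 *
 * A minimal usage sketch (hypothetical caller; names are illustrative only):
 *
 *	base = pmap_kenter_temporary(pa, i);
 *	smp_invltlb();
 *	... the page is now accessible at (char *)base + i * PAGE_SIZE ...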
4309  */
4310 void *
4311 pmap_kenter_temporary(vm_paddr_t pa, long i)
4312 {
4313         pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
4314         return ((void *)crashdumpmap);
4315 }
4316
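/* Limit on the number of pages pmap_object_init_pt() will preload */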
4317 #define MAX_INIT_PT (96)
4318
4319 /*
4320  * This routine preloads the ptes for a given object into the specified pmap.
4321  * This eliminates the blast of soft faults on process startup and
4322  * immediately after an mmap.
4323  */
4324 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
4325
4326 void
4327 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
4328                     vm_object_t object, vm_pindex_t pindex,
4329                     vm_size_t size, int limit)
4330 {
4331         struct rb_vm_page_scan_info info;
4332         struct lwp *lp;
4333         vm_size_t psize;
4334
4335         /*
4336          * We can't preinit if read access isn't set or there is no pmap
4337          * or object.
4338          */
4339         if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
4340                 return;
4341
4342         /*
4343          * We can't preinit if the pmap is not the current pmap
4344          */
4345         lp = curthread->td_lwp;
4346         if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
4347                 return;
4348
4349         /*
4350          * Misc additional checks
4351          */
4352         psize = x86_64_btop(size);
4353
4354         if ((object->type != OBJT_VNODE) ||
4355                 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
4356                         (object->resident_page_count > MAX_INIT_PT))) {
4357                 return;
4358         }
4359
4360         if (pindex + psize > object->size) {
4361                 if (object->size < pindex)
4362                         return;           
4363                 psize = object->size - pindex;
4364         }
4365
4366         if (psize == 0)
4367                 return;
4368
4369         /*
4370          * If everything is segment-aligned do not pre-init here.  Instead
4371          * allow the normal vm_fault path to pass a segment hint to
4372          * pmap_enter() which will then use an object-referenced shared
4373          * page table page.
4374          */
4375         if ((addr & SEG_MASK) == 0 &&
4376             (ctob(psize) & SEG_MASK) == 0 &&
4377             (ctob(pindex) & SEG_MASK) == 0) {
4378                 return;
4379         }
4380
4381         /*
4382          * Use a red-black scan to traverse the requested range and load
4383          * any valid pages found into the pmap.
4384          *
4385          * We cannot safely scan the object's memq without holding the
4386          * object token.
4387          */
4388         info.start_pindex = pindex;
4389         info.end_pindex = pindex + psize - 1;
4390         info.limit = limit;
4391         info.mpte = NULL;
4392         info.addr = addr;
4393         info.pmap = pmap;
4394
4395         vm_object_hold_shared(object);
4396         vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
4397                                 pmap_object_init_pt_callback, &info);
4398         vm_object_drop(object);
4399 }
4400
4401 static
4402 int
4403 pmap_object_init_pt_callback(vm_page_t p, void *data)
4404 {
4405         struct rb_vm_page_scan_info *info = data;
4406         vm_pindex_t rel_index;
4407
4408         /*
4409          * Don't allow an madvise-triggered prefault to dig into the
4410          * free page reserve by allocating pv entries.
4411          */
4412         if ((info->limit & MAP_PREFAULT_MADVISE) &&
4413                 vmstats.v_free_count < vmstats.v_free_reserved) {
4414                     return(-1);
4415         }
4416
4417         /*
4418          * Ignore list markers and ignore pages we cannot instantly
4419          * busy (while holding the object token).
4420          */
4421         if (p->flags & PG_MARKER)
4422                 return 0;
4423         if (vm_page_busy_try(p, TRUE))
4424                 return 0;
4425         if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
4426             (p->flags & PG_FICTITIOUS) == 0) {
4427                 if ((p->queue - p->pc) == PQ_CACHE)
4428                         vm_page_deactivate(p);
4429                 rel_index = p->pindex - info->start_pindex;
4430                 pmap_enter_quick(info->pmap,
4431                                  info->addr + x86_64_ptob(rel_index), p);
4432         }
4433         vm_page_wakeup(p);
4434         lwkt_yield();
4435         return(0);
4436 }
4437
4438 /*
4439  * Return TRUE if the pmap is in shape to trivially pre-fault the specified
4440  * address.
4441  *
4442  * Returns FALSE if it would be non-trivial or if a pte is already loaded
4443  * into the slot.
4444  *
4445  * XXX This is safe only because page table pages are not freed.
4446  */
4447 int
4448 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
4449 {
4450         pt_entry_t *pte;
4451
4452         /*spin_lock(&pmap->pm_spin);*/
4453         if ((pte = pmap_pte(pmap, addr)) != NULL) {
4454                 if (*pte & pmap->pmap_bits[PG_V_IDX]) {
4455                         /*spin_unlock(&pmap->pm_spin);*/
4456                         return FALSE;
4457                 }
4458         }
4459         /*spin_unlock(&pmap->pm_spin);*/
4460         return TRUE;
4461 }
4462
4463 /*
4464  * Change the wiring attribute for a pmap/va pair.  The mapping must already
4465  * exist in the pmap.  The mapping may or may not be managed.
4466  */
4467 void
4468 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
4469                    vm_map_entry_t entry)
4470 {
4471         pt_entry_t *ptep;
4472         pv_entry_t pv;
4473
4474         if (pmap == NULL)
4475                 return;
4476         lwkt_gettoken(&pmap->pm_token);
4477         pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), NULL, entry, va);
4478         ptep = pv_pte_lookup(pv, pmap_pte_index(va));
4479
4480         if (wired && !pmap_pte_w(pmap, ptep))
4481                 atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, 1);
4482         else if (!wired && pmap_pte_w(pmap, ptep))
4483                 atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, -1);
4484
4485         /*
4486          * Wiring is not a hardware characteristic so there is no need to
4487          * invalidate the TLB.  However, in an SMP environment we must use
4488          * a locked bus cycle to update the pte (when not using the
4489          * pmap_inval_*() API)... it's ok to do this for simple
4490          * wiring changes.
4491          */
4492         if (wired)
4493                 atomic_set_long(ptep, pmap->pmap_bits[PG_W_IDX]);
4494         else
4495                 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]);
4496         pv_put(pv);
4497         lwkt_reltoken(&pmap->pm_token);
4498 }
4499
4500
4501
4502 /*
4503  * Copy the range specified by src_addr/len from the source map to
4504  * the range dst_addr/len in the destination map.
4505  *
4506  * This routine is only advisory and need not do anything.
4507  */
4508 void
4509 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 
4510           vm_size_t len, vm_offset_t src_addr)
4511 {
4512 }       
4513
4514 /*
4515  * pmap_zero_page:
4516  *
4517  *      Zero the specified physical page.
4518  *
4519  *      This function may be called from an interrupt and no locking is
4520  *      required.
4521  */
4522 void
4523 pmap_zero_page(vm_paddr_t phys)
4524 {
4525         vm_offset_t va = PHYS_TO_DMAP(phys);
4526
4527         pagezero((void *)va);
4528 }
4529
4530 /*
4531  * pmap_page_assertzero:
4532  *
4533  *      Assert that a page is empty, panic if it isn't.
4534  */
4535 void
4536 pmap_page_assertzero(vm_paddr_t phys)
4537 {
4538         vm_offset_t va = PHYS_TO_DMAP(phys);
4539         size_t i;
4540
4541         for (i = 0; i < PAGE_SIZE; i += sizeof(long)) {
4542                 if (*(long *)((char *)va + i) != 0) {
4543                         panic("pmap_page_assertzero() @ %p not zero!",
4544                               (void *)(intptr_t)va);
4545                 }
4546         }
4547 }
4548
4549 /*
4550  * pmap_zero_page_area:
4551  *
4552  *      Zero part of a physical page by mapping it into memory and clearing
4553  *      its contents with bzero.
4554  *
4555  *      off and size may not cover an area beyond a single hardware page.
4556  */
4557 void
4558 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
4559 {
4560         vm_offset_t virt = PHYS_TO_DMAP(phys);
4561
4562         bzero((char *)virt + off, size);
4563 }
4564
4565 /*
4566  * pmap_copy_page:
4567  *
4568  *      Copy the physical page from the source PA to the target PA.
4569  *      This function may be called from an interrupt.  No locking
4570  *      is required.
4571  */
4572 void
4573 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
4574 {
4575         vm_offset_t src_virt, dst_virt;
4576
4577         src_virt = PHYS_TO_DMAP(src);
4578         dst_virt = PHYS_TO_DMAP(dst);
4579         bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
4580 }
4581
4582 /*
4583  * pmap_copy_page_frag:
4584  *
4585  *      Copy part of a physical page from the source PA to the target PA.
4586  *      This function may be called from an interrupt.  No locking
4587  *      is required.
4588  */
4589 void
4590 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
4591 {
4592         vm_offset_t src_virt, dst_virt;
4593
4594         src_virt = PHYS_TO_DMAP(src);
4595         dst_virt = PHYS_TO_DMAP(dst);
4596
4597         bcopy((char *)src_virt + (src & PAGE_MASK),
4598               (char *)dst_virt + (dst & PAGE_MASK),
4599               bytes);
4600 }
4601
4602 /*
4603  * Returns true if the pmap's pv is one of the first 16 pvs linked to from
4604  * this page.  This count may be changed upwards or downwards in the future;
4605  * it is only necessary that true be returned for a small subset of pmaps
4606  * for proper page aging.
4607  */
4608 boolean_t
4609 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4610 {
4611         pv_entry_t pv;
4612         int loops = 0;
4613
4614         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4615                 return FALSE;
4616
4617         vm_page_spin_lock(m);
4618         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4619                 if (pv->pv_pmap == pmap) {
4620                         vm_page_spin_unlock(m);
4621                         return TRUE;
4622                 }
4623                 loops++;
4624                 if (loops >= 16)
4625                         break;
4626         }
4627         vm_page_spin_unlock(m);
4628         return (FALSE);
4629 }
4630
4631 /*
4632  * Remove all pages from the specified address space; this aids process
4633  * exit speed.  Also, this code may be special-cased for the current
4634  * process only.
4635  */
4636 void
4637 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4638 {
4639         pmap_remove_noinval(pmap, sva, eva);
4640         cpu_invltlb();
4641 }
4642
4643 /*
4644  * pmap_testbit tests bits in ptes.  Note that the testbit/clearbit
4645  * routines are inline, so a lot of things compile-time evaluate.
4646  */
4647 static
4648 boolean_t
4649 pmap_testbit(vm_page_t m, int bit)
4650 {
4651         pv_entry_t pv;
4652         pt_entry_t *pte;
4653         pmap_t pmap;
4654
4655         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4656                 return FALSE;
4657
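        /*
         * Quick unlocked check, then recheck the pv list under the
         * vm_page spin lock before scanning it.
         */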
4658         if (TAILQ_FIRST(&m->md.pv_list) == NULL)
4659                 return FALSE;
4660         vm_page_spin_lock(m);
4661         if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
4662                 vm_page_spin_unlock(m);
4663                 return FALSE;
4664         }
4665
4666         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4667
4668 #if defined(PMAP_DIAGNOSTIC)
4669                 if (pv->pv_pmap == NULL) {
4670                         kprintf("Null pmap (tb) at pindex: %"PRIu64"\n",
4671                             pv->pv_pindex);
4672                         continue;
4673                 }
4674 #endif
4675                 pmap = pv->pv_pmap;
4676
4677                 /*
4678                  * If the bit being tested is the accessed or modified
4679                  * bit we only check mappings which are subject to
4680                  * modified/accessed tracking (pmap_track_modified()).
4681                  *
4682                  * WARNING!  Because we do not lock the pv, *pte can be in a
4683                  *           state of flux.  Despite this the value of *pte
4684                  *           will still be related to the vm_page in some way
4685                  *           because the pv cannot be destroyed as long as we
4686                  *           hold the vm_page spin lock.
4687                  */
4688                 if (bit == PG_A_IDX || bit == PG_M_IDX) {
4690                         if (!pmap_track_modified(pv->pv_pindex))
4691                                 continue;
4692                 }
4693
4694                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4695                 if (*pte & pmap->pmap_bits[bit]) {
4696                         vm_page_spin_unlock(m);
4697                         return TRUE;
4698                 }
4699         }
4700         vm_page_spin_unlock(m);
4701         return (FALSE);
4702 }
4703
4704 /*
4705  * This routine is used to modify bits in ptes.  Only one bit should be
4706  * specified.  PG_RW requires special handling.
4707  *
4708  * Caller must NOT hold any spin locks
4709  */
4710 static __inline
4711 void
4712 pmap_clearbit(vm_page_t m, int bit_index)
4713 {
4714         struct pmap_inval_info info;
4715         pv_entry_t pv;
4716         pt_entry_t *pte;
4717         pt_entry_t pbits;
4718         pmap_t pmap;
4719
4720         if (bit_index == PG_RW_IDX)
4721                 vm_page_flag_clear(m, PG_WRITEABLE);
4722         if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
4723                 return;
4724         }
4725
4726         /*
4727          * PG_M or PG_A case
4728          *
4729          * Loop over all current mappings, setting/clearing as appropriate.
4730          * If setting RO, do we need to clear the VAC?
4731          *
4732          * NOTE: When clearing PG_M we could also (not implemented) drop
4733          *       through to the PG_RW code and clear PG_RW too, forcing
4734          *       a fault on write to redetect PG_M for virtual kernels, but
4735          *       it isn't necessary since virtual kernels invalidate the
4736          *       pte when they clear the VPTE_M bit in their virtual page
4737          *       tables.
4738          *
4739          * NOTE: Does not re-dirty the page when clearing only PG_M.
4740          *
4741          * NOTE: Because we do not lock the pv, *pte can be in a state of
4742          *       flux.  Despite this the value of *pte is still somewhat
4743          *       related to the vm_page while we hold the vm_page spin lock.
4744          *
4745          *       *pte can be zero due to this race.  Since we are clearing
4746          *       bits we basically do no harm when this race occurs.
4747          */
4748         if (bit_index != PG_RW_IDX) {
4749                 vm_page_spin_lock(m);
4750                 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4751 #if defined(PMAP_DIAGNOSTIC)
4752                         if (pv->pv_pmap == NULL) {
4753                                 kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
4754                                     pv->pv_pindex);
4755                                 continue;
4756                         }
4757 #endif
4758                         pmap = pv->pv_pmap;
4759                         pte = pmap_pte_quick(pv->pv_pmap,
4760                                              pv->pv_pindex << PAGE_SHIFT);
4761                         pbits = *pte;
4762                         if (pbits & pmap->pmap_bits[bit_index])
4763                                 atomic_clear_long(pte, pmap->pmap_bits[bit_index]);
4764                 }
4765                 vm_page_spin_unlock(m);
4766                 return;
4767         }
4768
4769         /*
4770          * Clear PG_RW.  Also clears PG_M and marks the page dirty if PG_M
4771          * was set.
4772          */
4773         pmap_inval_init(&info);
4774
4775 restart:
4776         vm_page_spin_lock(m);
4777         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4778                 /*
4779                  * don't write protect pager mappings
4780                  */
4781                 if (!pmap_track_modified(pv->pv_pindex))
4782                         continue;
4783
4784 #if defined(PMAP_DIAGNOSTIC)
4785                 if (pv->pv_pmap == NULL) {
4786                         kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
4787                             pv->pv_pindex);
4788                         continue;
4789                 }
4790 #endif
4791                 pmap = pv->pv_pmap;
4792                 /*
4793                  * Skip pages which do not have PG_RW set.
4794                  */
4795                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4796                 if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0)
4797                         continue;
4798
4799                 /*
4800                  * Lock the PV
4801                  */
4802                 if (pv_hold_try(pv)) {
4803                         vm_page_spin_unlock(m);
4804                 } else {
4805                         vm_page_spin_unlock(m);
4806                         pv_lock(pv);    /* held, now do a blocking lock */
4807                 }
4808                 if (pv->pv_pmap != pmap || pv->pv_m != m) {
4809                         pv_put(pv);     /* and release */
4810                         goto restart;   /* anything could have happened */
4811                 }
4812                 pmap_inval_interlock(&info, pmap,
4813                                      (vm_offset_t)pv->pv_pindex << PAGE_SHIFT);
4814                 KKASSERT(pv->pv_pmap == pmap);
4815                 for (;;) {
4816                         pbits = *pte;
4817                         cpu_ccfence();
4818                         if (atomic_cmpset_long(pte, pbits, pbits &
4819                             ~(pmap->pmap_bits[PG_RW_IDX] |
4820                             pmap->pmap_bits[PG_M_IDX]))) {
4821                                 break;
4822                         }
4823                 }
4824                 pmap_inval_deinterlock(&info, pmap);
4825                 vm_page_spin_lock(m);
4826
4827                 /*
4828                  * If PG_M was found to be set while we were clearing PG_RW
4829                  * we also clear PG_M (done above) and mark the page dirty.
4830                  * Callers expect this behavior.
4831                  */
4832                 if (pbits & pmap->pmap_bits[PG_M_IDX])
4833                         vm_page_dirty(m);
4834                 pv_put(pv);
4835         }
4836         vm_page_spin_unlock(m);
4837         pmap_inval_done(&info);
4838 }
4839
4840 /*
4841  * Lower the permission for all mappings to a given page.
4842  *
4843  * Page must be busied by caller.  Because page is busied by caller this
4844  * should not be able to race a pmap_enter().
4845  */
4846 void
4847 pmap_page_protect(vm_page_t m, vm_prot_t prot)
4848 {
4849         /* JG NX support? */
4850         if ((prot & VM_PROT_WRITE) == 0) {
4851                 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
4852                         /*
4853                          * NOTE: pmap_clearbit(.. PG_RW) also clears
4854                          *       the PG_WRITEABLE flag in (m).
4855                          */
4856                         pmap_clearbit(m, PG_RW_IDX);
4857                 } else {
4858                         pmap_remove_all(m);
4859                 }
4860         }
4861 }
4862
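/*
 * Convert a physical page index to a physical address.
 */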
4863 vm_paddr_t
4864 pmap_phys_address(vm_pindex_t ppn)
4865 {
4866         return (x86_64_ptob(ppn));
4867 }
4868
4869 /*
4870  * Return a count of reference bits for a page, clearing those bits.
4871  * It is not necessary for every reference bit to be cleared, but it
4872  * is necessary that 0 only be returned when there are truly no
4873  * reference bits set.
4874  *
4875  * XXX: The exact number of bits to check and clear is a matter that
4876  * should be tested and standardized at some point in the future for
4877  * optimal aging of shared pages.
4878  *
4879  * This routine may not block.
4880  */
4881 int
4882 pmap_ts_referenced(vm_page_t m)
4883 {
4884         pv_entry_t pv;
4885         pt_entry_t *pte;
4886         pmap_t pmap;
4887         int rtval = 0;
4888
4889         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
4890                 return (rtval);
4891
4892         vm_page_spin_lock(m);
4893         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4894                 if (!pmap_track_modified(pv->pv_pindex))
4895                         continue;
4896                 pmap = pv->pv_pmap;
4897                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
4898                 if (pte && (*pte & pmap->pmap_bits[PG_A_IDX])) {
4899                         atomic_clear_long(pte, pmap->pmap_bits[PG_A_IDX]);
4900                         rtval++;
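                        /*
                         * Per the function header, clearing only a few
                         * reference bits is sufficient.
                         */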
4901                         if (rtval > 4)
4902                                 break;
4903                 }
4904         }
4905         vm_page_spin_unlock(m);
4906         return (rtval);
4907 }
4908
4909 /*
4910  *      pmap_is_modified:
4911  *
4912  *      Return whether or not the specified physical page was modified
4913  *      in any physical maps.
4914  */
4915 boolean_t
4916 pmap_is_modified(vm_page_t m)
4917 {
4918         boolean_t res;
4919
4920         res = pmap_testbit(m, PG_M_IDX);
4921         return (res);
4922 }
4923
4924 /*
4925  *      Clear the modify bits on the specified physical page.
4926  */
4927 void
4928 pmap_clear_modify(vm_page_t m)
4929 {
4930         pmap_clearbit(m, PG_M_IDX);
4931 }
4932
4933 /*
4934  *      pmap_clear_reference:
4935  *
4936  *      Clear the reference bit on the specified physical page.
4937  */
4938 void
4939 pmap_clear_reference(vm_page_t m)
4940 {
4941         pmap_clearbit(m, PG_A_IDX);
4942 }
4943
4944 /*
4945  * Miscellaneous support routines follow
4946  */
4947
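/*
 * Build the protection_codes[] table, translating VM_PROT_* combinations
 * into the corresponding pte permission bits.
 */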
4948 static
4949 void
4950 i386_protection_init(void)
4951 {
4952         int *kp, prot;
4953
4954         /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit  */
4955         kp = protection_codes;
4956         for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) {
4957                 switch (prot) {
4958                 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
4959                         /*
4960                          * Read access is also 0. There isn't any execute bit,
4961                          * so just make it readable.
4962                          */
4963                 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
4964                 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
4965                 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
4966                         *kp++ = 0;
4967                         break;
4968                 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
4969                 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
4970                 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
4971                 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
4972                         *kp++ = pmap_bits_default[PG_RW_IDX];
4973                         break;
4974                 }
4975         }
4976 }
4977
4978 /*
4979  * Map a set of physical memory pages into the kernel virtual
4980  * address space. Return a pointer to where it is mapped. This
4981  * routine is intended to be used for mapping device memory,
4982  * NOT real memory.
4983  *
4984  * NOTE: We can't use pgeflag unless we invalidate the pages one at
4985  *       a time.
4986  *
4987  * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE}
4988  *       work whether the cpu supports PAT or not.  The remaining PAT
4989  *       attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu
4990  *       supports PAT.
4991  */
4992 void *
4993 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4994 {
4995         return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4996 }
4997
4998 void *
4999 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
5000 {
5001         return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5002 }
5003
5004 void *
5005 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5006 {
5007         return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
5008 }
5009
5010 /*
5011  * Map a set of physical memory pages into the kernel virtual
5012  * address space. Return a pointer to where it is mapped. This
5013  * routine is intended to be used for mapping device memory,
5014  * NOT real memory.
5015  */
5016 void *
5017 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5018 {
5019         vm_offset_t va, tmpva, offset;
5020         pt_entry_t *pte;
5021         vm_size_t tmpsize;
5022
5023         offset = pa & PAGE_MASK;
5024         size = roundup(offset + size, PAGE_SIZE);
5025
5026         va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
5027         if (va == 0)
5028                 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
5029
5030         pa = pa & ~PAGE_MASK;
5031         for (tmpva = va, tmpsize = size; tmpsize > 0;) {
5032                 pte = vtopte(tmpva);
5033                 *pte = pa |
5034                     kernel_pmap.pmap_bits[PG_RW_IDX] |
5035                     kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */
5036                     kernel_pmap.pmap_cache_bits[mode];
5037                 tmpsize -= PAGE_SIZE;
5038                 tmpva += PAGE_SIZE;
5039                 pa += PAGE_SIZE;
5040         }
5041         pmap_invalidate_range(&kernel_pmap, va, va + size);
5042         pmap_invalidate_cache_range(va, va + size);
5043
5044         return ((void *)(va + offset));
5045 }
5046
5047 void
5048 pmap_unmapdev(vm_offset_t va, vm_size_t size)
5049 {
5050         vm_offset_t base, offset;
5051
5052         base = va & ~PAGE_MASK;
5053         offset = va & PAGE_MASK;
5054         size = roundup(offset + size, PAGE_SIZE);
5055         pmap_qremove(va, size >> PAGE_SHIFT);
5056         kmem_free(&kernel_map, base, size);
5057 }
5058
5059 /*
5060  * Sets the memory attribute for the specified page.
5061  */
5062 void
5063 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5064 {
5065
5066         m->pat_mode = ma;
5067
5068         /*
5069          * If "m" is a normal page, update its direct mapping.  This update
5070          * can be relied upon to perform any cache operations that are
5071          * required for data coherence.
5072          */
5073         if ((m->flags & PG_FICTITIOUS) == 0)
5074                 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
5075                                  m->pat_mode);
5076 }
5077
5078 /*
5079  * Change the PAT attribute on an existing kernel memory map.  Caller
5080  * must ensure that the virtual memory in question is not accessed
5081  * during the adjustment.
5082  */
5083 void
5084 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
5085 {
5086         pt_entry_t *pte;
5087         vm_offset_t base;
5088         int changed = 0;
5089
5090         if (va == 0)
5091                 panic("pmap_change_attr: va is NULL");
5092         base = trunc_page(va);
5093
5094         while (count) {
5095                 pte = vtopte(va);
5096                 *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) |
5097                        kernel_pmap.pmap_cache_bits[mode];
5098                 --count;
5099                 va += PAGE_SIZE;
5100         }
5101
5102         changed = 1;    /* XXX: not optimal */
5103
5104         /*
5105          * Flush CPU caches if required to make sure any data isn't cached that
5106          * shouldn't be, etc.
5107          */
5108         if (changed) {
5109                 pmap_invalidate_range(&kernel_pmap, base, va);
5110                 pmap_invalidate_cache_range(base, va);
5111         }
5112 }
5113
5114 /*
5115  * Perform the pmap work for mincore().
5116  */
5117 int
5118 pmap_mincore(pmap_t pmap, vm_offset_t addr)
5119 {
5120         pt_entry_t *ptep, pte;
5121         vm_page_t m;
5122         int val = 0;
5123         
5124         lwkt_gettoken(&pmap->pm_token);
5125         ptep = pmap_pte(pmap, addr);
5126
5127         if (ptep && (pte = *ptep) != 0) {
5128                 vm_offset_t pa;
5129
5130                 val = MINCORE_INCORE;
5131                 if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0)
5132                         goto done;
5133
5134                 pa = pte & PG_FRAME;
5135
5136                 if (pte & pmap->pmap_bits[PG_DEVICE_IDX])
5137                         m = NULL;
5138                 else
5139                         m = PHYS_TO_VM_PAGE(pa);
5140
5141                 /*
5142                  * Modified by us
5143                  */
5144                 if (pte & pmap->pmap_bits[PG_M_IDX])
5145                         val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
5146                 /*
5147                  * Modified by someone
5148                  */
5149                 else if (m && (m->dirty || pmap_is_modified(m)))
5150                         val |= MINCORE_MODIFIED_OTHER;
5151                 /*
5152                  * Referenced by us
5153                  */
5154                 if (pte & pmap->pmap_bits[PG_A_IDX])
5155                         val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
5156
5157                 /*
5158                  * Referenced by someone
5159                  */
5160                 else if (m && ((m->flags & PG_REFERENCED) ||
5161                                 pmap_ts_referenced(m))) {
5162                         val |= MINCORE_REFERENCED_OTHER;
5163                         vm_page_flag_set(m, PG_REFERENCED);
5164                 }
5165         } 
5166 done:
5167         lwkt_reltoken(&pmap->pm_token);
5168
5169         return val;
5170 }
5171
5172 /*
5173  * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
5174  * vmspace will be ref'd and the old one will be deref'd.
5175  *
5176  * The vmspace for all lwps associated with the process will be adjusted
5177  * and cr3 will be reloaded if any lwp is the current lwp.
5178  *
5179  * The process must hold the vmspace->vm_map.token for oldvm and newvm
5180  */
5181 void
5182 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
5183 {
5184         struct vmspace *oldvm;
5185         struct lwp *lp;
5186
5187         oldvm = p->p_vmspace;
5188         if (oldvm != newvm) {
5189                 if (adjrefs)
5190                         sysref_get(&newvm->vm_sysref);
5191                 p->p_vmspace = newvm;
5192                 KKASSERT(p->p_nthreads == 1);
5193                 lp = RB_ROOT(&p->p_lwp_tree);
5194                 pmap_setlwpvm(lp, newvm);
5195                 if (adjrefs)
5196                         sysref_put(&oldvm->vm_sysref);
5197         }
5198 }
5199
5200 /*
5201  * Set the vmspace for a LWP.  The vmspace is almost universally set the
5202  * same as the process vmspace, but virtual kernels need to swap out contexts
5203  * on a per-lwp basis.
5204  *
5205  * Caller does not necessarily hold any vmspace tokens.  Caller must control
5206  * the lwp (typically by being in the context of the lwp).  We use a critical
5207  * section to protect against statclock and hardclock (statistics collection).
5208  */
5209 void
5210 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
5211 {
5212         struct vmspace *oldvm;
5213         struct pmap *pmap;
5214
5215         oldvm = lp->lwp_vmspace;
5216
5217         if (oldvm != newvm) {
5218                 crit_enter();
5219                 lp->lwp_vmspace = newvm;
5220                 if (curthread->td_lwp == lp) {
5221                         pmap = vmspace_pmap(newvm);
5222                         atomic_set_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
5223                         if (pmap->pm_active_lock & CPULOCK_EXCL)
5224                                 pmap_interlock_wait(newvm);
5225 #if defined(SWTCH_OPTIM_STATS)
5226                         tlb_flush_count++;
5227 #endif
5228                         if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) {
5229                                 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
5230                         } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) {
5231                                 curthread->td_pcb->pcb_cr3 = KPML4phys;
5232                         } else {
5233                                 panic("pmap_setlwpvm: unknown pmap type\n");
5234                         }
5235                         load_cr3(curthread->td_pcb->pcb_cr3);
5236                         pmap = vmspace_pmap(oldvm);
5237                         atomic_clear_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
5238                 }
5239                 crit_exit();
5240         }
5241 }
5242
5243 /*
5244  * Called when switching to a locked pmap, used to interlock against pmaps
5245  * undergoing modifications to prevent us from activating the MMU for the
5246  * target pmap until all such modifications have completed.  We have to do
5247  * this because the thread making the modifications has already set up its
5248  * SMP synchronization mask.
5249  *
5250  * This function cannot sleep!
5251  *
5252  * No requirements.
5253  */
5254 void
5255 pmap_interlock_wait(struct vmspace *vm)
5256 {
5257         struct pmap *pmap = &vm->vm_pmap;
5258
5259         if (pmap->pm_active_lock & CPULOCK_EXCL) {
5260                 crit_enter();
5261                 KKASSERT(curthread->td_critcount >= 2);
5262                 DEBUG_PUSH_INFO("pmap_interlock_wait");
5263                 while (pmap->pm_active_lock & CPULOCK_EXCL) {
5264                         cpu_ccfence();
5265                         lwkt_process_ipiq();
5266                 }
5267                 DEBUG_POP_INFO();
5268                 crit_exit();
5269         }
5270 }
5271
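/*
 * Return an address hint for mmap.  Device-backed objects of at least
 * segment size (NBPDR) get a segment-aligned hint; everything else is
 * returned unchanged.
 */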
5272 vm_offset_t
5273 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
5274 {
5275
5276         if ((obj == NULL) || (size < NBPDR) ||
5277             ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) {
5278                 return addr;
5279         }
5280
5281         addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
5282         return addr;
5283 }
5284
5285 /*
5286  * Used by kmalloc/kfree, page already exists at va
5287  */
5288 vm_page_t
5289 pmap_kvtom(vm_offset_t va)
5290 {
5291         pt_entry_t *ptep = vtopte(va);
5292
5293         KKASSERT((*ptep & kernel_pmap.pmap_bits[PG_DEVICE_IDX]) == 0);
5294         return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
5295 }
5296
5297 /*
5298  * Initialize machine-specific shared page directory support.  This
5299  * is executed when a VM object is created.
5300  */
5301 void
5302 pmap_object_init(vm_object_t object)
5303 {
5304         object->md.pmap_rw = NULL;
5305         object->md.pmap_ro = NULL;
5306 }
5307
5308 /*
5309  * Clean up machine-specific shared page directory support.  This
5310  * is executed when a VM object is destroyed.
5311  */
5312 void
5313 pmap_object_free(vm_object_t object)
5314 {
5315         pmap_t pmap;
5316
5317         if ((pmap = object->md.pmap_rw) != NULL) {
5318                 object->md.pmap_rw = NULL;
5319                 pmap_remove_noinval(pmap,
5320                                   VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
5321                 pmap->pm_active = 0;
5322                 pmap_release(pmap);
5323                 pmap_puninit(pmap);
5324                 kfree(pmap, M_OBJPMAP);
5325         }
5326         if ((pmap = object->md.pmap_ro) != NULL) {
5327                 object->md.pmap_ro = NULL;
5328                 pmap_remove_noinval(pmap,
5329                                   VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
5330                 pmap->pm_active = 0;
5331                 pmap_release(pmap);
5332                 pmap_puninit(pmap);
5333                 kfree(pmap, M_OBJPMAP);
5334         }
5335 }