4 * Copyright (c) 1991 Regents of the University of California.
6 * Copyright (c) 1994 John S. Dyson
8 * Copyright (c) 1994 David Greenman
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed by the University of
26 * California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
44 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
48 * Manages physical address maps.
50 * In most cases we hold page table pages busy in order to manipulate them.
53 * PMAP_DEBUG - see platform/pc32/include/pmap.h
56 #include "opt_disable_pse.h"
58 #include "opt_msgbuf.h"
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/kernel.h>
64 #include <sys/msgbuf.h>
65 #include <sys/vmmeter.h>
67 #include <sys/thread.h>
70 #include <vm/vm_param.h>
71 #include <sys/sysctl.h>
73 #include <vm/vm_kern.h>
74 #include <vm/vm_page.h>
75 #include <vm/vm_map.h>
76 #include <vm/vm_object.h>
77 #include <vm/vm_extern.h>
78 #include <vm/vm_pageout.h>
79 #include <vm/vm_pager.h>
80 #include <vm/vm_zone.h>
83 #include <sys/thread2.h>
84 #include <sys/sysref2.h>
85 #include <sys/spinlock2.h>
86 #include <vm/vm_page2.h>
88 #include <machine/cputypes.h>
89 #include <machine/md_var.h>
90 #include <machine/specialreg.h>
91 #include <machine/smp.h>
92 #include <machine_base/apic/apicreg.h>
93 #include <machine/globaldata.h>
94 #include <machine/pmap.h>
95 #include <machine/pmap_inval.h>
97 #define PMAP_KEEP_PDIRS
98 #ifndef PMAP_SHPGPERPROC
99 #define PMAP_SHPGPERPROC 200
100 #define PMAP_PVLIMIT 1400000 /* i386 kvm problems */
103 #if defined(DIAGNOSTIC)
104 #define PMAP_DIAGNOSTIC
109 #if !defined(PMAP_DIAGNOSTIC)
110 #define PMAP_INLINE __inline
116 * Get PDEs and PTEs for user/kernel address space
118 #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
119 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
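/*
 * Worked example (sketch): each PDE maps 4MB (NBPDR), so the top 10 bits
 * of a VA select its page directory entry.  For va = 0xc0401234,
 * va >> PDRSHIFT == 0x301 and pmap_pde(pmap, va) is &pmap->pm_pdir[0x301].
 */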
121 #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0)
122 #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0)
123 #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0)
124 #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0)
125 #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0)
128 * Given a map and a machine independent protection code,
129 * convert to an i386 protection code.
131 #define pte_prot(m, p) \
132 (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
133 static int protection_codes[8];
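/*
 * Example (sketch): pte_prot(pmap, VM_PROT_READ | VM_PROT_WRITE) indexes
 * protection_codes[3].  i386_protection_init() typically fills writable
 * combinations with PG_RW and read/execute-only combinations with 0, since
 * non-PAE i386 ptes cannot distinguish read from execute.
 */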
135 struct pmap kernel_pmap;
136 static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
138 vm_paddr_t avail_start; /* PA of first available physical page */
139 vm_paddr_t avail_end; /* PA of last available physical page */
140 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */
141 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
142 vm_offset_t virtual2_start;
143 vm_offset_t virtual2_end;
144 vm_offset_t KvaStart; /* VA start of KVA space */
145 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */
146 vm_offset_t KvaSize; /* max size of kernel virtual address space */
147 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */
148 static int pgeflag; /* PG_G or-in */
149 static int pseflag; /* PG_PS or-in */
152 static vm_object_t kptobj;
155 vm_offset_t kernel_vm_end;
157 #define PAT_INDEX_SIZE 8
158 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */
159 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/ /* PAT -> PG_ bits */
162 * Data for the pv entry allocation mechanism
164 static vm_zone_t pvzone;
165 static struct vm_zone pvzone_store;
166 static struct vm_object pvzone_obj;
167 static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
168 static int pmap_pagedaemon_waken = 0;
169 static struct pv_entry *pvinit;
172 * Considering all the issues I'm having with pmap caching, if breakage
173 * continues to occur, and for debugging, I've added a sysctl that will
174 * just do an unconditional invltlb.
176 static int dreadful_invltlb;
178 SYSCTL_INT(_vm, OID_AUTO, dreadful_invltlb,
179 CTLFLAG_RW, &dreadful_invltlb, 0, "Debugging sysctl to force invltlb on pmap operations");
182 * All those kernel PT submaps that BSD is so fond of
184 pt_entry_t *CMAP1 = NULL, *ptmmap;
185 caddr_t CADDR1 = NULL, ptvmmap = NULL;
186 static pt_entry_t *msgbufmap;
187 struct msgbuf *msgbufp=NULL;
192 static pt_entry_t *pt_crashdumpmap;
193 static caddr_t crashdumpmap;
195 extern pt_entry_t *SMPpt;
197 static PMAP_INLINE void free_pv_entry (pv_entry_t pv);
198 static unsigned * get_ptbase (pmap_t pmap);
199 static pv_entry_t get_pv_entry (void);
200 static void i386_protection_init (void);
201 static __inline void pmap_clearbit (vm_page_t m, int bit);
203 static void pmap_remove_all (vm_page_t m);
204 static void pmap_remove_pte (struct pmap *pmap, unsigned *ptq,
205 vm_offset_t sva, pmap_inval_info_t info);
206 static void pmap_remove_page (struct pmap *pmap,
207 vm_offset_t va, pmap_inval_info_t info);
208 static void pmap_remove_entry (struct pmap *pmap, vm_page_t m,
209 vm_offset_t va, pmap_inval_info_t info);
210 static boolean_t pmap_testbit (vm_page_t m, int bit);
211 static void pmap_insert_entry (pmap_t pmap, pv_entry_t pv,
212 vm_offset_t va, vm_page_t mpte, vm_page_t m);
214 static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);
216 static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
217 static vm_page_t _pmap_allocpte (pmap_t pmap, unsigned ptepindex);
218 static unsigned * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
219 static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
220 static void pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
221 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
223 static void pmap_hold(pmap_t pmap);
224 static void pmap_drop(pmap_t pmap);
225 static void pmap_wait(pmap_t pmap, int count);
227 static unsigned pdir4mb;
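/*
 * Maintain the per-cpu vmtotal counters as pv entries are added to a page:
 * the first mapping of a page bumps t_arm, a second mapping makes the page
 * "shared" and bumps t_armshr/t_avmshr, and each additional mapping bumps
 * t_avmshr only.  pmap_page_stats_deleting() reverses the accounting.
 */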
231 pmap_page_stats_adding(vm_page_t m)
233 globaldata_t gd = mycpu;
235 if (TAILQ_EMPTY(&m->md.pv_list)) {
236 ++gd->gd_vmtotal.t_arm;
237 } else if (TAILQ_FIRST(&m->md.pv_list) ==
238 TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
239 ++gd->gd_vmtotal.t_armshr;
240 ++gd->gd_vmtotal.t_avmshr;
242 ++gd->gd_vmtotal.t_avmshr;
248 pmap_page_stats_deleting(vm_page_t m)
250 globaldata_t gd = mycpu;
252 if (TAILQ_EMPTY(&m->md.pv_list)) {
253 --gd->gd_vmtotal.t_arm;
254 } else if (TAILQ_FIRST(&m->md.pv_list) ==
255 TAILQ_LAST(&m->md.pv_list, md_page_pv_list)) {
256 --gd->gd_vmtotal.t_armshr;
257 --gd->gd_vmtotal.t_avmshr;
259 --gd->gd_vmtotal.t_avmshr;
264 * Move the kernel virtual free pointer to the next
265 * 4MB. This is used to help improve performance
266 * by using a large (4MB) page for much of the kernel
267 * (.text, .data, .bss)
271 pmap_kmem_choose(vm_offset_t addr)
273 vm_offset_t newaddr = addr;
275 if (cpu_feature & CPUID_PSE) {
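/*
 * With PSE available, round up to the next 4MB (NBPDR) boundary so the
 * kernel can be backed by a large page; e.g. (sketch) 0xc0123000 rounds
 * up to 0xc0400000.
 */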
276 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
283 * This function returns a pointer to the pte entry in the pmap and has
284 * the side effect of potentially retaining a cached mapping of the pmap.
286 * The caller must hold vm_token and the returned value is only valid
287 * until the caller blocks or releases the token.
291 pmap_pte(pmap_t pmap, vm_offset_t va)
295 ASSERT_LWKT_TOKEN_HELD(&vm_token);
297 pdeaddr = (unsigned *) pmap_pde(pmap, va);
298 if (*pdeaddr & PG_PS)
301 return get_ptbase(pmap) + i386_btop(va);
307 * pmap_pte using the kernel_pmap
309 * Used for debugging, no requirements.
312 pmap_kernel_pte(vm_offset_t va)
316 pdeaddr = (unsigned *) pmap_pde(&kernel_pmap, va);
317 if (*pdeaddr & PG_PS)
320 return (unsigned *)vtopte(va);
327 * Super fast pmap_pte routine best used when scanning the pv lists.
328 * This eliminates many coarse-grained invltlb calls. Note that many of
329 * the pv list scans are across different pmaps and it is very wasteful
330 * to do an entire invltlb when checking a single mapping.
332 * Should only be called while in a critical section.
334 * The caller must hold vm_token and the returned value is only valid
335 * until the caller blocks or releases the token.
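*
* Typical use, sketched (assumes the caller holds vm_token and is inside a
* critical section):
*
*	pte = pmap_pte_quick(pmap, va);
*	if (pte && (*pte & PG_V))
*		pa = (*pte & PG_FRAME) | (va & PAGE_MASK);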
339 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
341 struct mdglobaldata *gd = mdcpu;
344 ASSERT_LWKT_TOKEN_HELD(&vm_token);
345 if ((pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) != 0) {
346 unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
347 unsigned index = i386_btop(va);
348 /* are we current address space or kernel? */
349 if ((pmap == &kernel_pmap) ||
350 (frame == (((unsigned) PTDpde) & PG_FRAME))) {
351 return (unsigned *) PTmap + index;
353 newpf = pde & PG_FRAME;
354 if (((*(unsigned *)gd->gd_PMAP1) & PG_FRAME) != newpf) {
355 *(unsigned *)gd->gd_PMAP1 = newpf | PG_RW | PG_V;
356 cpu_invlpg(gd->gd_PADDR1);
358 return gd->gd_PADDR1 + (index & (NPTEPG - 1));
365 * Bootstrap the system enough to run with virtual memory.
367 * On the i386 this is called after mapping has already been enabled
368 * and just syncs the pmap module with what has already been done.
369 * [We can't call it easily with mapping off since the kernel is not
370 * mapped with PA == VA, hence we would have to relocate every address
371 * from the linked base (virtual) address "KERNBASE" to the actual
372 * (physical) address starting relative to 0]
375 pmap_bootstrap(vm_paddr_t firstaddr, vm_paddr_t loadaddr)
379 struct mdglobaldata *gd;
383 KvaStart = (vm_offset_t)VADDR(PTDPTDI, 0);
384 KvaSize = (vm_offset_t)VADDR(APTDPTDI, 0) - KvaStart;
385 KvaEnd = KvaStart + KvaSize;
387 avail_start = firstaddr;
390 * XXX The calculation of virtual_start is wrong. It's NKPT*PAGE_SIZE
391 * too large. It should instead be correctly calculated in locore.s and
392 * not based on 'first' (which is a physical address, not a virtual
393 * address, for the start of unused physical memory). The kernel
394 * page tables are NOT double mapped and thus should not be included
395 * in this calculation.
397 virtual_start = (vm_offset_t) KERNBASE + firstaddr;
398 virtual_start = pmap_kmem_choose(virtual_start);
399 virtual_end = VADDR(KPTDI+NKPDE-1, NPTEPG-1);
402 * Initialize protection array.
404 i386_protection_init();
407 * The kernel's pmap is statically allocated so we don't have to use
408 * pmap_create, which is unlikely to work correctly at this part of
409 * the boot sequence (XXX and which no longer exists).
411 * The kernel_pmap's pm_pteobj is used only for locking and not
414 kernel_pmap.pm_pdir = (pd_entry_t *)(KERNBASE + (u_int)IdlePTD);
415 kernel_pmap.pm_count = 1;
416 kernel_pmap.pm_active = (cpumask_t)-1;
417 kernel_pmap.pm_pteobj = NULL; /* see pmap_init */
418 TAILQ_INIT(&kernel_pmap.pm_pvlist);
419 TAILQ_INIT(&kernel_pmap.pm_pvlist_free);
420 spin_init(&kernel_pmap.pm_spin);
421 lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
425 * Reserve some special page table entries/VA space for temporary
428 #define SYSMAP(c, p, v, n) \
429 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
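/*
 * For example (sketch), SYSMAP(caddr_t, CMAP1, CADDR1, 1) assigns the
 * current 'va' to CADDR1, advances 'va' by one page, and records the
 * matching kernel pte pointer in CMAP1, so the page backing CADDR1 can
 * later be changed by rewriting *CMAP1.
 */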
432 pte = (pt_entry_t *) pmap_kernel_pte(va);
435 * CMAP1/CMAP2 are used for zeroing and copying pages.
437 SYSMAP(caddr_t, CMAP1, CADDR1, 1)
442 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
445 * ptvmmap is used for reading arbitrary physical pages via
448 SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
451 * msgbufp is used to map the system message buffer.
452 * XXX msgbufmap is not used.
454 SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
455 atop(round_page(MSGBUF_SIZE)))
460 for (i = 0; i < NKPT; i++)
464 * PG_G is terribly broken on SMP because we IPI invltlb's in some
465 * cases rather than invlpg. Actually, I don't even know why it
466 * works under UP because self-referential page table mappings
471 * Initialize the 4MB page size flag
475 * The 4MB page version of the initial
476 * kernel page mapping.
480 #if !defined(DISABLE_PSE)
481 if (cpu_feature & CPUID_PSE) {
484 * Note that we have enabled PSE mode
487 ptditmp = *((unsigned *)PTmap + i386_btop(KERNBASE));
488 ptditmp &= ~(NBPDR - 1);
489 ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
495 * We need to finish setting up the globaldata page for the BSP.
496 * locore has already populated the page table for the mdglobaldata
499 pg = MDGLOBALDATA_BASEALLOC_PAGES;
500 gd = &CPU_prvspace[0].mdglobaldata;
501 gd->gd_CMAP1 = &SMPpt[pg + 0];
502 gd->gd_CMAP2 = &SMPpt[pg + 1];
503 gd->gd_CMAP3 = &SMPpt[pg + 2];
504 gd->gd_PMAP1 = &SMPpt[pg + 3];
505 gd->gd_GDMAP1 = &PTD[APTDPTDI];
506 gd->gd_CADDR1 = CPU_prvspace[0].CPAGE1;
507 gd->gd_CADDR2 = CPU_prvspace[0].CPAGE2;
508 gd->gd_CADDR3 = CPU_prvspace[0].CPAGE3;
509 gd->gd_PADDR1 = (unsigned *)CPU_prvspace[0].PPAGE1;
510 gd->gd_GDADDR1= (unsigned *)VADDR(APTDPTDI, 0);
514 /* Initialize the PAT MSR */
528 * Default values mapping PATi,PCD,PWT bits at system reset.
529 * The default values effectively ignore the PATi bit by
530 * repeating the encodings for 0-3 in 4-7, and map the PCD
531 * and PWT bit combinations to the expected PAT types.
533 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */
534 PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */
535 PAT_VALUE(2, PAT_UNCACHED) | /* 010 */
536 PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */
537 PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */
538 PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */
539 PAT_VALUE(6, PAT_UNCACHED) | /* 110 */
540 PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */
541 pat_pte_index[PAT_WRITE_BACK] = 0;
542 pat_pte_index[PAT_WRITE_THROUGH]= 0 | PG_NC_PWT;
543 pat_pte_index[PAT_UNCACHED] = PG_NC_PCD;
544 pat_pte_index[PAT_UNCACHEABLE] = PG_NC_PCD | PG_NC_PWT;
545 pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE];
546 pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE];
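/*
 * Sketch of how the table is consumed: pte construction in pmap_qenter()
 * and pmap_enter() ORs in pat_pte_index[m->pat_mode], so e.g. a
 * PAT_UNCACHEABLE page gets PG_NC_PCD | PG_NC_PWT set in its pte.
 */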
548 if (cpu_feature & CPUID_PAT) {
550 * If we support the PAT then set-up entries for
551 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns
554 pat_msr = (pat_msr & ~PAT_MASK(4)) |
555 PAT_VALUE(4, PAT_WRITE_PROTECTED);
556 pat_msr = (pat_msr & ~PAT_MASK(5)) |
557 PAT_VALUE(5, PAT_WRITE_COMBINING);
558 pat_pte_index[PAT_WRITE_PROTECTED] = PG_PTE_PAT | 0;
559 pat_pte_index[PAT_WRITE_COMBINING] = PG_PTE_PAT | PG_NC_PWT;
562 * Then enable the PAT
567 load_cr4(cr4 & ~CR4_PGE);
569 /* Disable caches (CD = 1, NW = 0). */
571 load_cr0((cr0 & ~CR0_NW) | CR0_CD);
573 /* Flushes caches and TLBs. */
577 /* Update PAT and index table. */
578 wrmsr(MSR_PAT, pat_msr);
580 /* Flush caches and TLBs again. */
584 /* Restore caches and PGE. */
592 * Set 4mb pdir for mp startup
597 if (pseflag && (cpu_feature & CPUID_PSE)) {
598 load_cr4(rcr4() | CR4_PSE);
599 if (pdir4mb && mycpu->gd_cpuid == 0) { /* only on BSP */
600 kernel_pmap.pm_pdir[KPTDI] =
601 PTD[KPTDI] = (pd_entry_t)pdir4mb;
608 * Initialize the pmap module, called by vm_init()
610 * Called from the low level boot code only.
619 * object for kernel page table pages
621 kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
622 kernel_pmap.pm_pteobj = kptobj;
625 * Allocate memory for random pmap data structures. Includes the
629 for(i = 0; i < vm_page_array_size; i++) {
632 m = &vm_page_array[i];
633 TAILQ_INIT(&m->md.pv_list);
634 m->md.pv_list_count = 0;
638 * init the pv free list
640 initial_pvs = vm_page_array_size;
641 if (initial_pvs < MINPV)
643 pvzone = &pvzone_store;
644 pvinit = (void *)kmem_alloc(&kernel_map,
645 initial_pvs * sizeof (struct pv_entry));
646 zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
647 pvinit, initial_pvs);
650 * Now it is safe to enable pv_table recording.
652 pmap_initialized = TRUE;
656 * Initialize the address space (zone) for the pv_entries. Set a
657 * high water mark so that the system can recover from excessive
658 * numbers of pv entries.
660 * Called from the low level boot code only.
665 int shpgperproc = PMAP_SHPGPERPROC;
668 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
669 pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
673 * Horrible hack for systems with a lot of memory running i386.
674 * The calculated pv_entry_max can wind up eating a ton of KVM,
675 * so put a cap on the number of entries if the user did not
676 * change any of the values. This saves about 44MB of KVM on
677 * boxes with 3+GB of ram.
679 * On the flip side, this makes it more likely that some setups
680 * will run out of pv entries. Those sysadmins will have to bump
681 * the limit up with vm.pmap.pv_entries or vm.pmap.shpgperproc.
683 if (shpgperproc == PMAP_SHPGPERPROC) {
684 if (pv_entry_max > PMAP_PVLIMIT)
685 pv_entry_max = PMAP_PVLIMIT;
688 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
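/*
 * pv_entry_high_water is set to 90% of pv_entry_max below; get_pv_entry()
 * wakes the pagedaemon once the number of pv entries in use crosses it.
 */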
689 pv_entry_high_water = 9 * (pv_entry_max / 10);
692 * Subtract out pages already installed in the zone (hack)
694 entry_max = pv_entry_max - vm_page_array_size;
698 zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1);
702 * Typically used to initialize a fictitious page by vm/device_pager.c
705 pmap_page_init(struct vm_page *m)
708 TAILQ_INIT(&m->md.pv_list);
711 /***************************************************
712 * Low level helper routines.....
713 ***************************************************/
718 test_m_maps_pv(vm_page_t m, pv_entry_t pv)
724 KKASSERT(pv->pv_m == m);
726 TAILQ_FOREACH(spv, &m->md.pv_list, pv_list) {
733 panic("test_m_maps_pv: failed m %p pv %p", m, pv);
737 ptbase_assert(struct pmap *pmap)
739 unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
741 /* are we current address space or kernel? */
742 if (pmap == &kernel_pmap || frame == (((unsigned)PTDpde) & PG_FRAME))
744 KKASSERT(frame == (*mdcpu->gd_GDMAP1 & PG_FRAME));
749 #define test_m_maps_pv(m, pv)
750 #define ptbase_assert(pmap)
754 #if defined(PMAP_DIAGNOSTIC)
757 * This code checks for non-writeable/modified pages.
758 * This should be an invalid condition.
761 pmap_nw_modified(pt_entry_t ptea)
767 if ((pte & (PG_M|PG_RW)) == PG_M)
776 * This routine defines the region(s) of memory that should not be tested
777 * for the modified bit.
781 static PMAP_INLINE int
782 pmap_track_modified(vm_offset_t va)
784 if ((va < clean_sva) || (va >= clean_eva))
791 * Retrieve the mapped page table base for a particular pmap. Use our self
792 * mapping for the kernel_pmap or our current pmap.
794 * For foreign pmaps we use the per-cpu page table map. Since this involves
795 * installing a ptd it's actually (per-process x per-cpu). However, we
796 * still cannot depend on our mapping to survive thread switches because
797 * the process might be threaded and switching to another thread for the
798 * same process on the same cpu will allow that other thread to make its
801 * This could be a bit confusing, but the gist is that for something like
802 * the vkernel, which uses foreign pmaps all the time, this represents a
803 * pretty good cache that avoids unnecessary invltlb()s.
805 * The caller must hold vm_token and the returned value is only valid
806 * until the caller blocks or releases the token.
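*
* Usage sketch: the returned base is indexed in pages, e.g.
*
*	ptbase = get_ptbase(pmap);
*	pte = ptbase + i386_btop(va);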
809 get_ptbase(pmap_t pmap)
811 unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
812 struct mdglobaldata *gd = mdcpu;
814 ASSERT_LWKT_TOKEN_HELD(&vm_token);
817 * We can use PTmap if the pmap is our current address space or
818 * the kernel address space.
820 if (pmap == &kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) {
821 return (unsigned *) PTmap;
825 * Otherwise we use the per-cpu alternative page table map. Each
826 * cpu gets its own map. Because of this we cannot use this map
827 * from interrupts or threads which can preempt.
829 * Even if we already have the map cached we may still have to
830 * invalidate the TLB if another cpu modified a PDE in the map.
832 KKASSERT(gd->mi.gd_intr_nesting_level == 0 &&
833 (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0);
835 if ((*gd->gd_GDMAP1 & PG_FRAME) != frame) {
836 *gd->gd_GDMAP1 = frame | PG_RW | PG_V;
837 CPUMASK_ORMASK(pmap->pm_cached, gd->mi.gd_cpumask);
839 } else if (CPUMASK_TESTMASK(pmap->pm_cached, gd->mi.gd_cpumask) == 0) {
840 CPUMASK_ORMASK(pmap->pm_cached, gd->mi.gd_cpumask);
842 } else if (dreadful_invltlb) {
845 return ((unsigned *)gd->gd_GDADDR1);
851 * Extract the physical page address associated with the map/VA pair.
853 * The caller may hold vm_token if it desires non-blocking operation.
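*
* Example use, sketched (va is assumed to be a currently mapped address):
*
*	vm_paddr_t pa = pmap_extract(&kernel_pmap, va);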
856 pmap_extract(pmap_t pmap, vm_offset_t va)
859 vm_offset_t pdirindex;
861 lwkt_gettoken(&vm_token);
862 pdirindex = va >> PDRSHIFT;
863 if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) {
865 if ((rtval & PG_PS) != 0) {
866 rtval &= ~(NBPDR - 1);
867 rtval |= va & (NBPDR - 1);
869 pte = get_ptbase(pmap) + i386_btop(va);
870 rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
875 lwkt_reltoken(&vm_token);
880 * Similar to extract but checks protections, SMP-friendly short-cut for
881 * vm_fault_page[_quick]().
884 pmap_fault_page_quick(pmap_t pmap __unused, vm_offset_t vaddr __unused,
885 vm_prot_t prot __unused)
890 /***************************************************
891 * Low level mapping routines.....
892 ***************************************************/
895 * Map a wired VM page to a KVA, fully SMP synchronized.
897 * No requirements, non blocking.
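*
* Typical pairing, sketched (va is assumed to be reserved KVA and m a wired
* vm_page_t):
*
*	pmap_kenter(va, VM_PAGE_TO_PHYS(m));
*	... access the page through va ...
*	pmap_kremove(va);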
900 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
904 pmap_inval_info info;
906 pmap_inval_init(&info);
907 npte = pa | PG_RW | PG_V | pgeflag;
908 pte = (unsigned *)vtopte(va);
909 pmap_inval_interlock(&info, &kernel_pmap, va);
911 pmap_inval_deinterlock(&info, &kernel_pmap);
912 pmap_inval_done(&info);
916 * Map a wired VM page to a KVA, synchronized on current cpu only.
918 * No requirements, non blocking.
921 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
926 npte = pa | PG_RW | PG_V | pgeflag;
927 pte = (unsigned *)vtopte(va);
929 cpu_invlpg((void *)va);
933 * Synchronize a previously entered VA on all cpus.
935 * No requirements, non blocking.
938 pmap_kenter_sync(vm_offset_t va)
940 pmap_inval_info info;
942 pmap_inval_init(&info);
943 pmap_inval_interlock(&info, &kernel_pmap, va);
944 pmap_inval_deinterlock(&info, &kernel_pmap);
945 pmap_inval_done(&info);
949 * Synchronize a previously entered VA on the current cpu only.
951 * No requirements, non blocking.
954 pmap_kenter_sync_quick(vm_offset_t va)
956 cpu_invlpg((void *)va);
960 * Remove a page from the kernel pagetables, fully SMP synchronized.
962 * No requirements, non blocking.
965 pmap_kremove(vm_offset_t va)
968 pmap_inval_info info;
970 pmap_inval_init(&info);
971 pte = (unsigned *)vtopte(va);
972 pmap_inval_interlock(&info, &kernel_pmap, va);
974 pmap_inval_deinterlock(&info, &kernel_pmap);
975 pmap_inval_done(&info);
979 * Remove a page from the kernel pagetables, synchronized on current cpu only.
981 * No requirements, non blocking.
984 pmap_kremove_quick(vm_offset_t va)
987 pte = (unsigned *)vtopte(va);
989 cpu_invlpg((void *)va);
993 * Adjust the permissions of a page in the kernel page table,
994 * synchronized on the current cpu only.
996 * No requirements, non blocking.
999 pmap_kmodify_rw(vm_offset_t va)
1001 atomic_set_int(vtopte(va), PG_RW);
1002 cpu_invlpg((void *)va);
1006 * Mark a page in the kernel page table as non-cacheable (PG_N),
1007 * synchronized on the current cpu only.
1009 * No requirements, non blocking.
1012 pmap_kmodify_nc(vm_offset_t va)
1014 atomic_set_int(vtopte(va), PG_N);
1015 cpu_invlpg((void *)va);
1019 * Map a range of physical addresses into kernel virtual address space.
1021 * No requirements, non blocking.
1024 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
1026 vm_offset_t sva, virt;
1028 sva = virt = *virtp;
1029 while (start < end) {
1030 pmap_kenter(virt, start);
1038 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
1041 * Remove the specified set of pages from the data and instruction caches.
1043 * In contrast to pmap_invalidate_cache_range(), this function does not
1044 * rely on the CPU's self-snoop feature, because it is intended for use
1045 * when moving pages into a different cache domain.
1048 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1050 wbinvd(); /* XXX: not optimal */
1054 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
1056 KASSERT((sva & PAGE_MASK) == 0,
1057 ("pmap_invalidate_cache_range: sva not page-aligned"));
1058 KASSERT((eva & PAGE_MASK) == 0,
1059 ("pmap_invalidate_cache_range: eva not page-aligned"));
1061 if (cpu_feature & CPUID_SS) {
1062 ; /* If "Self Snoop" is supported, do nothing. */
1064 /* Globally invalidate caches */
1065 cpu_wbinvd_on_all_cpus();
1070 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1072 cpu_wbinvd_on_all_cpus(); /* XXX not optimal */
1076 * Add a list of wired pages to the kva
1077 * this routine is only used for temporary
1078 * kernel mappings that do not need to have
1079 * page modification or references recorded.
1080 * Note that old mappings are simply written
1081 * over. The page *must* be wired.
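*
* Typical pairing, sketched (m[] is assumed to be an array of wired pages):
*
*	pmap_qenter(va, m, count);
*	... use the contiguous mapping at va ...
*	pmap_qremove(va, count);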
1084 pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
1088 end_va = va + count * PAGE_SIZE;
1090 while (va < end_va) {
1094 *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V |
1095 pat_pte_index[(*m)->pat_mode] | pgeflag;
1096 cpu_invlpg((void *)va);
1100 smp_invltlb(); /* XXX */
1104 * Remove pages from KVA, fully SMP synchronized.
1106 * No requirements, non blocking.
1109 pmap_qremove(vm_offset_t va, int count)
1113 end_va = va + count*PAGE_SIZE;
1115 while (va < end_va) {
1118 pte = (unsigned *)vtopte(va);
1120 cpu_invlpg((void *)va);
1127 * This routine works like vm_page_lookup() but also blocks as long as the
1128 * page is busy. This routine does not busy the page it returns.
1130 * The caller must hold the object.
1133 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
1137 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1138 m = vm_page_lookup_busy_wait(object, pindex, FALSE, "pplookp");
1144 * Create a new thread and optionally associate it with a (new) process.
1145 * NOTE! the new thread's cpu may not equal the current cpu.
1148 pmap_init_thread(thread_t td)
1150 /* enforce pcb placement */
1151 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
1152 td->td_savefpu = &td->td_pcb->pcb_save;
1153 td->td_sp = (char *)td->td_pcb - 16;
1157 * This routine directly affects the fork perf for a process.
1160 pmap_init_proc(struct proc *p)
1164 /***************************************************
1165 * Page table page management routines.....
1166 ***************************************************/
1169 * This routine unwires page table pages, removing and freeing the page
1170 * table page when the wire count drops to 0.
1172 * The caller must hold vm_token.
1173 * This function can block.
1176 _pmap_unwire_pte(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
1179 * Wait until we can busy the page ourselves. We cannot have
1180 * any active flushes if we block.
1182 vm_page_busy_wait(m, FALSE, "pmuwpt");
1183 KASSERT(m->queue == PQ_NONE,
1184 ("_pmap_unwire_pte: %p->queue != PQ_NONE", m));
1186 if (m->wire_count == 1) {
1188 * Unmap the page table page.
1190 * NOTE: We must clear pm_cached for all cpus, including
1191 * the current one, when clearing a page directory
1194 pmap_inval_interlock(info, pmap, -1);
1195 KKASSERT(pmap->pm_pdir[m->pindex]);
1196 pmap->pm_pdir[m->pindex] = 0;
1197 pmap->pm_cached = 0;
1198 pmap_inval_deinterlock(info, pmap);
1200 KKASSERT(pmap->pm_stats.resident_count > 0);
1201 --pmap->pm_stats.resident_count;
1203 if (pmap->pm_ptphint == m)
1204 pmap->pm_ptphint = NULL;
1207 * This was our last hold; the page had better be unwired
1208 * after we decrement wire_count.
1210 * FUTURE NOTE: shared page directory page could result in
1211 * multiple wire counts.
1213 vm_page_unwire(m, 0);
1214 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1216 vm_page_free_zero(m);
1219 KKASSERT(m->wire_count > 1);
1220 if (vm_page_unwire_quick(m))
1221 panic("pmap_unwire_pte: Insufficient wire_count");
1228 * The caller must hold vm_token.
1230 * This function can block.
1232 * This function can race the wire_count 2->1 case because the page
1233 * is not busied during the unwire_quick operation. An eventual
1234 * pmap_release() will catch the case.
1236 static PMAP_INLINE int
1237 pmap_unwire_pte(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
1239 KKASSERT(m->wire_count > 0);
1240 if (m->wire_count > 1) {
1241 if (vm_page_unwire_quick(m))
1242 panic("pmap_unwire_pte: Insufficient wire_count");
1245 return _pmap_unwire_pte(pmap, m, info);
1250 * After removing a (user) page table entry, this routine is used to
1251 * conditionally free the page, and manage the hold/wire counts.
1253 * The caller must hold vm_token.
1254 * This function can block regardless.
1257 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
1258 pmap_inval_info_t info)
1262 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
1264 if (va >= UPT_MIN_ADDRESS)
1268 ptepindex = (va >> PDRSHIFT);
1269 if ((mpte = pmap->pm_ptphint) != NULL &&
1270 mpte->pindex == ptepindex &&
1271 (mpte->flags & PG_BUSY) == 0) {
1274 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1275 pmap->pm_ptphint = mpte;
1276 vm_page_wakeup(mpte);
1279 pmap_unwire_pte(pmap, mpte, info);
1283 * Initialize pmap0/vmspace0. This pmap is not added to pmap_list because
1284 * it, along with IdlePTD, represents the template used to update all other pmaps.
1286 * On architectures where the kernel pmap is not integrated into the user
1287 * process pmap, this pmap represents the process pmap, not the kernel pmap.
1288 * Use the global kernel_pmap to access the kernel's own pmap directly.
1293 pmap_pinit0(struct pmap *pmap)
1296 (pd_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
1297 pmap_kenter((vm_offset_t)pmap->pm_pdir, (vm_offset_t) IdlePTD);
1299 pmap->pm_active = 0;
1300 pmap->pm_cached = 0;
1301 pmap->pm_ptphint = NULL;
1302 TAILQ_INIT(&pmap->pm_pvlist);
1303 TAILQ_INIT(&pmap->pm_pvlist_free);
1304 spin_init(&pmap->pm_spin);
1305 lwkt_token_init(&pmap->pm_token, "pmap_tok");
1306 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1310 * Initialize a preallocated and zeroed pmap structure,
1311 * such as one in a vmspace structure.
1316 pmap_pinit(struct pmap *pmap)
1321 * No need to allocate page table space yet but we do need a valid
1322 * page directory table.
1324 if (pmap->pm_pdir == NULL) {
1326 (pd_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
1330 * Allocate an object for the ptes
1332 if (pmap->pm_pteobj == NULL)
1333 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1);
1336 * Allocate the page directory page, unless we already have
1337 * one cached. If we used the cached page the wire_count will
1338 * already be set appropriately.
1340 if ((ptdpg = pmap->pm_pdirm) == NULL) {
1341 ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI,
1342 VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
1344 pmap->pm_pdirm = ptdpg;
1345 vm_page_flag_clear(ptdpg, PG_MAPPED);
1346 vm_page_wire(ptdpg);
1347 KKASSERT(ptdpg->valid == VM_PAGE_BITS_ALL);
1348 pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
1349 vm_page_wakeup(ptdpg);
1351 pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1353 /* install self-referential address mapping entry */
1354 *(unsigned *) (pmap->pm_pdir + PTDPTDI) =
1355 VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
1358 pmap->pm_active = 0;
1359 pmap->pm_cached = 0;
1360 pmap->pm_ptphint = NULL;
1361 TAILQ_INIT(&pmap->pm_pvlist);
1362 TAILQ_INIT(&pmap->pm_pvlist_free);
1363 spin_init(&pmap->pm_spin);
1364 lwkt_token_init(&pmap->pm_token, "pmap_tok");
1365 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1366 pmap->pm_stats.resident_count = 1;
1370 * Clean up a pmap structure so it can be physically freed. This routine
1371 * is called by the vmspace dtor function. A great deal of pmap data is
1372 * left passively mapped to improve vmspace management so we have a bit
1373 * of cleanup work to do here.
1378 pmap_puninit(pmap_t pmap)
1382 pmap_wait(pmap, -1);
1383 KKASSERT(pmap->pm_active == 0);
1384 if ((p = pmap->pm_pdirm) != NULL) {
1385 KKASSERT(pmap->pm_pdir != NULL);
1386 pmap_kremove((vm_offset_t)pmap->pm_pdir);
1387 vm_page_busy_wait(p, FALSE, "pgpun");
1388 vm_page_unwire(p, 0);
1389 vm_page_free_zero(p);
1390 pmap->pm_pdirm = NULL;
1392 if (pmap->pm_pdir) {
1393 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pdir, PAGE_SIZE);
1394 pmap->pm_pdir = NULL;
1396 if (pmap->pm_pteobj) {
1397 vm_object_deallocate(pmap->pm_pteobj);
1398 pmap->pm_pteobj = NULL;
1403 * Wire in kernel global address entries. To avoid a race condition
1404 * between pmap initialization and pmap_growkernel, this procedure
1405 * adds the pmap to the master list (which growkernel scans to update),
1406 * then copies the template.
1411 pmap_pinit2(struct pmap *pmap)
1414 * XXX copies current process, does not fill in MPPTDI
1416 spin_lock(&pmap_spin);
1417 TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
1418 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1419 spin_unlock(&pmap_spin);
1423 * Attempt to release and free a vm_page in a pmap. Returns 1 on success,
1424 * 0 on failure (if the procedure had to sleep).
1426 * When asked to remove the page directory page itself, we actually just
1427 * leave it cached so we do not have to incur the SMP inval overhead of
1428 * removing the kernel mapping. pmap_puninit() will take care of it.
1430 * The caller must hold vm_token.
1431 * This function can block regardless.
1434 pmap_release_free_page(struct pmap *pmap, vm_page_t p)
1436 unsigned *pde = (unsigned *) pmap->pm_pdir;
1439 * This code optimizes the case of freeing non-busy
1440 * page-table pages. Those pages are zero now, and
1441 * might as well be placed directly into the zero queue.
1443 if (vm_page_busy_try(p, FALSE)) {
1444 vm_page_sleep_busy(p, FALSE, "pmaprl");
1448 KKASSERT(pmap->pm_stats.resident_count > 0);
1449 KKASSERT(pde[p->pindex]);
1452 * page table page's wire_count must be 1. Caller is the pmap
1453 * termination code which holds the pm_pteobj; there is a race
1454 * if someone else is trying to hold the VM object in order to
1455 * clean up a wire_count.
1457 if (p->wire_count != 1) {
1458 if (pmap->pm_pteobj->hold_count <= 1)
1459 panic("pmap_release: freeing wired page table page");
1460 kprintf("pmap_release_free_page: unwire race detected\n");
1462 tsleep(p, 0, "pmapx", 1);
1467 * Remove the page table page from the process's address space.
1469 pmap->pm_cached = 0;
1471 --pmap->pm_stats.resident_count;
1472 if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1473 pmap->pm_ptphint = NULL;
1476 * We leave the page directory page cached, wired, and mapped in
1477 * the pmap until the dtor function (pmap_puninit()) gets called.
1478 * However, still clean it up so we can set PG_ZERO.
1480 * The pmap has already been removed from the pmap_list in the
1483 if (p->pindex == PTDPTDI) {
1484 bzero(pde + KPTDI, nkpt * PTESIZE);
1485 bzero(pde + MPPTDI, (NPDEPG - MPPTDI) * PTESIZE);
1486 vm_page_flag_set(p, PG_ZERO);
1490 * This case can occur if a pmap_unwire_pte() loses a race
1491 * while the page is unbusied.
1493 /*panic("pmap_release: page should already be gone %p", p);*/
1494 vm_page_flag_clear(p, PG_MAPPED);
1495 vm_page_unwire(p, 0);
1496 vm_page_free_zero(p);
1502 * This routine is called if the page table page is not mapped correctly.
1504 * The caller must hold vm_token.
1507 _pmap_allocpte(pmap_t pmap, unsigned ptepindex)
1513 * Find or fabricate a new pagetable page. Setting VM_ALLOC_ZERO
1514 * will zero any new page and mark it valid.
1516 m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1517 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1519 KASSERT(m->queue == PQ_NONE,
1520 ("_pmap_allocpte: %p->queue != PQ_NONE", m));
1523 * Increment the wire count for the page we will be returning to
1529 * It is possible that someone else got in and mapped by the page
1530 * directory page while we were blocked, if so just unbusy and
1531 * return the wired page.
1533 if ((ptepa = pmap->pm_pdir[ptepindex]) != 0) {
1534 KKASSERT((ptepa & PG_FRAME) == VM_PAGE_TO_PHYS(m));
1540 * Map the pagetable page into the process address space, if
1541 * it isn't already there.
1543 * NOTE: For safety clear pm_cached for all cpus including the
1544 * current one when adding a PDE to the map.
1546 ++pmap->pm_stats.resident_count;
1548 ptepa = VM_PAGE_TO_PHYS(m);
1549 pmap->pm_pdir[ptepindex] =
1550 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1551 pmap->pm_cached = 0;
1554 * Set the page table hint
1556 pmap->pm_ptphint = m;
1557 vm_page_flag_set(m, PG_MAPPED);
1564 * Allocate a page table entry for a va.
1566 * The caller must hold vm_token.
1569 pmap_allocpte(pmap_t pmap, vm_offset_t va)
1575 ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
1578 * Calculate pagetable page index
1580 ptepindex = va >> PDRSHIFT;
1583 * Get the page directory entry
1585 ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1588 * This supports switching from a 4MB page to a
1591 if (ptepa & PG_PS) {
1592 pmap->pm_pdir[ptepindex] = 0;
1599 * If the page table page is mapped, we just increment the
1600 * wire count, and activate it.
1604 * In order to get the page table page, try the
1607 if ((mpte = pmap->pm_ptphint) != NULL &&
1608 (mpte->pindex == ptepindex) &&
1609 (mpte->flags & PG_BUSY) == 0) {
1610 vm_page_wire_quick(mpte);
1612 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
1613 pmap->pm_ptphint = mpte;
1614 vm_page_wire_quick(mpte);
1615 vm_page_wakeup(mpte);
1620 * Here if the pte page isn't mapped, or if it has been deallocated.
1622 return _pmap_allocpte(pmap, ptepindex);
1626 /***************************************************
1627 * Pmap allocation/deallocation routines.
1628 ***************************************************/
1631 * Release any resources held by the given physical map.
1632 * Called when a pmap initialized by pmap_pinit is being released.
1633 * Should only be called if the map contains no valid mappings.
1635 * Caller must hold pmap->pm_token
1637 static int pmap_release_callback(struct vm_page *p, void *data);
1640 pmap_release(struct pmap *pmap)
1642 vm_object_t object = pmap->pm_pteobj;
1643 struct rb_vm_page_scan_info info;
1645 KASSERT(pmap->pm_active == 0,
1646 ("pmap still active! %08x", pmap->pm_active));
1647 #if defined(DIAGNOSTIC)
1648 if (object->ref_count != 1)
1649 panic("pmap_release: pteobj reference count != 1");
1653 info.object = object;
1655 spin_lock(&pmap_spin);
1656 TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
1657 spin_unlock(&pmap_spin);
1659 vm_object_hold(object);
1660 /*lwkt_gettoken(&vm_token);*/
1664 info.limit = object->generation;
1666 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1667 pmap_release_callback, &info);
1668 if (info.error == 0 && info.mpte) {
1669 if (!pmap_release_free_page(pmap, info.mpte))
1672 } while (info.error);
1673 /*lwkt_reltoken(&vm_token);*/
1674 vm_object_drop(object);
1676 pmap->pm_cached = 0;
1680 * The caller must hold vm_token.
1683 pmap_release_callback(struct vm_page *p, void *data)
1685 struct rb_vm_page_scan_info *info = data;
1687 if (p->pindex == PTDPTDI) {
1691 if (!pmap_release_free_page(info->pmap, p)) {
1695 if (info->object->generation != info->limit) {
1703 * Grow the number of kernel page table entries, if needed.
1708 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
1710 vm_offset_t addr = kend;
1712 vm_offset_t ptppaddr;
1716 vm_object_hold(kptobj);
1717 if (kernel_vm_end == 0) {
1718 kernel_vm_end = KERNBASE;
1720 while (pdir_pde(PTD, kernel_vm_end)) {
1721 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
1722 ~(PAGE_SIZE * NPTEPG - 1);
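/*
 * PAGE_SIZE * NPTEPG is 4MB on i386, the span of KVA covered by a single
 * page directory entry; both kernel_vm_end and the target address are
 * rounded up to that boundary.
 */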
1726 addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1727 while (kernel_vm_end < addr) {
1728 if (pdir_pde(PTD, kernel_vm_end)) {
1729 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
1730 ~(PAGE_SIZE * NPTEPG - 1);
1735 * This index is bogus, but out of the way
1737 nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_NORMAL |
1739 VM_ALLOC_INTERRUPT);
1741 panic("pmap_growkernel: no memory to grow kernel");
1744 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1745 pmap_zero_page(ptppaddr);
1746 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1747 pdir_pde(PTD, kernel_vm_end) = newpdir;
1748 *pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir;
1752 * This update must be interlocked with pmap_pinit2.
1754 spin_lock(&pmap_spin);
1755 TAILQ_FOREACH(pmap, &pmap_list, pm_pmnode) {
1756 *pmap_pde(pmap, kernel_vm_end) = newpdir;
1758 spin_unlock(&pmap_spin);
1759 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
1760 ~(PAGE_SIZE * NPTEPG - 1);
1762 vm_object_drop(kptobj);
1766 * Add a reference to the specified pmap.
1771 pmap_reference(pmap_t pmap)
1774 lwkt_gettoken(&vm_token);
1776 lwkt_reltoken(&vm_token);
1781 * vm_token must be held
1785 pmap_hold(pmap_t pmap)
1791 * vm_token must be held
1795 pmap_drop(pmap_t pmap)
1798 if (pmap->pm_count == (int)0x80000000)
1804 pmap_wait(pmap_t pmap, int count)
1806 lwkt_gettoken(&vm_token);
1807 pmap->pm_count += count;
1808 if (pmap->pm_count & 0x7FFFFFFF) {
1809 while (pmap->pm_count & 0x7FFFFFFF) {
1810 pmap->pm_count |= 0x80000000;
1811 tsleep(pmap, 0, "pmapd", 0);
1812 pmap->pm_count &= ~0x80000000;
1813 kprintf("pmap_wait: race averted\n");
1816 lwkt_reltoken(&vm_token);
1819 /***************************************************
1820 * page management routines.
1821 ***************************************************/
1824 * free the pv_entry back to the free list. This function may be
1825 * called from an interrupt.
1827 * The caller must hold vm_token.
1829 static PMAP_INLINE void
1830 free_pv_entry(pv_entry_t pv)
1832 struct mdglobaldata *gd;
1835 KKASSERT(pv->pv_m != NULL);
1840 if (gd->gd_freepv == NULL)
1847 * get a new pv_entry, allocating a block from the system
1848 * when needed. This function may be called from an interrupt thread.
1850 * THIS FUNCTION CAN BLOCK ON THE ZALLOC TOKEN, serialization of other
1851 * tokens (aka vm_token) to be temporarily lost.
1853 * The caller must hold vm_token.
1858 struct mdglobaldata *gd;
1862 if (pv_entry_high_water &&
1863 (pv_entry_count > pv_entry_high_water) &&
1864 (pmap_pagedaemon_waken == 0)) {
1865 pmap_pagedaemon_waken = 1;
1866 wakeup (&vm_pages_needed);
1869 if ((pv = gd->gd_freepv) != NULL)
1870 gd->gd_freepv = NULL;
1872 pv = zalloc(pvzone);
1877 * This routine is very drastic, but can save the system
1887 static int warningdone=0;
1889 if (pmap_pagedaemon_waken == 0)
1891 lwkt_gettoken(&vm_token);
1892 pmap_pagedaemon_waken = 0;
1894 if (warningdone < 5) {
1895 kprintf("pmap_collect: collecting pv entries -- "
1896 "suggest increasing PMAP_SHPGPERPROC\n");
1900 for (i = 0; i < vm_page_array_size; i++) {
1901 m = &vm_page_array[i];
1902 if (m->wire_count || m->hold_count)
1904 if (vm_page_busy_try(m, TRUE) == 0) {
1905 if (m->wire_count == 0 && m->hold_count == 0) {
1911 lwkt_reltoken(&vm_token);
1916 * Remove the pv entry and unwire the page table page related to the
1917 * pte the caller has cleared from the page table.
1919 * The caller must hold vm_token.
1922 pmap_remove_entry(struct pmap *pmap, vm_page_t m,
1923 vm_offset_t va, pmap_inval_info_t info)
1930 ASSERT_LWKT_TOKEN_HELD(&vm_token);
1931 if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1932 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1933 if (pmap == pv->pv_pmap && va == pv->pv_va)
1937 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1939 KKASSERT(pv->pv_pmap == pmap);
1941 if (va == pv->pv_va)
1950 test_m_maps_pv(m, pv);
1951 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1952 pmap_page_stats_deleting(m);
1953 m->md.pv_list_count--;
1955 atomic_add_int(&m->object->agg_pv_list_count, -1);
1956 if (TAILQ_EMPTY(&m->md.pv_list))
1957 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1958 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1959 ++pmap->pm_generation;
1964 vm_object_hold(pmap->pm_pteobj);
1965 pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
1966 vm_object_drop(pmap->pm_pteobj);
1971 * Create a pv entry for page at pa for (pmap, va).
1973 * The caller must hold vm_token.
1976 pmap_insert_entry(pmap_t pmap, pv_entry_t pv, vm_offset_t va,
1977 vm_page_t mpte, vm_page_t m)
1980 KKASSERT(pv->pv_m == NULL);
1987 pmap_page_stats_adding(m);
1988 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1989 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1990 ++pmap->pm_generation;
1991 m->md.pv_list_count++;
1993 atomic_add_int(&m->object->agg_pv_list_count, 1);
1997 * pmap_remove_pte: do the things to unmap a page in a process.
1999 * The caller must hold vm_token.
2001 * WARNING! As with most other pmap functions this one can block, so
2002 * callers using temporary page table mappings must reload
2006 pmap_remove_pte(struct pmap *pmap, unsigned *ptq, vm_offset_t va,
2007 pmap_inval_info_t info)
2012 ptbase_assert(pmap);
2013 pmap_inval_interlock(info, pmap, va);
2014 ptbase_assert(pmap);
2015 oldpte = loadandclear(ptq);
2017 pmap->pm_stats.wired_count -= 1;
2018 pmap_inval_deinterlock(info, pmap);
2019 KKASSERT(oldpte & PG_V);
2021 * Machines that don't support invlpg, also don't support
2022 * PG_G. XXX PG_G is disabled for SMP so don't worry about
2026 cpu_invlpg((void *)va);
2027 KKASSERT(pmap->pm_stats.resident_count > 0);
2028 --pmap->pm_stats.resident_count;
2029 if (oldpte & PG_MANAGED) {
2030 m = PHYS_TO_VM_PAGE(oldpte);
2031 if (oldpte & PG_M) {
2032 #if defined(PMAP_DIAGNOSTIC)
2033 if (pmap_nw_modified((pt_entry_t) oldpte)) {
2034 kprintf("pmap_remove: modified page not "
2035 "writable: va: %p, pte: 0x%lx\n",
2036 (void *)va, (long)oldpte);
2039 if (pmap_track_modified(va))
2043 vm_page_flag_set(m, PG_REFERENCED);
2044 pmap_remove_entry(pmap, m, va, info);
2046 pmap_unuse_pt(pmap, va, NULL, info);
2051 * Remove a single page from a process address space.
2053 * The caller must hold vm_token.
2056 pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
2061 * If there is no pte for this address, just skip it!!! Otherwise
2062 * get a local va for mappings for this pmap and remove the entry.
2064 if (*pmap_pde(pmap, va) != 0) {
2065 ptq = get_ptbase(pmap) + i386_btop(va);
2067 pmap_remove_pte(pmap, ptq, va, info);
2074 * Remove the given range of addresses from the specified map.
2076 * It is assumed that the start and end are properly rounded to the page size.
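*
* Example, sketched: removing the mapping for a single page,
*
*	pmap_remove(pmap, va, va + PAGE_SIZE);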
2082 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
2086 vm_offset_t ptpaddr;
2087 vm_offset_t sindex, eindex;
2088 struct pmap_inval_info info;
2093 vm_object_hold(pmap->pm_pteobj);
2094 lwkt_gettoken(&vm_token);
2095 if (pmap->pm_stats.resident_count == 0) {
2096 lwkt_reltoken(&vm_token);
2097 vm_object_drop(pmap->pm_pteobj);
2101 pmap_inval_init(&info);
2104 * special handling of removing one page. a very
2105 * common operation and easy to short circuit some
2108 if (((sva + PAGE_SIZE) == eva) &&
2109 (((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2110 pmap_remove_page(pmap, sva, &info);
2111 pmap_inval_done(&info);
2112 lwkt_reltoken(&vm_token);
2113 vm_object_drop(pmap->pm_pteobj);
2118 * Get a local virtual address for the mappings that are being
2121 sindex = i386_btop(sva);
2122 eindex = i386_btop(eva);
2124 while (sindex < eindex) {
2128 * Stop scanning if no pages are left
2130 if (pmap->pm_stats.resident_count == 0)
2134 * Calculate index for next page table, limited by eindex.
2136 pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
2140 pdirindex = sindex / NPDEPG;
2141 ptpaddr = (unsigned)pmap->pm_pdir[pdirindex];
2142 if (ptpaddr & PG_PS) {
2143 pmap_inval_interlock(&info, pmap, -1);
2144 pmap->pm_pdir[pdirindex] = 0;
2145 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2146 pmap->pm_cached = 0;
2147 pmap_inval_deinterlock(&info, pmap);
2153 * Weed out invalid mappings. Note: we assume that the page
2154 * directory table is always allocated, and in kernel virtual.
2162 * Sub-scan the page table page. pmap_remove_pte() can
2163 * block on us, invalidating ptbase, so we must reload
2164 * ptbase and we must also check whether the page directory
2165 * page is still present.
2167 while (sindex < pdnxt) {
2170 ptbase = get_ptbase(pmap);
2171 if (ptbase[sindex]) {
2172 va = i386_ptob(sindex);
2173 pmap_remove_pte(pmap, ptbase + sindex,
2176 if (pmap->pm_pdir[pdirindex] == 0 ||
2177 (pmap->pm_pdir[pdirindex] & PG_PS)) {
2183 pmap_inval_done(&info);
2184 lwkt_reltoken(&vm_token);
2185 vm_object_drop(pmap->pm_pteobj);
2189 * Removes this physical page from all physical maps in which it resides.
2190 * Reflects back modify bits to the pager.
2192 * vm_token must be held by caller.
2195 pmap_remove_all(vm_page_t m)
2197 struct pmap_inval_info info;
2198 unsigned *pte, tpte;
2202 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2204 if (TAILQ_EMPTY(&m->md.pv_list))
2207 pmap_inval_init(&info);
2208 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2210 KKASSERT(pmap->pm_stats.resident_count > 0);
2212 vm_object_hold(pmap->pm_pteobj);
2214 if (pv != TAILQ_FIRST(&m->md.pv_list)) {
2215 vm_object_drop(pmap->pm_pteobj);
2220 --pmap->pm_stats.resident_count;
2221 pte = pmap_pte_quick(pmap, pv->pv_va);
2222 pmap_inval_interlock(&info, pmap, pv->pv_va);
2223 tpte = loadandclear(pte);
2225 pmap->pm_stats.wired_count--;
2226 pmap_inval_deinterlock(&info, pmap);
2228 vm_page_flag_set(m, PG_REFERENCED);
2229 KKASSERT(PHYS_TO_VM_PAGE(tpte) == m);
2232 * Update the vm_page_t clean and reference bits.
2235 #if defined(PMAP_DIAGNOSTIC)
2236 if (pmap_nw_modified((pt_entry_t) tpte)) {
2237 kprintf("pmap_remove_all: modified page "
2238 "not writable: va: %p, pte: 0x%lx\n",
2239 (void *)pv->pv_va, (long)tpte);
2242 if (pmap_track_modified(pv->pv_va))
2246 KKASSERT(pv->pv_m == m);
2248 KKASSERT(pv == TAILQ_FIRST(&m->md.pv_list));
2249 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2250 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2251 pmap_page_stats_deleting(m);
2252 ++pmap->pm_generation;
2253 m->md.pv_list_count--;
2255 atomic_add_int(&m->object->agg_pv_list_count, -1);
2256 if (TAILQ_EMPTY(&m->md.pv_list))
2257 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2258 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info);
2259 vm_object_drop(pmap->pm_pteobj);
2263 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
2264 pmap_inval_done(&info);
2268 * Set the physical protection on the specified range of this map
2274 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2277 vm_offset_t pdnxt, ptpaddr;
2278 vm_pindex_t sindex, eindex;
2279 pmap_inval_info info;
2284 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2285 pmap_remove(pmap, sva, eva);
2289 if (prot & VM_PROT_WRITE)
2292 lwkt_gettoken(&vm_token);
2293 pmap_inval_init(&info);
2295 ptbase = get_ptbase(pmap);
2297 sindex = i386_btop(sva);
2298 eindex = i386_btop(eva);
2300 for (; sindex < eindex; sindex = pdnxt) {
2303 pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
2305 pdirindex = sindex / NPDEPG;
2306 if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
2307 pmap_inval_interlock(&info, pmap, -1);
2308 pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2309 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2310 pmap_inval_deinterlock(&info, pmap);
2315 * Weed out invalid mappings. Note: we assume that the page
2316 * directory table is always allocated, and in kernel virtual.
2321 if (pdnxt > eindex) {
2325 for (; sindex != pdnxt; sindex++) {
2333 pmap_inval_interlock(&info, pmap, i386_ptob(sindex));
2335 pbits = ptbase[sindex];
2338 if (pbits & PG_MANAGED) {
2341 m = PHYS_TO_VM_PAGE(pbits);
2342 vm_page_flag_set(m, PG_REFERENCED);
2346 if (pmap_track_modified(i386_ptob(sindex))) {
2348 m = PHYS_TO_VM_PAGE(pbits);
2355 if (pbits != cbits &&
2356 !atomic_cmpset_int(ptbase + sindex, pbits, cbits)) {
2359 pmap_inval_deinterlock(&info, pmap);
2362 pmap_inval_done(&info);
2363 lwkt_reltoken(&vm_token);
2367 * Insert the given physical page (p) at the specified virtual address (v)
2368 * in the target physical map with the protection requested.
2370 * If specified, the page will be wired down, meaning that the related pte
2371 * cannot be reclaimed.
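*
* Example, sketched: entering a managed page read/write and unwired,
*
*	pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE, FALSE, NULL);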
2376 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2377 boolean_t wired, vm_map_entry_t entry __unused)
2382 vm_offset_t origpte, newpte;
2384 pmap_inval_info info;
2391 #ifdef PMAP_DIAGNOSTIC
2393 panic("pmap_enter: toobig");
2394 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) {
2395 panic("pmap_enter: invalid to pmap_enter page "
2396 "table pages (va: %p)", (void *)va);
2399 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
2400 kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n");
2401 print_backtrace(-1);
2403 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
2404 kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n");
2405 print_backtrace(-1);
2408 vm_object_hold(pmap->pm_pteobj);
2409 lwkt_gettoken(&vm_token);
2412 * This can block, get it before we do anything important.
2414 if (pmap_initialized &&
2415 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2416 pv = get_pv_entry();
2422 * In the case that a page table page is not
2423 * resident, we are creating it here.
2425 if (va < UPT_MIN_ADDRESS)
2426 mpte = pmap_allocpte(pmap, va);
2430 if ((prot & VM_PROT_NOSYNC) == 0)
2431 pmap_inval_init(&info);
2432 pte = pmap_pte(pmap, va);
2435 * Page Directory table entry not valid, we need a new PT page
2438 panic("pmap_enter: invalid page directory pdir=0x%lx, va=%p",
2439 (long)pmap->pm_pdir[PTDPTDI], (void *)va);
2442 pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2443 origpte = *(vm_offset_t *)pte;
2444 opa = origpte & PG_FRAME;
2446 if (origpte & PG_PS)
2447 panic("pmap_enter: attempted pmap_enter on 4MB page");
2450 * Mapping has not changed, must be protection or wiring change.
2452 if (origpte && (opa == pa)) {
2454 * Wiring change, just update stats. We don't worry about
2455 * wiring PT pages as they remain resident as long as there
2456 * are valid mappings in them. Hence, if a user page is wired,
2457 * the PT page will be also.
2459 if (wired && ((origpte & PG_W) == 0))
2460 pmap->pm_stats.wired_count++;
2461 else if (!wired && (origpte & PG_W))
2462 pmap->pm_stats.wired_count--;
2464 #if defined(PMAP_DIAGNOSTIC)
2465 if (pmap_nw_modified((pt_entry_t) origpte)) {
2466 kprintf("pmap_enter: modified page not "
2467 "writable: va: %p, pte: 0x%lx\n",
2468 (void *)va, (long )origpte);
2473 * We might be turning off write access to the page,
2474 * so we go ahead and sense modify status.
2476 if (origpte & PG_MANAGED) {
2477 if ((origpte & PG_M) && pmap_track_modified(va)) {
2479 om = PHYS_TO_VM_PAGE(opa);
2483 KKASSERT(m->flags & PG_MAPPED);
2488 * Mapping has changed, invalidate old range and fall through to
2489 * handle validating new mapping.
2491 * Since we have a ref on the page directory page pmap_pte()
2492 * will always return non-NULL.
2494 * NOTE: pmap_remove_pte() can block and cause the temporary ptbase
2495 * to get wiped. Reload the ptbase. I'm not sure if it is
2496 * also possible to race another pmap_enter() but check for
2500 KKASSERT((origpte & PG_FRAME) ==
2501 (*(vm_offset_t *)pte & PG_FRAME));
2502 if (prot & VM_PROT_NOSYNC) {
2503 prot &= ~VM_PROT_NOSYNC;
2504 pmap_inval_init(&info);
2506 pmap_remove_pte(pmap, pte, va, &info);
2507 pte = pmap_pte(pmap, va);
2508 origpte = *(vm_offset_t *)pte;
2509 opa = origpte & PG_FRAME;
2511 kprintf("pmap_enter: Warning, raced pmap %p va %p\n",
2517 * Enter on the PV list if part of our managed memory. Note that we
2518 * raise IPL while manipulating pv_table since pmap_enter can be
2519 * called at interrupt time.
2521 if (pmap_initialized &&
2522 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2523 pmap_insert_entry(pmap, pv, va, mpte, m);
2525 ptbase_assert(pmap);
2527 vm_page_flag_set(m, PG_MAPPED);
2531 * Increment counters
2533 ++pmap->pm_stats.resident_count;
2535 pmap->pm_stats.wired_count++;
2536 KKASSERT(*pte == 0);
2540 * Now validate mapping with desired protection/wiring.
2542 ptbase_assert(pmap);
2543 newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2547 if (va < UPT_MIN_ADDRESS)
2549 if (pmap == &kernel_pmap)
2551 newpte |= pat_pte_index[m->pat_mode];
2554 * If the mapping or permission bits are different, we need
2555 * to update the pte. If the pte is already present we have
2556 * to get rid of the extra wire-count on mpte we had obtained
2559 * mpte has a new wire_count, which also serves to prevent the
2560 * page table page from getting ripped out while we work. If we
2561 * are modifying an existing pte instead of installing a new one
2562 * we have to drop it.
2564 if ((origpte & ~(PG_M|PG_A)) != newpte) {
2565 if (prot & VM_PROT_NOSYNC)
2566 cpu_invlpg((void *)va);
2568 pmap_inval_interlock(&info, pmap, va);
2569 ptbase_assert(pmap);
2572 KKASSERT((*pte & PG_FRAME) == (newpte & PG_FRAME));
2573 if (mpte && vm_page_unwire_quick(mpte))
2574 panic("pmap_enter: Insufficient wire_count");
2577 *pte = newpte | PG_A;
2578 if ((prot & VM_PROT_NOSYNC) == 0)
2579 pmap_inval_deinterlock(&info, pmap);
2581 vm_page_flag_set(m, PG_WRITEABLE);
2584 KKASSERT((*pte & PG_FRAME) == (newpte & PG_FRAME));
2585 if (mpte && vm_page_unwire_quick(mpte))
2586 panic("pmap_enter: Insufficient wire_count");
2591 * NOTE: mpte invalid after this point if we block.
2593 KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
2594 if ((prot & VM_PROT_NOSYNC) == 0)
2595 pmap_inval_done(&info);
2598 lwkt_reltoken(&vm_token);
2599 vm_object_drop(pmap->pm_pteobj);
2603 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
2604 * This code also assumes that the pmap has no pre-existing entry for this VA.
2607 * This code currently may only be used on user pmaps, not kernel_pmap.
2612 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
2619 pmap_inval_info info;
2622 vm_object_hold(pmap->pm_pteobj);
2623 lwkt_gettoken(&vm_token);
2626 * This can block, so get it before we do anything important.
2628 if (pmap_initialized &&
2629 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2630 pv = get_pv_entry();
2635 pmap_inval_init(&info);
2637 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
2638 kprintf("Warning: pmap_enter_quick called on UVA with kernel_pmap\n");
2639 print_backtrace(-1);
2641 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
2642 kprintf("Warning: pmap_enter_quick called on KVA without kernel_pmap\n");
2643 print_backtrace(-1);
2646 KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */
2649 * Calculate the page table page (mpte), allocating it if necessary.
2651 * A held page table page (mpte), or NULL, is passed on to the
2652 * section following.
2654 if (va < UPT_MIN_ADDRESS) {
2656 * Calculate pagetable page index
2658 ptepindex = va >> PDRSHIFT;
2662 * Get the page directory entry
2664 ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
2667 * If the page table page is mapped, we just increment
2668 * the wire count, and activate it.
2672 panic("pmap_enter_quick: unexpected mapping into 4MB page");
2673 if ((mpte = pmap->pm_ptphint) != NULL &&
2674 (mpte->pindex == ptepindex) &&
2675 (mpte->flags & PG_BUSY) == 0) {
2676 vm_page_wire_quick(mpte);
2678 mpte = pmap_page_lookup(pmap->pm_pteobj,
2680 pmap->pm_ptphint = mpte;
2681 vm_page_wire_quick(mpte);
2682 vm_page_wakeup(mpte);
2685 mpte = _pmap_allocpte(pmap, ptepindex);
2687 } while (mpte == NULL);
2690 /* this code path is not yet used */
2694 * With a valid (and held) page directory page, we can just use
2695 * vtopte() to get to the pte. If the pte is already present
2696 * we do not disturb it.
2698 pte = (unsigned *)vtopte(va);
2700 KKASSERT(*pte & PG_V);
2701 pa = VM_PAGE_TO_PHYS(m);
2702 KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
2703 pmap_inval_done(&info);
2705 pmap_unwire_pte(pmap, mpte, &info);
2710 lwkt_reltoken(&vm_token);
2711 vm_object_drop(pmap->pm_pteobj);
2716 * Enter on the PV list if part of our managed memory
2718 if (pmap_initialized &&
2719 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2720 pmap_insert_entry(pmap, pv, va, mpte, m);
2722 vm_page_flag_set(m, PG_MAPPED);
2726 * Increment counters
2728 ++pmap->pm_stats.resident_count;
2730 pa = VM_PAGE_TO_PHYS(m);
2733 * Now validate mapping with RO protection
2735 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2736 *pte = pa | PG_V | PG_U;
2738 *pte = pa | PG_V | PG_U | PG_MANAGED;
2739 /* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
2740 pmap_inval_done(&info);
2745 lwkt_reltoken(&vm_token);
2746 vm_object_drop(pmap->pm_pteobj);
2750 * Make a temporary mapping for a physical address. This is only intended
2751 * to be used for panic dumps.
2753 * The caller is responsible for calling smp_invltlb().
2758 pmap_kenter_temporary(vm_paddr_t pa, long i)
2760 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
2761 return ((void *)crashdumpmap);
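
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * a hypothetical panic-dump writer could window physical memory through
 * crashdumpmap with pmap_kenter_temporary(), then invalidate the TLB once
 * per window as the comment above requires.  dump_write() is an assumed
 * callback, not a real kernel interface.
 */
#if 0
static void
dump_physical_window(vm_paddr_t base_pa, long npages,
		     void (*dump_write)(void *va, size_t len))
{
	void *va = NULL;
	long i;

	/* Map npages consecutive physical pages into crashdumpmap. */
	for (i = 0; i < npages; ++i)
		va = pmap_kenter_temporary(base_pa + i * PAGE_SIZE, i);

	/* The caller is responsible for the TLB invalidation. */
	smp_invltlb();

	/* va is the base of crashdumpmap; hand the window to the writer. */
	dump_write(va, npages * PAGE_SIZE);
}
#endif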
2764 #define MAX_INIT_PT (96)
2767 * This routine preloads the ptes for a given object into the specified pmap.
2768 * This eliminates the blast of soft faults on process startup and
2769 * immediately after an mmap.
2773 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
2776 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
2777 vm_object_t object, vm_pindex_t pindex,
2778 vm_size_t size, int limit)
2780 struct rb_vm_page_scan_info info;
2785 * We can't preinit if read access isn't set or there is no pmap
2788 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
2792 * We can't preinit if the pmap is not the current pmap
2794 lp = curthread->td_lwp;
2795 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
2798 psize = i386_btop(size);
2800 if ((object->type != OBJT_VNODE) ||
2801 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2802 (object->resident_page_count > MAX_INIT_PT))) {
2806 if (psize + pindex > object->size) {
2807 if (object->size < pindex)
2809 psize = object->size - pindex;
2816 * Use a red-black scan to traverse the requested range and load
2817 * any valid pages found into the pmap.
2819 * We cannot safely scan the object's memq unless we are in a
2820 * critical section since interrupts can remove pages from objects.
2822 info.start_pindex = pindex;
2823 info.end_pindex = pindex + psize - 1;
2829 vm_object_hold_shared(object);
2830 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
2831 pmap_object_init_pt_callback, &info);
2832 vm_object_drop(object);
2836 * The caller must hold vm_token.
2840 pmap_object_init_pt_callback(vm_page_t p, void *data)
2842 struct rb_vm_page_scan_info *info = data;
2843 vm_pindex_t rel_index;
2845 * Don't allow an madvise to blow away our really free
2846 * pages by allocating pv entries.
2848 if ((info->limit & MAP_PREFAULT_MADVISE) &&
2849 vmstats.v_free_count < vmstats.v_free_reserved) {
2854 * Ignore list markers and ignore pages we cannot instantly
2855 * busy (while holding the object token).
2857 if (p->flags & PG_MARKER)
2859 if (vm_page_busy_try(p, TRUE))
2861 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2862 (p->flags & PG_FICTITIOUS) == 0) {
2863 if ((p->queue - p->pc) == PQ_CACHE)
2864 vm_page_deactivate(p);
2865 rel_index = p->pindex - info->start_pindex;
2866 pmap_enter_quick(info->pmap,
2867 info->addr + i386_ptob(rel_index), p);
2874 * Return TRUE if the pmap is in shape to trivially
2875 * pre-fault the specified address.
2877 * Returns FALSE if it would be non-trivial or if a
2878 * pte is already loaded into the slot.
2883 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
2888 lwkt_gettoken(&vm_token);
2889 if ((*pmap_pde(pmap, addr)) == 0) {
2892 pte = (unsigned *) vtopte(addr);
2893 ret = (*pte) ? 0 : 1;
2895 lwkt_reltoken(&vm_token);
2900 * Change the wiring attribute for a map/virtual-address pair. The mapping
2901 * must already exist.
2906 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
2907 vm_map_entry_t entry __unused)
2914 lwkt_gettoken(&vm_token);
2915 pte = pmap_pte(pmap, va);
2917 if (wired && !pmap_pte_w(pte))
2918 pmap->pm_stats.wired_count++;
2919 else if (!wired && pmap_pte_w(pte))
2920 pmap->pm_stats.wired_count--;
2923 * Wiring is not a hardware characteristic so there is no need to
2924 * invalidate TLB. However, in an SMP environment we must use
2925 * a locked bus cycle to update the pte (if we are not using
2926 * the pmap_inval_*() API that is)... it's ok to do this for simple wiring changes.
2930 atomic_set_int(pte, PG_W);
2932 atomic_clear_int(pte, PG_W);
2933 lwkt_reltoken(&vm_token);
2937 * Copy the range specified by src_addr/len from the source map to the
2938 * range dst_addr/len in the destination map.
2940 * This routine is only advisory and need not do anything.
2945 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2946 vm_size_t len, vm_offset_t src_addr)
2952 * Zero the specified PA by mapping the page into KVM and clearing its contents.
2958 pmap_zero_page(vm_paddr_t phys)
2960 struct mdglobaldata *gd = mdcpu;
2963 if (*(int *)gd->gd_CMAP3)
2964 panic("pmap_zero_page: CMAP3 busy");
2965 *(int *)gd->gd_CMAP3 =
2966 PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2967 cpu_invlpg(gd->gd_CADDR3);
2968 bzero(gd->gd_CADDR3, PAGE_SIZE);
2969 *(int *) gd->gd_CMAP3 = 0;
2974 * Assert that a page is empty; panic if it isn't.
2979 pmap_page_assertzero(vm_paddr_t phys)
2981 struct mdglobaldata *gd = mdcpu;
2985 if (*(int *)gd->gd_CMAP3)
2986 panic("pmap_zero_page: CMAP3 busy");
2987 *(int *)gd->gd_CMAP3 =
2988 PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2989 cpu_invlpg(gd->gd_CADDR3);
2990 for (i = 0; i < PAGE_SIZE; i += 4) {
2991 if (*(int *)((char *)gd->gd_CADDR3 + i) != 0) {
2992 panic("pmap_page_assertzero() @ %p not zero!",
2993 (void *)gd->gd_CADDR3);
2996 *(int *) gd->gd_CMAP3 = 0;
3001 * Zero part of a physical page by mapping it into memory and clearing
3002 * its contents with bzero.
3004 * off and size may not cover an area beyond a single hardware page.
3009 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
3011 struct mdglobaldata *gd = mdcpu;
3014 if (*(int *) gd->gd_CMAP3)
3015 panic("pmap_zero_page: CMAP3 busy");
3016 *(int *) gd->gd_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
3017 cpu_invlpg(gd->gd_CADDR3);
3018 bzero((char *)gd->gd_CADDR3 + off, size);
3019 *(int *) gd->gd_CMAP3 = 0;
3024 * Copy the physical page from the source PA to the target PA.
3025 * This function may be called from an interrupt. No locking is required.
3031 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
3033 struct mdglobaldata *gd = mdcpu;
3036 if (*(int *) gd->gd_CMAP1)
3037 panic("pmap_copy_page: CMAP1 busy");
3038 if (*(int *) gd->gd_CMAP2)
3039 panic("pmap_copy_page: CMAP2 busy");
3041 *(int *) gd->gd_CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
3042 *(int *) gd->gd_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
3044 cpu_invlpg(gd->gd_CADDR1);
3045 cpu_invlpg(gd->gd_CADDR2);
3047 bcopy(gd->gd_CADDR1, gd->gd_CADDR2, PAGE_SIZE);
3049 *(int *) gd->gd_CMAP1 = 0;
3050 *(int *) gd->gd_CMAP2 = 0;
3055 * Copy the physical page from the source PA to the target PA.
3056 * This function may be called from an interrupt. No locking is required.
3062 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
3064 struct mdglobaldata *gd = mdcpu;
3067 if (*(int *) gd->gd_CMAP1)
3068 panic("pmap_copy_page: CMAP1 busy");
3069 if (*(int *) gd->gd_CMAP2)
3070 panic("pmap_copy_page: CMAP2 busy");
3072 *(int *) gd->gd_CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
3073 *(int *) gd->gd_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
3075 cpu_invlpg(gd->gd_CADDR1);
3076 cpu_invlpg(gd->gd_CADDR2);
3078 bcopy((char *)gd->gd_CADDR1 + (src & PAGE_MASK),
3079 (char *)gd->gd_CADDR2 + (dst & PAGE_MASK),
3082 *(int *) gd->gd_CMAP1 = 0;
3083 *(int *) gd->gd_CMAP2 = 0;
3088 * Returns true if the pmap's pv is one of the first
3089 * 16 pvs linked to from this page. This count may
3090 * be changed upwards or downwards in the future; it
3091 * is only necessary that true be returned for a small
3092 * subset of pmaps for proper page aging.
3097 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3102 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3105 lwkt_gettoken(&vm_token);
3106 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3107 if (pv->pv_pmap == pmap) {
3108 lwkt_reltoken(&vm_token);
3115 lwkt_reltoken(&vm_token);
3120 * Remove all pages from the specified address space; this aids
3121 * process exit speeds.  Also, this code
3122 * is special cased for current process only, but
3123 * can have the more generic (and slightly slower)
3124 * mode enabled. This is much faster than pmap_remove
3125 * in the case of running down an entire address space.
3130 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3133 unsigned *pte, tpte;
3136 pmap_inval_info info;
3138 int32_t save_generation;
3140 lp = curthread->td_lwp;
3141 if (lp && pmap == vmspace_pmap(lp->lwp_vmspace))
3146 if (pmap->pm_pteobj)
3147 vm_object_hold(pmap->pm_pteobj);
3148 lwkt_gettoken(&vm_token);
3149 pmap_inval_init(&info);
3151 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
3152 if (pv->pv_va >= eva || pv->pv_va < sva) {
3153 npv = TAILQ_NEXT(pv, pv_plist);
3157 KKASSERT(pmap == pv->pv_pmap);
3160 pte = (unsigned *)vtopte(pv->pv_va);
3162 pte = pmap_pte_quick(pmap, pv->pv_va);
3164 pmap_inval_interlock(&info, pmap, pv->pv_va);
3167 * We cannot remove wired pages from a process' mapping
3171 pmap_inval_deinterlock(&info, pmap);
3172 npv = TAILQ_NEXT(pv, pv_plist);
3176 tpte = loadandclear(pte);
3177 pmap_inval_deinterlock(&info, pmap);
3179 m = PHYS_TO_VM_PAGE(tpte);
3180 test_m_maps_pv(m, pv);
3182 KASSERT(m < &vm_page_array[vm_page_array_size],
3183 ("pmap_remove_pages: bad tpte %x", tpte));
3185 KKASSERT(pmap->pm_stats.resident_count > 0);
3186 --pmap->pm_stats.resident_count;
3189 * Update the vm_page_t clean and reference bits.
3195 npv = TAILQ_NEXT(pv, pv_plist);
3197 KKASSERT(pv->pv_m == m);
3198 KKASSERT(pv->pv_pmap == pmap);
3200 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
3201 save_generation = ++pmap->pm_generation;
3203 m->md.pv_list_count--;
3205 atomic_add_int(&m->object->agg_pv_list_count, -1);
3206 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3207 pmap_page_stats_deleting(m);
3208 if (TAILQ_EMPTY(&m->md.pv_list))
3209 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
3211 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info);
3215 * Restart the scan if we blocked during the unuse or free
3216 * calls and other removals were made.
3218 if (save_generation != pmap->pm_generation) {
3219 kprintf("Warning: pmap_remove_pages race-A avoided\n");
3220 npv = TAILQ_FIRST(&pmap->pm_pvlist);
3223 pmap_inval_done(&info);
3224 lwkt_reltoken(&vm_token);
3225 if (pmap->pm_pteobj)
3226 vm_object_drop(pmap->pm_pteobj);
3230 * pmap_testbit tests bits in PTEs.  Note that the testbit/clearbit
3231 * routines are inline, and a lot of things are evaluated at
3232 * compile time.
3234 * The caller must hold vm_token.
3237 pmap_testbit(vm_page_t m, int bit)
3242 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3245 if (TAILQ_FIRST(&m->md.pv_list) == NULL)
3248 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3250 * If the bit being tested is the modified bit, then
3251 * mark clean_map and ptes as never modified.
3254 if (bit & (PG_A|PG_M)) {
3255 if (!pmap_track_modified(pv->pv_va))
3259 #if defined(PMAP_DIAGNOSTIC)
3261 kprintf("Null pmap (tb) at va: %p\n",
3266 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3275 * This routine is used to modify bits in ptes
3277 * The caller must hold vm_token.
3279 static __inline void
3280 pmap_clearbit(vm_page_t m, int bit)
3282 struct pmap_inval_info info;
3287 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3290 pmap_inval_init(&info);
3293 * Loop over all current mappings, setting/clearing as appropriate.
3294 * If setting RO, do we need to clear the VAC?
3296 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3298 * don't write protect pager mappings
3301 if (!pmap_track_modified(pv->pv_va))
3305 #if defined(PMAP_DIAGNOSTIC)
3307 kprintf("Null pmap (cb) at va: %p\n",
3314 * Careful here. We can use a locked bus instruction to
3315 * clear PG_A or PG_M safely but we need to synchronize
3316 * with the target cpus when we mess with PG_RW.
3318 * We do not have to force synchronization when clearing
3319 * PG_M even for PTEs generated via virtual memory maps,
3320 * because the virtual kernel will invalidate the pmap
3321 * entry when/if it needs to resynchronize the Modify bit.
3324 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
3325 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3332 atomic_clear_int(pte, PG_M|PG_RW);
3335 * The cpu may be trying to set PG_M
3336 * simultaneously with our clearing of it.
3339 if (!atomic_cmpset_int(pte, pbits,
3343 } else if (bit == PG_M) {
3345 * We could also clear PG_RW here to force
3346 * a fault on write to redetect PG_M for
3347 * virtual kernels, but it isn't necessary
3348 * since virtual kernels invalidate the pte
3349 * when they clear the VPTE_M bit in their
3350 * virtual page tables.
3352 atomic_clear_int(pte, PG_M);
3354 atomic_clear_int(pte, bit);
3358 pmap_inval_deinterlock(&info, pv->pv_pmap);
3360 pmap_inval_done(&info);
3364 * Lower the permission for all mappings to a given page.
3369 pmap_page_protect(vm_page_t m, vm_prot_t prot)
3371 if ((prot & VM_PROT_WRITE) == 0) {
3372 lwkt_gettoken(&vm_token);
3373 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3374 pmap_clearbit(m, PG_RW);
3375 vm_page_flag_clear(m, PG_WRITEABLE);
3379 lwkt_reltoken(&vm_token);
3384 * Return the physical address given a physical page index.
3389 pmap_phys_address(vm_pindex_t ppn)
3391 return (i386_ptob(ppn));
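
/*
 * Worked example (editor addition): with the usual i386 PAGE_SHIFT of 12,
 * i386_ptob(0x1234) == 0x1234 << 12 == 0x01234000, i.e. page index 0x1234
 * corresponds to physical address 0x01234000.
 */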
3395 * Return a count of reference bits for a page, clearing those bits.
3396 * It is not necessary for every reference bit to be cleared, but it
3397 * is necessary that 0 only be returned when there are truly no
3398 * reference bits set.
3403 pmap_ts_referenced(vm_page_t m)
3405 pv_entry_t pv, pvf, pvn;
3409 if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3412 lwkt_gettoken(&vm_token);
3414 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3419 pvn = TAILQ_NEXT(pv, pv_list);
3421 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3422 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3424 if (!pmap_track_modified(pv->pv_va))
3427 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3429 if (pte && (*pte & PG_A)) {
3430 atomic_clear_int(pte, PG_A);
3436 } while ((pv = pvn) != NULL && pv != pvf);
3439 lwkt_reltoken(&vm_token);
3445 * Return whether or not the specified physical page was modified
3446 * in any physical maps.
3451 pmap_is_modified(vm_page_t m)
3455 lwkt_gettoken(&vm_token);
3456 res = pmap_testbit(m, PG_M);
3457 lwkt_reltoken(&vm_token);
3462 * Clear the modify bits on the specified physical page.
3467 pmap_clear_modify(vm_page_t m)
3469 lwkt_gettoken(&vm_token);
3470 pmap_clearbit(m, PG_M);
3471 lwkt_reltoken(&vm_token);
3475 * Clear the reference bit on the specified physical page.
3480 pmap_clear_reference(vm_page_t m)
3482 lwkt_gettoken(&vm_token);
3483 pmap_clearbit(m, PG_A);
3484 lwkt_reltoken(&vm_token);
3488 * Miscellaneous support routines follow
3490 * Called from the low level boot code only.
3493 i386_protection_init(void)
3497 kp = protection_codes;
3498 for (prot = 0; prot < 8; prot++) {
3500 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3502 * Read access is also 0. There isn't any execute bit,
3503 * so just make it readable.
3505 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3506 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3507 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3510 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3511 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3512 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3513 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3521 * Map a set of physical memory pages into the kernel virtual
3522 * address space. Return a pointer to where it is mapped. This
3523 * routine is intended to be used for mapping device memory, not real memory.
3526 * NOTE: We can't use pgeflag unless we invalidate the pages one at a time.
3529 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE}
3530 * work whether the cpu supports PAT or not. The remaining PAT
3531 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu supports PAT.
3535 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
3537 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
3541 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
3543 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
3547 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
3549 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
3553 * Map a set of physical memory pages into the kernel virtual
3554 * address space. Return a pointer to where it is mapped. This
3555 * routine is intended to be used for mapping device memory, not real memory.
3559 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
3561 vm_offset_t va, tmpva, offset;
3565 offset = pa & PAGE_MASK;
3566 size = roundup(offset + size, PAGE_SIZE);
3568 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
3570 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3572 pa = pa & ~PAGE_MASK;
3573 for (tmpva = va, tmpsize = size; tmpsize > 0;) {
3574 pte = vtopte(tmpva);
3575 *pte = pa | PG_RW | PG_V | /* pgeflag | */
3576 pat_pte_index[mode];
3577 tmpsize -= PAGE_SIZE;
3581 pmap_invalidate_range(&kernel_pmap, va, va + size);
3582 pmap_invalidate_cache_range(va, va + size);
3584 return ((void *)(va + offset));
3591 pmap_unmapdev(vm_offset_t va, vm_size_t size)
3593 vm_offset_t base, offset;
3595 base = va & PG_FRAME;
3596 offset = va & PAGE_MASK;
3597 size = roundup(offset + size, PAGE_SIZE);
3598 pmap_qremove(va, size >> PAGE_SHIFT);
3599 kmem_free(&kernel_map, base, size);
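
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * a hypothetical driver might map a register BAR uncacheable at attach
 * time and release the KVA again at detach.  struct mydev_softc and
 * MYDEV_REG_SIZE exist only for this example.
 */
#if 0
static void
mydev_map_regs(struct mydev_softc *sc, vm_paddr_t bar_pa)
{
	/* Strongly uncacheable mapping for device registers. */
	sc->regs = pmap_mapdev_uncacheable(bar_pa, MYDEV_REG_SIZE);
}

static void
mydev_unmap_regs(struct mydev_softc *sc)
{
	/* Unmap and free the kernel virtual memory obtained above. */
	pmap_unmapdev((vm_offset_t)sc->regs, MYDEV_REG_SIZE);
}
#endif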
3603 * Sets the memory attribute for the specified page.
3606 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
3613 * The following code is NOP, until we get pmap_change_attr()
3618 * If "m" is a normal page, update its direct mapping. This update
3619 * can be relied upon to perform any cache operations that are
3620 * required for data coherence.
3622 if ((m->flags & PG_FICTITIOUS) == 0)
3623 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
3629 * Change the PAT attribute on an existing kernel memory map. Caller
3630 * must ensure that the virtual memory in question is not accessed
3631 * during the adjustment.
3634 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
3641 panic("pmap_change_attr: va is NULL");
3642 base = trunc_page(va);
3646 *pte = (*pte & ~(pt_entry_t)(PG_PTE_PAT | PG_NC_PCD |
3648 pat_pte_index[mode];
3653 changed = 1; /* XXX: not optimal */
3656 * Flush CPU caches if required, to make sure nothing remains
3657 * cached that shouldn't be.
3660 pmap_invalidate_range(&kernel_pmap, base, va);
3661 pmap_invalidate_cache_range(base, va);
3666 * Perform the pmap work for mincore
3668 * The caller must hold vm_token if the caller wishes a stable result,
3669 * and even in that case some bits can change due to third party accesses
3675 pmap_mincore(pmap_t pmap, vm_offset_t addr)
3677 unsigned *ptep, pte;
3681 lwkt_gettoken(&vm_token);
3682 ptep = pmap_pte(pmap, addr);
3684 if (ptep && (pte = *ptep) != 0) {
3687 val = MINCORE_INCORE;
3688 if ((pte & PG_MANAGED) == 0)
3691 pa = pte & PG_FRAME;
3693 m = PHYS_TO_VM_PAGE(pa);
3699 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3700 } else if (m->dirty || pmap_is_modified(m)) {
3702 * Modified by someone else
3704 val |= MINCORE_MODIFIED_OTHER;
3711 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3712 } else if ((m->flags & PG_REFERENCED) ||
3713 pmap_ts_referenced(m)) {
3715 * Referenced by someone else
3717 val |= MINCORE_REFERENCED_OTHER;
3718 vm_page_flag_set(m, PG_REFERENCED);
3722 lwkt_reltoken(&vm_token);
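
/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the mincore(2) path is the usual consumer, but any caller could decode
 * the returned bit mask along these lines.  The kprintf() diagnostics are
 * purely hypothetical.
 */
#if 0
static void
report_mincore_bits(pmap_t pmap, vm_offset_t addr)
{
	int val = pmap_mincore(pmap, addr);

	if (val & MINCORE_INCORE)
		kprintf("%p: resident\n", (void *)addr);
	if (val & (MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER))
		kprintf("%p: modified\n", (void *)addr);
	if (val & (MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER))
		kprintf("%p: referenced\n", (void *)addr);
}
#endif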
3727 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new
3728 * vmspace will be ref'd and the old one will be deref'd.
3730 * cr3 will be reloaded if any lwp is the current lwp.
3732 * Only called with new VM spaces.
3733 * The process must have only a single thread.
3734 * The process must hold the vmspace->vm_map.token for oldvm and newvm
3735 * No other requirements.
3738 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
3740 struct vmspace *oldvm;
3743 oldvm = p->p_vmspace;
3744 if (oldvm != newvm) {
3746 sysref_get(&newvm->vm_sysref);
3747 p->p_vmspace = newvm;
3748 KKASSERT(p->p_nthreads == 1);
3749 lp = RB_ROOT(&p->p_lwp_tree);
3750 pmap_setlwpvm(lp, newvm);
3752 sysref_put(&oldvm->vm_sysref);
3757 * Set the vmspace for a LWP. The vmspace is almost universally set the
3758 * same as the process vmspace, but virtual kernels need to swap out contexts
3759 * on a per-lwp basis.
3761 * Always called with a lp under the caller's direct control, either
3762 * unscheduled or the current lwp.
3767 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
3769 struct vmspace *oldvm;
3772 oldvm = lp->lwp_vmspace;
3774 if (oldvm != newvm) {
3775 lp->lwp_vmspace = newvm;
3776 if (curthread->td_lwp == lp) {
3777 pmap = vmspace_pmap(newvm);
3778 ATOMIC_CPUMASK_ORMASK(pmap->pm_active,
3780 if (pmap->pm_active_lock & CPULOCK_EXCL)
3781 pmap_interlock_wait(newvm);
3782 #if defined(SWTCH_OPTIM_STATS)
3785 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pdir);
3786 load_cr3(curthread->td_pcb->pcb_cr3);
3787 pmap = vmspace_pmap(oldvm);
3788 ATOMIC_CPUMASK_NANDMASK(pmap->pm_active,
3795 * Called when switching to a locked pmap, used to interlock against pmaps
3796 * undergoing modifications to prevent us from activating the MMU for the
3797 * target pmap until all such modifications have completed. We have to do
3798 * this because the thread making the modifications has already set up its
3799 * SMP synchronization mask.
3804 pmap_interlock_wait(struct vmspace *vm)
3806 struct pmap *pmap = &vm->vm_pmap;
3808 if (pmap->pm_active_lock & CPULOCK_EXCL) {
3810 DEBUG_PUSH_INFO("pmap_interlock_wait");
3811 while (pmap->pm_active_lock & CPULOCK_EXCL) {
3813 lwkt_process_ipiq();
3821 * Return a page-directory alignment hint for device mappings which will
3822 * allow the use of super-pages for the mapping.
3827 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3830 if ((obj == NULL) || (size < NBPDR) ||
3831 ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) {
3835 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
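
/*
 * Worked example (editor addition): with a 4MB page-directory span
 * (NBPDR == 0x400000 on non-PAE i386), an addr of 0x00543210 rounds up
 * to 0x00800000, giving the caller a superpage-aligned hint.
 */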
3840 * Return whether the PGE flag is supported globally.
3845 pmap_get_pgeflag(void)
3851 * Used by kmalloc/kfree, page already exists at va
3854 pmap_kvtom(vm_offset_t va)
3856 return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME));
3860 pmap_object_init(vm_object_t object)
3866 pmap_object_free(vm_object_t object)