From 701c977e41807eb03343c695da4625c091d152b1 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 26 Oct 2011 11:42:18 -0700 Subject: [PATCH] kernel - Rewrite the x86-64 pmap code * Use unassociated VM pages (without a VM object) for all page table pages. * Remove kptobj and pmap->pm_pteobj. * For the moment implement a Red-Black tree for pv_entry_t manipulation. Revamp the pindex to include all page table page levels, from terminal pages to the PML4 page. The hierarchy is now arranged via the PV system. * As before, the kernel page tables only use PV entries for terminal pages. * Refactor the locking to allow blocking operations during deep scans. Individual PV entries are now locked and critical PMAP operations do not require the pmap->pm_token. This should greatly improve threaded program performance. * Fix kgdb on the live kernel (pmap_extract() was not handling short-cutted page directory pages). --- sys/platform/pc64/include/pmap.h | 54 +- sys/platform/pc64/include/vmparam.h | 2 +- sys/platform/pc64/x86_64/machdep.c | 1 + sys/platform/pc64/x86_64/pmap.c | 3275 +++++++++++++-------------- 4 files changed, 1673 insertions(+), 1659 deletions(-) diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h index 8469067e63..563809e27b 100644 --- a/sys/platform/pc64/include/pmap.h +++ b/sys/platform/pc64/include/pmap.h @@ -85,9 +85,21 @@ #define NKPML4E 1 /* number of kernel PML4 slots */ /* NKPDPE defined in vmparam.h */ -#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ -#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ -#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ +/* + * NUPDPs 512 (256 user) number of PDPs in user page table + * NUPDs 512 * 512 number of PDs in user page table + * NUPTs 512 * 512 * 512 number of PTs in user page table + * NUPTEs 512 * 512 * 512 * 512 number of PTEs in user page table + * + * NUPDP_USER number of PDPs reserved for userland + * NUPTE_USER number of PTEs reserved for userland (big number) + */ +#define NUPDP_USER (NPML4EPG/2) +#define NUPDP_TOTAL (NPML4EPG) +#define NUPD_TOTAL (NPDPEPG * NUPDP_TOTAL) +#define NUPT_TOTAL (NPDEPG * NUPD_TOTAL) +#define NUPTE_TOTAL ((vm_pindex_t)NPTEPG * NUPT_TOTAL) +#define NUPTE_USER ((vm_pindex_t)NPTEPG * NPDEPG * NPDPEPG * NUPDP_USER) #define NDMPML4E 1 /* number of dmap PML4 slots */ @@ -131,6 +143,9 @@ #ifndef _SYS_QUEUE_H_ #include #endif +#ifndef _SYS_TREE_H_ +#include +#endif #ifndef _SYS_SPINLOCK_H_ #include #endif @@ -192,8 +207,6 @@ struct vm_object; struct vmspace; struct md_page { - int pv_list_count; - int pv_generation; TAILQ_HEAD(,pv_entry) pv_list; }; @@ -212,20 +225,21 @@ struct pmap_statistics { }; typedef struct pmap_statistics *pmap_statistics_t; +struct pv_entry_rb_tree; +RB_PROTOTYPE2(pv_entry_rb_tree, pv_entry, pv_entry, + pv_entry_compare, vm_pindex_t); + struct pmap { pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ - struct vm_page *pm_pdirm; /* VM page for pg directory */ - struct vm_object *pm_pteobj; /* Container for pte's */ + struct pv_entry *pm_pmlpv; /* PV entry for pml4 */ TAILQ_ENTRY(pmap) pm_pmnode; /* list of pmaps */ - TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ - TAILQ_HEAD(,pv_entry) pm_pvlist_free; /* free mappings */ + RB_HEAD(pv_entry_rb_tree, pv_entry) pm_pvroot; int pm_count; /* reference count */ cpumask_t pm_active; /* active on cpus */ int pm_filler02; /* (filler sync w/vkernel) */ struct pmap_statistics pm_stats; /* pmap statistics */ - struct vm_page *pm_ptphint; /* pmap 
ptp hint */ + struct pv_entry *pm_pvhint; /* pv_entry lookup hint */ int pm_generation; /* detect pvlist deletions */ - int pm_hold; struct spinlock pm_spin; struct lwkt_token pm_token; }; @@ -247,13 +261,23 @@ extern struct pmap kernel_pmap; */ typedef struct pv_entry { pmap_t pv_pmap; /* pmap where mapping lies */ - vm_offset_t pv_va; /* virtual address for mapping */ + vm_pindex_t pv_pindex; /* PTE, PT, PD, PDP, or PML4 */ TAILQ_ENTRY(pv_entry) pv_list; - TAILQ_ENTRY(pv_entry) pv_plist; - struct vm_page *pv_ptem; /* VM page for pte */ - u_int pv_hold; /* hold on destruction count */ + RB_ENTRY(pv_entry) pv_entry; + struct vm_page *pv_m; /* page being mapped */ + u_int pv_hold; /* interlock action */ + u_int pv_unused01; +#ifdef PMAP_DEBUG + const char *pv_func; + int pv_line; +#endif } *pv_entry_t; +#define PV_HOLD_LOCKED 0x80000000U +#define PV_HOLD_WAITING 0x40000000U +#define PV_HOLD_DELETED 0x20000000U +#define PV_HOLD_MASK 0x1FFFFFFFU + #ifdef _KERNEL #define NPPROVMTRR 8 diff --git a/sys/platform/pc64/include/vmparam.h b/sys/platform/pc64/include/vmparam.h index 35c593874b..2e575e6417 100644 --- a/sys/platform/pc64/include/vmparam.h +++ b/sys/platform/pc64/include/vmparam.h @@ -124,7 +124,7 @@ #define UPT_MIN_ADDRESS KVADDR(PML4PML4I, 0, 0, 0) #define VM_MIN_USER_ADDRESS ((vm_offset_t)0) -#define VM_MAX_USER_ADDRESS UVADDR(NUPML4E, 0, 0, 0) +#define VM_MAX_USER_ADDRESS UVADDR(NUPDP_USER, 0, 0, 0) #define USRSTACK VM_MAX_USER_ADDRESS diff --git a/sys/platform/pc64/x86_64/machdep.c b/sys/platform/pc64/x86_64/machdep.c index f662a3ca9d..39361c2242 100644 --- a/sys/platform/pc64/x86_64/machdep.c +++ b/sys/platform/pc64/x86_64/machdep.c @@ -1010,6 +1010,7 @@ cpu_idle(void) if (quick && (cpu_mi_feature & CPU_MI_MONITOR) && (reqflags & RQF_IDLECHECK_WK_MASK) == 0) { + splz(); /* XXX */ cpu_mmw_pause_int(&gd->gd_reqflags, reqflags); ++cpu_idle_hltcnt; } else if (cpu_idle_hlt) { diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c index ed13709b2f..38a23e0956 100644 --- a/sys/platform/pc64/x86_64/pmap.c +++ b/sys/platform/pc64/x86_64/pmap.c @@ -8,6 +8,7 @@ * Copyright (c) 2005-2008 Alan L. Cox * Copyright (c) 2008, 2009 The DragonFly Project. * Copyright (c) 2008, 2009 Jordan Gordeev. + * Copyright (c) 2011 Matthew Dillon * All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -127,10 +128,40 @@ #define MINPV 2048 +/* + * pmap debugging will report who owns a pv lock when blocking. 
+ */ +#ifdef PMAP_DEBUG + +#define PMAP_DEBUG_DECL ,const char *func, int lineno +#define PMAP_DEBUG_ARGS , __func__, __LINE__ +#define PMAP_DEBUG_COPY , func, lineno + +#define pv_get(pmap, pindex) _pv_get(pmap, pindex \ + PMAP_DEBUG_ARGS) +#define pv_lock(pv) _pv_lock(pv \ + PMAP_DEBUG_ARGS) +#define pv_hold_try(pv) _pv_hold_try(pv \ + PMAP_DEBUG_ARGS) +#define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp \ + PMAP_DEBUG_ARGS) + +#else + +#define PMAP_DEBUG_DECL +#define PMAP_DEBUG_ARGS +#define PMAP_DEBUG_COPY + +#define pv_get(pmap, pindex) _pv_get(pmap, pindex) +#define pv_lock(pv) _pv_lock(pv) +#define pv_hold_try(pv) _pv_hold_try(pv) +#define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp) + +#endif + /* * Get PDEs and PTEs for user/kernel address space */ -static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va); #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(pd_entry_t *)pte & PG_V) != 0) @@ -139,7 +170,6 @@ static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va); #define pmap_pte_u(pte) ((*(pt_entry_t *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(pt_entry_t *)pte & PG_V) != 0) - /* * Given a map and a machine independent protection code, * convert to a vax protection code. @@ -164,8 +194,6 @@ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ static int pgeflag; /* PG_G or-in */ static int pseflag; /* PG_PS or-in */ -static vm_object_t kptobj; - static int ndmpdp; static vm_paddr_t dmaplimit; static int nkpt; @@ -187,7 +215,7 @@ static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static vm_zone_t pvzone; static struct vm_zone pvzone_store; static struct vm_object pvzone_obj; -static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; +static int pv_entry_max=0, pv_entry_high_water=0; static int pmap_pagedaemon_waken = 0; static struct pv_entry *pvinit; @@ -211,32 +239,58 @@ SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW, #define DISABLE_PSE -static pv_entry_t get_pv_entry (void); +static void pv_hold(pv_entry_t pv); +static int _pv_hold_try(pv_entry_t pv + PMAP_DEBUG_DECL); +static void pv_drop(pv_entry_t pv); +static void _pv_lock(pv_entry_t pv + PMAP_DEBUG_DECL); +static void pv_unlock(pv_entry_t pv); +static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew + PMAP_DEBUG_DECL); +static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex + PMAP_DEBUG_DECL); +static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp); +static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex); +static void pv_put(pv_entry_t pv); +static void pv_free(pv_entry_t pv); +static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex); +static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, + pv_entry_t *pvpp); +static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, + struct pmap_inval_info *info); +static vm_page_t pmap_remove_pv_page(pv_entry_t pv, int holdpg); + +static void pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info, + pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va, + pt_entry_t *ptep, void *arg __unused); +static void pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info, + pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va, + pt_entry_t *ptep, void *arg __unused); + static void i386_protection_init (void); static void create_pagetables(vm_paddr_t *firstaddr); static void pmap_remove_all (vm_page_t m); -static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq, - vm_offset_t 
sva, pmap_inval_info_t info); -static void pmap_remove_page (struct pmap *pmap, - vm_offset_t va, pmap_inval_info_t info); -static int pmap_remove_entry (struct pmap *pmap, vm_page_t m, - vm_offset_t va, pmap_inval_info_t info); static boolean_t pmap_testbit (vm_page_t m, int bit); -static void pmap_insert_entry (pmap_t pmap, vm_offset_t va, - vm_page_t mpte, vm_page_t m); - -static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va); -static int pmap_release_free_page (pmap_t pmap, vm_page_t p); -static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex); static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va); -static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex); -static int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, - pmap_inval_info_t info); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); static unsigned pdir4mb; +static int +pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) +{ + if (pv1->pv_pindex < pv2->pv_pindex) + return(-1); + if (pv1->pv_pindex > pv2->pv_pindex) + return(1); + return(0); +} + +RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, + pv_entry_compare, vm_pindex_t, pv_pindex); + /* * Move the kernel virtual free pointer to the next * 2MB. This is used to help improve performance @@ -270,143 +324,230 @@ pmap_pte_quick(pmap_t pmap, vm_offset_t va) return pmap_pte(pmap, va); } -/* Return a non-clipped PD index for a given VA */ +/* + * Returns the pindex of a page table entry (representing a terminal page). + * There are NUPTE_TOTAL page table entries possible (a huge number) + * + * x86-64 has a 48-bit address space, where bit 47 is sign-extended out. + * We want to properly translate negative KVAs. + */ static __inline vm_pindex_t -pmap_pde_pindex(vm_offset_t va) +pmap_pte_pindex(vm_offset_t va) { - return va >> PDRSHIFT; + return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); } -/* Return various clipped indexes for a given VA */ +/* + * Returns the pindex of a page table. + */ static __inline vm_pindex_t -pmap_pte_index(vm_offset_t va) +pmap_pt_pindex(vm_offset_t va) { + return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); +} - return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); +/* + * Returns the pindex of a page directory. + */ +static __inline +vm_pindex_t +pmap_pd_pindex(vm_offset_t va) +{ + return (NUPTE_TOTAL + NUPT_TOTAL + + ((va >> PDPSHIFT) & (NUPD_TOTAL - 1))); } static __inline vm_pindex_t -pmap_pde_index(vm_offset_t va) +pmap_pdp_pindex(vm_offset_t va) { + return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + + ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1))); +} - return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); +static __inline +vm_pindex_t +pmap_pml4_pindex(void) +{ + return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); } +/* + * Return various clipped indexes for a given VA + * + * Returns the index of a pte in a page table, representing a terminal + * page. + */ static __inline vm_pindex_t -pmap_pdpe_index(vm_offset_t va) +pmap_pte_index(vm_offset_t va) { + return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); +} - return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); +/* + * Returns the index of a pt in a page directory, representing a page + * table. + */ +static __inline +vm_pindex_t +pmap_pt_index(vm_offset_t va) +{ + return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); } +/* + * Returns the index of a pd in a page directory page, representing a page + * directory. 
+ */ static __inline vm_pindex_t -pmap_pml4e_index(vm_offset_t va) +pmap_pd_index(vm_offset_t va) { + return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); +} +/* + * Returns the index of a pdp in the pml4 table, representing a page + * directory page. + */ +static __inline +vm_pindex_t +pmap_pdp_index(vm_offset_t va) +{ return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } -/* Return a pointer to the PML4 slot that corresponds to a VA */ +/* + * Generic procedure to index a pte from a pt, pd, or pdp. + */ +static +void * +pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex) +{ + pt_entry_t *pte; + + pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m)); + return(&pte[pindex]); +} + +/* + * Return pointer to PDP slot in the PML4 + */ static __inline pml4_entry_t * -pmap_pml4e(pmap_t pmap, vm_offset_t va) +pmap_pdp(pmap_t pmap, vm_offset_t va) { - - return (&pmap->pm_pml4[pmap_pml4e_index(va)]); + return (&pmap->pm_pml4[pmap_pdp_index(va)]); } -/* Return a pointer to the PDP slot that corresponds to a VA */ +/* + * Return pointer to PD slot in the PDP given a pointer to the PDP + */ static __inline pdp_entry_t * -pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) +pmap_pdp_to_pd(pml4_entry_t *pdp, vm_offset_t va) { - pdp_entry_t *pdpe; + pdp_entry_t *pd; - pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); - return (&pdpe[pmap_pdpe_index(va)]); + pd = (pdp_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); + return (&pd[pmap_pd_index(va)]); } -/* Return a pointer to the PDP slot that corresponds to a VA */ +/* + * Return pointer to PD slot in the PDP + **/ static __inline pdp_entry_t * -pmap_pdpe(pmap_t pmap, vm_offset_t va) +pmap_pd(pmap_t pmap, vm_offset_t va) { - pml4_entry_t *pml4e; + pml4_entry_t *pdp; - pml4e = pmap_pml4e(pmap, va); - if ((*pml4e & PG_V) == 0) + pdp = pmap_pdp(pmap, va); + if ((*pdp & PG_V) == 0) return NULL; - return (pmap_pml4e_to_pdpe(pml4e, va)); + return (pmap_pdp_to_pd(pdp, va)); } -/* Return a pointer to the PD slot that corresponds to a VA */ +/* + * Return pointer to PT slot in the PD given a pointer to the PD + */ static __inline pd_entry_t * -pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) +pmap_pd_to_pt(pdp_entry_t *pd, vm_offset_t va) { - pd_entry_t *pde; + pd_entry_t *pt; - pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); - return (&pde[pmap_pde_index(va)]); + pt = (pd_entry_t *)PHYS_TO_DMAP(*pd & PG_FRAME); + return (&pt[pmap_pt_index(va)]); } -/* Return a pointer to the PD slot that corresponds to a VA */ +/* + * Return pointer to PT slot in the PD + */ static __inline pd_entry_t * -pmap_pde(pmap_t pmap, vm_offset_t va) +pmap_pt(pmap_t pmap, vm_offset_t va) { - pdp_entry_t *pdpe; + pdp_entry_t *pd; - pdpe = pmap_pdpe(pmap, va); - if (pdpe == NULL || (*pdpe & PG_V) == 0) + pd = pmap_pd(pmap, va); + if (pd == NULL || (*pd & PG_V) == 0) return NULL; - return (pmap_pdpe_to_pde(pdpe, va)); + return (pmap_pd_to_pt(pd, va)); } -/* Return a pointer to the PT slot that corresponds to a VA */ +/* + * Return pointer to PTE slot in the PT given a pointer to the PT + */ static __inline pt_entry_t * -pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) +pmap_pt_to_pte(pd_entry_t *pt, vm_offset_t va) { pt_entry_t *pte; - pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); + pte = (pt_entry_t *)PHYS_TO_DMAP(*pt & PG_FRAME); return (&pte[pmap_pte_index(va)]); } -/* Return a pointer to the PT slot that corresponds to a VA */ +/* + * Return pointer to PTE slot in the PT + */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { - pd_entry_t 
*pde; + pd_entry_t *pt; - pde = pmap_pde(pmap, va); - if (pde == NULL || (*pde & PG_V) == 0) - return NULL; - if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ - return ((pt_entry_t *)pde); - return (pmap_pde_to_pte(pde, va)); + pt = pmap_pt(pmap, va); + if (pt == NULL || (*pt & PG_V) == 0) + return NULL; + if ((*pt & PG_PS) != 0) + return ((pt_entry_t *)pt); + return (pmap_pt_to_pte(pt, va)); } +/* + * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is + * the PT layer. This will speed up core pmap operations considerably. + */ static __inline -pt_entry_t * -vtopte(vm_offset_t va) +void +pv_cache(pv_entry_t pv, vm_pindex_t pindex) { - uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + - NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); - - return (PTmap + ((va >> PAGE_SHIFT) & mask)); + if (pindex >= pmap_pt_pindex(0) && pindex <= pmap_pd_pindex(0)) + pv->pv_pmap->pm_pvhint = pv; } + +/* + * KVM - return address of PT slot in PD + */ static __inline pd_entry_t * -vtopde(vm_offset_t va) +vtopt(vm_offset_t va) { uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); @@ -414,6 +555,19 @@ vtopde(vm_offset_t va) return (PDmap + ((va >> PDRSHIFT) & mask)); } +/* + * KVM - return address of PTE slot in PT + */ +static __inline +pt_entry_t * +vtopte(vm_offset_t va) +{ + uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + + return (PTmap + ((va >> PAGE_SHIFT) & mask)); +} + static uint64_t allocpages(vm_paddr_t *firstaddr, long n) { @@ -616,17 +770,11 @@ pmap_bootstrap(vm_paddr_t *firstaddr) * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). - * - * The kernel_pmap's pm_pteobj is used only for locking and not - * for mmu pages. */ kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); kernel_pmap.pm_count = 1; kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK; - kernel_pmap.pm_pteobj = &kernel_object; - TAILQ_INIT(&kernel_pmap.pm_pvlist); - TAILQ_INIT(&kernel_pmap.pm_pvlist_free); - kernel_pmap.pm_hold = 0; + RB_INIT(&kernel_pmap.pm_pvroot); spin_init(&kernel_pmap.pm_spin); lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok"); @@ -764,23 +912,16 @@ pmap_init(void) int i; int initial_pvs; - /* - * object for kernel page table pages - */ - /* JG I think the number can be arbitrary */ - kptobj = vm_object_allocate(OBJT_DEFAULT, 5); - /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ - for(i = 0; i < vm_page_array_size; i++) { + for (i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); - m->md.pv_list_count = 0; } /* @@ -856,8 +997,9 @@ pmap_nw_modified(pt_entry_t pte) */ static __inline int -pmap_track_modified(vm_offset_t va) +pmap_track_modified(vm_pindex_t pindex) { + vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT; if ((va < clean_sva) || (va >= clean_eva)) return 1; else @@ -866,31 +1008,55 @@ pmap_track_modified(vm_offset_t va) /* * Extract the physical page address associated with the map/VA pair. + * The page must be wired for this to work reliably. * - * The caller must hold pmap->pm_token if non-blocking operation is desired. + * XXX for the moment we're using pv_find() instead of pv_get(), as + * callers might be expecting non-blocking operation. 
*/ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; - pt_entry_t *pte; - pd_entry_t pde, *pdep; + pv_entry_t pt_pv; + pt_entry_t *ptep; - lwkt_gettoken(&pmap->pm_token); rtval = 0; - pdep = pmap_pde(pmap, va); - if (pdep != NULL) { - pde = *pdep; - if (pde) { - if ((pde & PG_PS) != 0) { - rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); + if (va >= VM_MAX_USER_ADDRESS) { + /* + * Kernel page directories might be direct-mapped and + * there is typically no PV tracking of pte's + */ + pd_entry_t *pt; + + pt = pmap_pt(pmap, va); + if (pt && (*pt & PG_V)) { + if (*pt & PG_PS) { + rtval = *pt & PG_PS_FRAME; + rtval |= va & PDRMASK; } else { - pte = pmap_pde_to_pte(pdep, va); - rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); + ptep = pmap_pt_to_pte(pt, va); + if (*pt & PG_V) { + rtval = *ptep & PG_FRAME; + rtval |= va & PAGE_MASK; + } + } + } + } else { + /* + * User pages currently do not direct-map the page directory + * and some pages might not used managed PVs. But all PT's + * will have a PV. + */ + pt_pv = pv_find(pmap, pmap_pt_pindex(va)); + if (pt_pv) { + ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); + if (*ptep & PG_V) { + rtval = *ptep & PG_FRAME; + rtval |= va & PAGE_MASK; } + pv_drop(pt_pv); } } - lwkt_reltoken(&pmap->pm_token); return rtval; } @@ -900,15 +1066,15 @@ pmap_extract(pmap_t pmap, vm_offset_t va) vm_paddr_t pmap_kextract(vm_offset_t va) { - pd_entry_t pde; + pd_entry_t pt; /* pt entry in pd */ vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { - pde = *vtopde(va); - if (pde & PG_PS) { - pa = (pde & PG_PS_FRAME) | (va & PDRMASK); + pt = *vtopt(va); + if (pt & PG_PS) { + pa = (pt & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the @@ -918,7 +1084,7 @@ pmap_kextract(vm_offset_t va) * because the page table page is preserved by the * promotion. */ - pa = *pmap_pde_to_pte(&pde, va); + pa = *pmap_pt_to_pte(&pt, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } @@ -943,13 +1109,13 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa) pt_entry_t npte; pmap_inval_info info; - pmap_inval_init(&info); + pmap_inval_init(&info); /* XXX remove */ npte = pa | PG_RW | PG_V | pgeflag; pte = vtopte(va); - pmap_inval_interlock(&info, &kernel_pmap, va); + pmap_inval_interlock(&info, &kernel_pmap, va); /* XXX remove */ *pte = npte; - pmap_inval_deinterlock(&info, &kernel_pmap); - pmap_inval_done(&info); + pmap_inval_deinterlock(&info, &kernel_pmap); /* XXX remove */ + pmap_inval_done(&info); /* XXX remove */ } /* @@ -1019,14 +1185,14 @@ pmap_kremove_quick(vm_offset_t va) void pmap_kmodify_rw(vm_offset_t va) { - *vtopte(va) |= PG_RW; + atomic_set_long(vtopte(va), PG_RW); cpu_invlpg((void *)va); } void pmap_kmodify_nc(vm_offset_t va) { - *vtopte(va) |= PG_N; + atomic_set_long(vtopte(va), PG_N); cpu_invlpg((void *)va); } @@ -1114,27 +1280,6 @@ pmap_qremove(vm_offset_t va, int count) smp_invltlb(); } -/* - * This routine works like vm_page_lookup() but also blocks as long as the - * page is busy. This routine does not busy the page it returns. - * - * The call should be made with the governing object held so the page's - * object association remains valid on return. - * - * This function can block! 
- */ -static -vm_page_t -pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) -{ - vm_page_t m; - - ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); - m = vm_page_lookup_busy_wait(object, pindex, FALSE, "pplookp"); - - return(m); -} - /* * Create a new thread and optionally associate it with a (new) process. * NOTE! the new thread's cpu may not equal the current cpu. @@ -1167,110 +1312,6 @@ pmap_dispose_proc(struct proc *p) KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p)); } -/*************************************************** - * Page table page management routines..... - ***************************************************/ - -/* - * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. - * - * This routine reduces the wire_count on a page. If the wire_count - * would drop to zero we remove the PT, PD, or PDP from its parent page - * table. Under normal operation this only occurs with PT pages. - * - * mpte is never NULL for a user va, even for unmanaged pages. mpte should - * always be NULL for a kernel va. - */ -static __inline -int -pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t mpte, - pmap_inval_info_t info) -{ - if (mpte == NULL) - return 0; - if (!vm_page_unwire_quick(mpte)) - return 0; - - /* - * Wait until we can busy the page ourselves. We cannot have - * any active flushes if we block. We own one hold count on the - * page so it cannot be freed out from under us. - */ - vm_page_busy_wait(mpte, FALSE, "pmuwpt"); - KASSERT(mpte->queue == PQ_NONE, - ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", mpte)); - - /* - * New references can bump the wire_count while we were blocked, - * try to unwire quickly again (e.g. 2->1). - */ - if (vm_page_unwire_quick(mpte) == 0) { - vm_page_wakeup(mpte); - return 0; - } - - /* - * Unmap the page table page - */ - KKASSERT(mpte->wire_count == 1); - pmap_inval_interlock(info, pmap, -1); - - if (mpte->pindex >= (NUPDE + NUPDPE)) { - /* PDP page */ - pml4_entry_t *pml4; - pml4 = pmap_pml4e(pmap, va); - KKASSERT(*pml4); - *pml4 = 0; - } else if (mpte->pindex >= NUPDE) { - /* PD page */ - pdp_entry_t *pdp; - pdp = pmap_pdpe(pmap, va); - KKASSERT(*pdp); - *pdp = 0; - } else { - /* PT page */ - pd_entry_t *pd; - pd = pmap_pde(pmap, va); - KKASSERT(*pd); - *pd = 0; - } - - KKASSERT(pmap->pm_stats.resident_count > 0); - --pmap->pm_stats.resident_count; - - if (pmap->pm_ptphint == mpte) - pmap->pm_ptphint = NULL; - pmap_inval_deinterlock(info, pmap); - - if (mpte->pindex < NUPDE) { - /* We just released a PT, unhold the matching PD */ - vm_page_t pdpg; - - pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); - pmap_unwire_pte_hold(pmap, va, pdpg, info); - } - if (mpte->pindex >= NUPDE && mpte->pindex < (NUPDE + NUPDPE)) { - /* We just released a PD, unhold the matching PDP */ - vm_page_t pdppg; - - pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); - pmap_unwire_pte_hold(pmap, va, pdppg, info); - } - - /* - * This was our wiring. - */ - KKASSERT(mpte->flags & PG_UNMANAGED); - vm_page_unwire(mpte, 0); - KKASSERT(mpte->wire_count == 0); - vm_page_flag_clear(mpte, PG_MAPPED | PG_WRITEABLE); - vm_page_flash(mpte); - vm_page_free_zero(mpte); - - return 1; -} - /* * Initialize pmap0/vmspace0. This pmap is not added to pmap_list because * it, and IdlePTD, represents the template used to update all other pmaps. 
@@ -1285,10 +1326,8 @@ pmap_pinit0(struct pmap *pmap) pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); pmap->pm_count = 1; pmap->pm_active = 0; - pmap->pm_ptphint = NULL; - TAILQ_INIT(&pmap->pm_pvlist); - TAILQ_INIT(&pmap->pm_pvlist_free); - pmap->pm_hold = 0; + pmap->pm_pvhint = NULL; + RB_INIT(&pmap->pm_pvroot); spin_init(&pmap->pm_spin); lwkt_token_init(&pmap->pm_token, "pmap_tok"); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); @@ -1301,7 +1340,20 @@ pmap_pinit0(struct pmap *pmap) void pmap_pinit(struct pmap *pmap) { - vm_page_t pml4pg; + pv_entry_t pv; + + /* + * Misc initialization + */ + pmap->pm_count = 1; + pmap->pm_active = 0; + pmap->pm_pvhint = NULL; + if (pmap->pm_pmlpv == NULL) { + RB_INIT(&pmap->pm_pvroot); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + spin_init(&pmap->pm_spin); + lwkt_token_init(&pmap->pm_token, "pmap_tok"); + } /* * No need to allocate page table space yet but we do need a valid @@ -1313,56 +1365,27 @@ pmap_pinit(struct pmap *pmap) } /* - * Allocate an object for the ptes - */ - if (pmap->pm_pteobj == NULL) { - pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, - NUPDE + NUPDPE + PML4PML4I + 1); - } - - /* - * Allocate the page directory page, unless we already have - * one cached. If we used the cached page the wire_count will - * already be set appropriately. + * Allocate the page directory page, which wires it even though + * it isn't being entered into some higher level page table (it + * being the highest level). If one is already cached we don't + * have to do anything. */ - if ((pml4pg = pmap->pm_pdirm) == NULL) { - pml4pg = vm_page_grab(pmap->pm_pteobj, - NUPDE + NUPDPE + PML4PML4I, - VM_ALLOC_NORMAL | VM_ALLOC_RETRY); - pmap->pm_pdirm = pml4pg; - vm_page_unmanage(pml4pg); - vm_page_flag_clear(pml4pg, PG_MAPPED); - pml4pg->valid = VM_PAGE_BITS_ALL; - vm_page_wire(pml4pg); - vm_page_wakeup(pml4pg); + if ((pv = pmap->pm_pmlpv) == NULL) { + pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); + pmap->pm_pmlpv = pv; pmap_kenter((vm_offset_t)pmap->pm_pml4, - VM_PAGE_TO_PHYS(pml4pg)); + VM_PAGE_TO_PHYS(pv->pv_m)); + pv_put(pv); + pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; + pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; + + /* install self-referential address mapping entry */ + pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | + PG_V | PG_RW | PG_A | PG_M; + } else { + KKASSERT(pv->pv_m->flags & PG_MAPPED); + KKASSERT(pv->pv_m->flags & PG_WRITEABLE); } - if ((pml4pg->flags & PG_ZERO) == 0) - bzero(pmap->pm_pml4, PAGE_SIZE); -#ifdef PMAP_DEBUG - else - pmap_page_assertzero(VM_PAGE_TO_PHYS(pml4pg)); -#endif - vm_page_flag_clear(pml4pg, PG_ZERO); - - pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; - pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; - - /* install self-referential address mapping entry */ - pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | - PG_V | PG_RW | PG_A | PG_M; - - pmap->pm_count = 1; - pmap->pm_active = 0; - pmap->pm_ptphint = NULL; - TAILQ_INIT(&pmap->pm_pvlist); - TAILQ_INIT(&pmap->pm_pvlist_free); - pmap->pm_hold = 0; - spin_init(&pmap->pm_spin); - lwkt_token_init(&pmap->pm_token, "pmap_tok"); - bzero(&pmap->pm_stats, sizeof pmap->pm_stats); - pmap->pm_stats.resident_count = 1; } /* @@ -1374,28 +1397,36 @@ pmap_pinit(struct pmap *pmap) void pmap_puninit(pmap_t pmap) { + pv_entry_t pv; vm_page_t p; KKASSERT(pmap->pm_active == 0); - if ((p = pmap->pm_pdirm) != NULL) { - KKASSERT(pmap->pm_pml4 != NULL); - KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); + if 
((pv = pmap->pm_pmlpv) != NULL) { + if (pv_hold_try(pv) == 0) + pv_lock(pv); + p = pmap_remove_pv_page(pv, 1); + pv_free(pv); pmap_kremove((vm_offset_t)pmap->pm_pml4); vm_page_busy_wait(p, FALSE, "pgpun"); - KKASSERT(p->flags & PG_UNMANAGED); + vm_page_unhold(p); + KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED)); vm_page_unwire(p, 0); - vm_page_free_zero(p); - pmap->pm_pdirm = NULL; + vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); + + /* + * XXX eventually clean out PML4 static entries and + * use vm_page_free_zero() + */ + vm_page_free(p); + pmap->pm_pmlpv = NULL; } if (pmap->pm_pml4) { KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); pmap->pm_pml4 = NULL; } - if (pmap->pm_pteobj) { - vm_object_deallocate(pmap->pm_pteobj); - pmap->pm_pteobj = NULL; - } + KKASSERT(pmap->pm_stats.resident_count == 0); + KKASSERT(pmap->pm_stats.wired_count == 0); } /* @@ -1416,482 +1447,507 @@ pmap_pinit2(struct pmap *pmap) } /* - * Attempt to release and free a vm_page in a pmap. Returns 1 on success, - * 0 on failure (if the procedure had to sleep). + * This routine is called when various levels in the page table need to + * be populated. This routine cannot fail. * - * When asked to remove the page directory page itself, we actually just - * leave it cached so we do not have to incur the SMP inval overhead of - * removing the kernel mapping. pmap_puninit() will take care of it. + * This function returns two locked pv_entry's, one representing the + * requested pv and one representing the requested pv's parent pv. If + * the pv did not previously exist it will be mapped into its parent + * and wired, otherwise no additional wire count will be added. */ static -int -pmap_release_free_page(struct pmap *pmap, vm_page_t p) +pv_entry_t +pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) { + pt_entry_t *ptep; + pv_entry_t pv; + pv_entry_t pvp; + vm_pindex_t pt_pindex; + vm_page_t m; + int isnew; + /* - * This code optimizes the case of freeing non-busy - * page-table pages. Those pages are zero now, and - * might as well be placed directly into the zero queue. + * If the pv already exists and we aren't being asked for the + * parent page table page we can just return it. A locked+held pv + * is returned. */ - if (vm_page_busy_try(p, FALSE)) { - vm_page_sleep_busy(p, FALSE, "pmaprl"); - return 0; + pv = pv_alloc(pmap, ptepindex, &isnew); + if (isnew == 0 && pvpp == NULL) + return(pv); + + /* + * This is a new PV, we have to resolve its parent page table and + * add an additional wiring to the page if necessary. + */ + + /* + * Special case terminal PVs. These are not page table pages so + * no vm_page is allocated (the caller supplied the vm_page). If + * pvpp is non-NULL we are being asked to also removed the pt_pv + * for this pv. + * + * Note that pt_pv's are only returned for user VAs. We assert that + * a pt_pv is not being requested for kernel VAs. + */ + if (ptepindex < pmap_pt_pindex(0)) { + if (ptepindex >= NUPTE_USER) + KKASSERT(pvpp == NULL); + else + KKASSERT(pvpp != NULL); + if (pvpp) { + pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT); + pvp = pmap_allocpte(pmap, pt_pindex, NULL); + if (isnew) + vm_page_wire_quick(pvp->pv_m); + *pvpp = pvp; + } else { + pvp = NULL; + } + return(pv); } /* - * Remove the page table page from the processes address space. 
+ * Non-terminal PVs allocate a VM page to represent the page table, + * so we have to resolve pvp and calculate ptepindex for the pvp + * and then for the page table entry index in the pvp for + * fall-through. */ - if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { + if (ptepindex < pmap_pd_pindex(0)) { /* - * We are the pml4 table itself. + * pv is PT, pvp is PD */ - /* XXX anything to do here? */ - } else if (p->pindex >= (NUPDE + NUPDPE)) { + ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; + ptepindex += NUPTE_TOTAL + NUPT_TOTAL; + pvp = pmap_allocpte(pmap, ptepindex, NULL); + if (!isnew) + goto notnew; + /* - * Remove a PDP page from the PML4. We do not maintain - * wire counts on the PML4 page. + * PT index in PD */ - pml4_entry_t *pml4; - vm_page_t m4; - int idx; - - m4 = vm_page_lookup(pmap->pm_pteobj, - NUPDE + NUPDPE + PML4PML4I); - KKASSERT(m4 != NULL); - pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); - idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG; - KKASSERT(pml4[idx] != 0); - pml4[idx] = 0; - } else if (p->pindex >= NUPDE) { + ptepindex = pv->pv_pindex - pmap_pt_pindex(0); + ptepindex &= ((1ul << NPDEPGSHIFT) - 1); + } else if (ptepindex < pmap_pdp_pindex(0)) { /* - * Remove a PD page from the PDP and drop the wire count - * on the PDP. The PDP has a wire_count just from being - * mapped so the wire_count should never drop to 0 here. + * pv is PD, pvp is PDP */ - vm_page_t m3; - pdp_entry_t *pdp; - int idx; - - m3 = vm_page_lookup(pmap->pm_pteobj, - NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG); - KKASSERT(m3 != NULL); - pdp = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); - idx = (p->pindex - NUPDE) % NPDPEPG; - KKASSERT(pdp[idx] != 0); - pdp[idx] = 0; - if (vm_page_unwire_quick(m3)) - panic("pmap_release_free_page: m3 wire_count 1->0"); - } else { + ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; + ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; + pvp = pmap_allocpte(pmap, ptepindex, NULL); + if (!isnew) + goto notnew; + + /* + * PD index in PDP + */ + ptepindex = pv->pv_pindex - pmap_pd_pindex(0); + ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); + } else if (ptepindex < pmap_pml4_pindex()) { /* - * Remove a PT page from the PD and drop the wire count - * on the PD. The PD has a wire_count just from being - * mapped so the wire_count should never drop to 0 here. + * pv is PDP, pvp is the root pml4 table */ - vm_page_t m2; - pd_entry_t *pd; - int idx; + pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); + if (!isnew) + goto notnew; - m2 = vm_page_lookup(pmap->pm_pteobj, - NUPDE + p->pindex / NPDEPG); - KKASSERT(m2 != NULL); - pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); - idx = p->pindex % NPDEPG; - pd[idx] = 0; - if (vm_page_unwire_quick(m2)) - panic("pmap_release_free_page: m2 wire_count 1->0"); + /* + * PDP index in PML4 + */ + ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); + ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); + } else { + /* + * pv represents the top-level PML4, there is no parent. + */ + pvp = NULL; + if (!isnew) + goto notnew; } /* - * p's wire_count should be transitioning from 1 to 0 here. - */ - KKASSERT(p->wire_count == 1); - KKASSERT(p->flags & PG_UNMANAGED); - KKASSERT(pmap->pm_stats.resident_count > 0); - vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); - --pmap->pm_stats.resident_count; - if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) - pmap->pm_ptphint = NULL; - - /* - * We leave the top-level page table page cached, wired, and mapped in - * the pmap until the dtor function (pmap_puninit()) gets called. 
- * However, still clean it up so we can set PG_ZERO. + * This code is only reached if isnew is TRUE and this is not a + * terminal PV. We need to allocate a vm_page for the page table + * at this level and enter it into the parent page table. + * + * page table pages are marked PG_WRITEABLE and PG_MAPPED. */ - if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { - bzero(pmap->pm_pml4, PAGE_SIZE); - vm_page_flag_set(p, PG_ZERO); - vm_page_wakeup(p); - } else { - vm_page_unwire(p, 0); - KKASSERT(p->wire_count == 0); - /* JG eventually revert to using vm_page_free_zero() */ - vm_page_free(p); + for (;;) { + m = vm_page_alloc(NULL, pv->pv_pindex, + VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | + VM_ALLOC_INTERRUPT); + if (m) + break; + vm_wait(0); } - return 1; -} + vm_page_spin_lock(m); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); + pv->pv_m = m; + vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); + vm_page_spin_unlock(m); + vm_page_unmanage(m); /* m must be spinunlocked */ + + if ((m->flags & PG_ZERO) == 0) { + pmap_zero_page(VM_PAGE_TO_PHYS(m)); + } +#ifdef PMAP_DEBUG + else { + pmap_page_assertzero(VM_PAGE_TO_PHYS(m)); + } +#endif + m->valid = VM_PAGE_BITS_ALL; + vm_page_flag_clear(m, PG_ZERO); + vm_page_wire(m); /* wire for mapping in parent */ + + /* + * Wire the page into pvp, bump the wire-count for pvp's page table + * page. Bump the resident_count for the pmap. There is no pvp + * for the top level, address the pm_pml4[] array directly. + * + * If the caller wants the parent we return it, otherwise + * we just put it away. + * + * No interlock is needed for pte 0 -> non-zero. + */ + if (pvp) { + vm_page_wire_quick(pvp->pv_m); + ptep = pv_pte_lookup(pvp, ptepindex); + KKASSERT((*ptep & PG_V) == 0); + *ptep = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V | + PG_A | PG_M); + } + vm_page_wakeup(m); +notnew: + if (pvpp) + *pvpp = pvp; + else if (pvp) + pv_put(pvp); + return (pv); +} /* - * This routine is called when various levels in the page table need to - * be populated. This routine cannot fail. + * Release any resources held by the given physical map. + * + * Called when a pmap initialized by pmap_pinit is being released. Should + * only be called if the map contains no valid mappings. * - * We returned a page wired for the caller. If we had to map the page into - * a parent page table it will receive an additional wire_count. For example, - * an empty page table directory which is still mapped into its pdp will - * retain a wire_count of 1. + * Caller must hold pmap->pm_token */ -static -vm_page_t -_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) +struct pmap_release_info { + pmap_t pmap; + int retry; +}; + +static int pmap_release_callback(pv_entry_t pv, void *data); + +void +pmap_release(struct pmap *pmap) { - vm_page_t m; + struct pmap_release_info info; + + KASSERT(pmap->pm_active == 0, + ("pmap still active! %016jx", (uintmax_t)pmap->pm_active)); +#if defined(DIAGNOSTIC) + if (object->ref_count != 1) + panic("pmap_release: pteobj reference count != 1"); +#endif + + spin_lock(&pmap_spin); + TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode); + spin_unlock(&pmap_spin); /* - * Find or fabricate a new pagetable page. This will busy the page. + * Pull pv's off the RB tree in order from low to high and release + * each page. 
*/ - m = vm_page_grab(pmap->pm_pteobj, ptepindex, - VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); + info.pmap = pmap; + do { + info.retry = 0; + spin_lock(&pmap->pm_spin); + RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, + pmap_release_callback, &info); + spin_unlock(&pmap->pm_spin); + } while (info.retry); + /* - * The grab may have blocked and raced another thread populating - * the same page table. m->valid will be 0 on a newly allocated page - * so use this to determine if we have to zero it out or not. We - * don't want to zero-out a raced page as this would desynchronize - * the pv_entry's for the related pte's and cause pmap_remove_all() - * to panic. - * - * Page table pages are unmanaged (do not use the normal PQ_s) + * One resident page (the pml4 page) should remain. + * No wired pages should remain. */ - if (m->valid == 0) { - vm_page_unmanage(m); - if ((m->flags & PG_ZERO) == 0) { - pmap_zero_page(VM_PAGE_TO_PHYS(m)); - } -#ifdef PMAP_DEBUG - else { - pmap_page_assertzero(VM_PAGE_TO_PHYS(m)); + KKASSERT(pmap->pm_stats.resident_count == 1); + KKASSERT(pmap->pm_stats.wired_count == 0); +} + +static int +pmap_release_callback(pv_entry_t pv, void *data) +{ + struct pmap_release_info *info = data; + pmap_t pmap = info->pmap; + vm_page_t p; + + if (pv_hold_try(pv)) { + spin_unlock(&pmap->pm_spin); + } else { + spin_unlock(&pmap->pm_spin); + pv_lock(pv); + if (pv->pv_pmap != pmap) { + pv_put(pv); + spin_lock(&pmap->pm_spin); + info->retry = 1; + return(-1); } -#endif - m->valid = VM_PAGE_BITS_ALL; - vm_page_flag_clear(m, PG_ZERO); } -#ifdef PMAP_DEBUG - else { - KKASSERT((m->flags & PG_ZERO) == 0); - } -#endif - KASSERT(m->queue == PQ_NONE, - ("_pmap_allocpte: %p->queue != PQ_NONE", m)); + /* + * The pmap is currently not spinlocked, pv is held+locked. + * Remove the pv's page from its parent's page table. The + * parent's page table page's wire_count will be decremented. + */ + pmap_remove_pv_pte(pv, NULL, NULL); /* - * Increment the wire_count for the page we will be returning to - * the caller. + * Terminal pvs are unhooked from their vm_pages. Because + * terminal pages aren't page table pages they aren't wired + * by us, so we have to be sure not to unwire them either. */ - vm_page_wire(m); + if (pv->pv_pindex < pmap_pt_pindex(0)) { + pmap_remove_pv_page(pv, 0); + goto skip; + } /* - * Map the pagetable page into the process address space, if - * it isn't already there. + * We leave the top-level page table page cached, wired, and + * mapped in the pmap until the dtor function (pmap_puninit()) + * gets called. * - * It is possible that someone else got in and mapped the page - * directory page while we were blocked, if so just unbusy and - * return the held page. + * Since we are leaving the top-level pv intact we need + * to break out of what would otherwise be an infinite loop. */ - if (ptepindex >= (NUPDE + NUPDPE)) { - /* - * Wire up a new PDP page in the PML4. - * - * (m) is busied so we cannot race another thread trying - * to map the PDP entry in the PML4. 
- */ - vm_pindex_t pml4index; - pml4_entry_t *pml4; - - pml4index = ptepindex - (NUPDE + NUPDPE); - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { - *pml4 = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V | - PG_A | PG_M); - ++pmap->pm_stats.resident_count; - vm_page_wire_quick(m); /* wire for mapping */ - } - /* return (m) wired for the caller */ - } else if (ptepindex >= NUPDE) { - /* - * Wire up a new PD page in the PDP - */ - vm_pindex_t pml4index; - vm_pindex_t pdpindex; - vm_page_t pdppg; - pml4_entry_t *pml4; - pdp_entry_t *pdp; + if (pv->pv_pindex == pmap_pml4_pindex()) { + pv_put(pv); + spin_lock(&pmap->pm_spin); + return(-1); + } + + /* + * For page table pages (other than the top-level page), + * remove and free the vm_page. The representitive mapping + * removed above by pmap_remove_pv_pte() did not undo the + * last wire_count so we have to do that as well. + */ + p = pmap_remove_pv_page(pv, 1); + vm_page_busy_wait(p, FALSE, "pmaprl"); + vm_page_unhold(p); + if (p->wire_count != 1) { + kprintf("p->wire_count was %016lx %d\n", + pv->pv_pindex, p->wire_count); + } + KKASSERT(p->wire_count == 1); + KKASSERT(p->flags & PG_UNMANAGED); + + vm_page_unwire(p, 0); + KKASSERT(p->wire_count == 0); + /* JG eventually revert to using vm_page_free_zero() */ + vm_page_free(p); +skip: + pv_free(pv); + spin_lock(&pmap->pm_spin); + return(0); +} + +/* + * This function will remove the pte associated with a pv from its parent. + * Terminal pv's are supported. The removal will be interlocked if info + * is non-NULL. The caller must dispose of pv instead of just unlocking + * it. + * + * The wire count will be dropped on the parent page table. The wire + * count on the page being removed (pv->pv_m) from the parent page table + * is NOT touched. Note that terminal pages will not have any additional + * wire counts while page table pages will have at least one representing + * the mapping, plus others representing sub-mappings. + * + * NOTE: Cannot be called on kernel page table pages, only KVM terminal + * pages and user page table and terminal pages. + * + * The pv must be locked. + * + * XXX must lock parent pv's if they exist to remove pte XXX + */ +static +void +pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info) +{ + vm_pindex_t ptepindex = pv->pv_pindex; + pmap_t pmap = pv->pv_pmap; + vm_page_t p; + int gotpvp = 0; - pdpindex = ptepindex - NUPDE; - pml4index = pdpindex >> NPML4EPGSHIFT; + KKASSERT(pmap); + if (ptepindex == pmap_pml4_pindex()) { /* - * Once mapped the PDP is not unmapped during normal operation - * so we only need to handle races in the unmapped case. - * - * Mapping a PD into the PDP requires an additional wiring - * of the PDP. + * We are the top level pml4 table, there is no parent. */ - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { - pdppg = _pmap_allocpte(pmap, - NUPDE + NUPDPE + pml4index); - /* pdppg wired for the map and also wired for return */ - } else { - pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); - vm_page_wire_quick(pdppg); - } - /* we have an extra ref on pdppg now for our use */ - + p = pmap->pm_pmlpv->pv_m; + } else if (ptepindex >= pmap_pdp_pindex(0)) { /* - * Now find the PD entry in the PDP and map it. - * - * (m) is busied so we cannot race another thread trying - * to map the PD entry in the PDP. - * - * If the PD entry is already mapped we have to drop one - * wire count on the pdppg that we had bumped above. + * Remove a PDP page from the pml4e. This can only occur + * with user page tables. 
We do not have to lock the + * pml4 PV so just ignore pvp. */ - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; - - if ((*pdp & PG_V) == 0) { - *pdp = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V | - PG_A | PG_M); - vm_page_wire_quick(m); /* wire for mapping */ - ++pmap->pm_stats.resident_count; - /* eat extra pdppg wiring for mapping */ - } else { - if (vm_page_unwire_quick(pdppg)) - panic("pmap_allocpte: unwire case 1"); + vm_pindex_t pml4_pindex; + vm_pindex_t pdp_index; + pml4_entry_t *pdp; + + pdp_index = ptepindex - pmap_pdp_pindex(0); + if (pvp == NULL) { + pml4_pindex = pmap_pml4_pindex(); + pvp = pv_get(pv->pv_pmap, pml4_pindex); + gotpvp = 1; } - /* return (m) wired for the caller */ - } else { + pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; + KKASSERT((*pdp & PG_V) != 0); + p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); + *pdp = 0; + KKASSERT(info == NULL); + } else if (ptepindex >= pmap_pd_pindex(0)) { /* - * Wire up the new PT page in the PD + * Remove a PD page from the pdp */ - vm_pindex_t pml4index; - vm_pindex_t pdpindex; - pml4_entry_t *pml4; - pdp_entry_t *pdp; - pd_entry_t *pd; - vm_page_t pdppg; - vm_page_t pdpg; + vm_pindex_t pdp_pindex; + vm_pindex_t pd_index; + pdp_entry_t *pd; - pdpindex = ptepindex >> NPDPEPGSHIFT; - pml4index = pdpindex >> NPML4EPGSHIFT; + pd_index = ptepindex - pmap_pd_pindex(0); + if (pvp == NULL) { + pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + + (pd_index >> NPML4EPGSHIFT); + pvp = pv_get(pv->pv_pmap, pdp_pindex); + gotpvp = 1; + } + pd = pv_pte_lookup(pvp, pd_index & ((1ul << NPDPEPGSHIFT) - 1)); + KKASSERT((*pd & PG_V) != 0); + p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); + *pd = 0; + KKASSERT(info == NULL); + } else if (ptepindex >= pmap_pt_pindex(0)) { /* - * Locate the PDP page in the PML4 - * - * Once mapped the PDP is not unmapped during normal operation - * so we only need to handle races in the unmapped case. + * Remove a PT page from the pd */ - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { - pdppg = _pmap_allocpte(pmap, NUPDE + pdpindex); - } else { - pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); - vm_page_wire_quick(pdppg); - } - /* we have an extra ref on pdppg now for our use */ + vm_pindex_t pd_pindex; + vm_pindex_t pt_index; + pd_entry_t *pt; + pt_index = ptepindex - pmap_pt_pindex(0); + + if (pvp == NULL) { + pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + + (pt_index >> NPDPEPGSHIFT); + pvp = pv_get(pv->pv_pmap, pd_pindex); + gotpvp = 1; + } + pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); + KKASSERT((*pt & PG_V) != 0); + p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); + *pt = 0; + KKASSERT(info == NULL); + } else { /* - * Locate the PD page in the PDP - * - * Once mapped the PDP is not unmapped during normal operation - * so we only need to handle races in the unmapped case. + * Remove a PTE from the PT page * - * We can scrap the extra reference on pdppg not needed if - * *pdp is already mapped and also not needed if it wasn't - * because the _pmap_allocpte() picked up the case for us. + * NOTE: pv's must be locked bottom-up to avoid deadlocking. + * pv is a pte_pv so we can safely lock pt_pv. 
*/ - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; + vm_pindex_t pt_pindex; + pt_entry_t *ptep; + pt_entry_t pte; + vm_offset_t va; - if ((*pdp & PG_V) == 0) { - pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); + pt_pindex = ptepindex >> NPTEPGSHIFT; + va = (vm_offset_t)ptepindex << PAGE_SHIFT; + + if (ptepindex >= NUPTE_USER) { + ptep = vtopte(ptepindex << PAGE_SHIFT); + KKASSERT(pvp == NULL); } else { - pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); - vm_page_wire_quick(pdpg); + if (pvp == NULL) { + pt_pindex = NUPTE_TOTAL + + (ptepindex >> NPDPEPGSHIFT); + pvp = pv_get(pv->pv_pmap, pt_pindex); + gotpvp = 1; + } + ptep = pv_pte_lookup(pvp, ptepindex & + ((1ul << NPDPEPGSHIFT) - 1)); } - vm_page_unwire_quick(pdppg); - /* we have an extra ref on pdpg now for our use */ + + if (info) + pmap_inval_interlock(info, pmap, va); + pte = pte_load_clear(ptep); + if (info) + pmap_inval_deinterlock(info, pmap); /* - * Locate the PT page in the PD. - * - * (m) is busied so we cannot race another thread trying - * to map the PT page in the PD. + * Now update the vm_page_t */ - pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); - pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; - if ((*pd & PG_V) == 0) { - *pd = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V | - PG_A | PG_M); - ++pmap->pm_stats.resident_count; - vm_page_wire_quick(m); /* wire for mapping */ - /* eat extra pdpg wiring for mapping */ - } else { - if (vm_page_unwire_quick(pdpg)) - panic("pmap_allocpte: unwire case 2"); + if ((pte & (PG_MANAGED|PG_V)) != (PG_MANAGED|PG_V)) { + kprintf("remove_pte badpte %016lx %016lx %d\n", + pte, pv->pv_pindex, + pv->pv_pindex < pmap_pt_pindex(0)); + } + /*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/ + p = PHYS_TO_VM_PAGE(pte & PG_FRAME); + + if (pte & PG_M) { + if (pmap_track_modified(ptepindex)) + vm_page_dirty(p); + } + if (pte & PG_A) { + vm_page_flag_set(p, PG_REFERENCED); } - /* return (m) wired for the caller */ + if (pte & PG_W) + atomic_add_long(&pmap->pm_stats.wired_count, -1); + if (pte & PG_G) + cpu_invlpg((void *)va); } /* - * We successfully loaded a PDP, PD, or PTE. Set the page table hint, - * valid bits, mapped flag, unbusy, and we're done. + * Unwire the parent page table page. The wire_count cannot go below + * 1 here because the parent page table page is itself still mapped. + * + * XXX remove the assertions later. */ - pmap->pm_ptphint = m; - -#if 0 - m->valid = VM_PAGE_BITS_ALL; - vm_page_flag_clear(m, PG_ZERO); -#endif - vm_page_flag_set(m, PG_MAPPED); - vm_page_wakeup(m); + KKASSERT(pv->pv_m == p); + if (pvp && vm_page_unwire_quick(pvp->pv_m)) + panic("pmap_remove_pv_pte: Insufficient wire_count"); - return (m); + if (gotpvp) + pv_put(pvp); } static vm_page_t -pmap_allocpte(pmap_t pmap, vm_offset_t va) +pmap_remove_pv_page(pv_entry_t pv, int holdpg) { - vm_pindex_t ptepindex; - pd_entry_t *pd; vm_page_t m; - ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj)); - - /* - * Calculate pagetable page index - */ - ptepindex = pmap_pde_pindex(va); - - /* - * Get the page directory entry - */ - pd = pmap_pde(pmap, va); - - /* - * This supports switching from a 2MB page to a - * normal 4K page. - */ - if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { - panic("no promotion/demotion yet"); - *pd = 0; - pd = NULL; - cpu_invltlb(); - smp_invltlb(); - } - - /* - * If the page table page is mapped, we just increment the - * wire count, and activate it. 
- */ - if (pd != NULL && (*pd & PG_V) != 0) { - m = pmap_page_lookup(pmap->pm_pteobj, ptepindex); - pmap->pm_ptphint = m; - vm_page_wire_quick(m); - vm_page_wakeup(m); - return m; - } + m = pv->pv_m; + if (holdpg) + vm_page_hold(m); + KKASSERT(m); + vm_page_spin_lock(m); + pv->pv_m = NULL; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); /* - * Here if the pte page isn't mapped, or if it has been deallocated. - */ - return _pmap_allocpte(pmap, ptepindex); -} - - -/*************************************************** - * Pmap allocation/deallocation routines. - ***************************************************/ - -/* - * Release any resources held by the given physical map. - * Called when a pmap initialized by pmap_pinit is being released. - * Should only be called if the map contains no valid mappings. - * - * Caller must hold pmap->pm_token - */ -static int pmap_release_callback(struct vm_page *p, void *data); - -static __inline -void -pmap_auto_yield(struct rb_vm_page_scan_info *info) -{ - if (++info->desired >= pmap_yield_count) { - info->desired = 0; - lwkt_yield(); - } -} - -void -pmap_release(struct pmap *pmap) -{ - vm_object_t object = pmap->pm_pteobj; - struct rb_vm_page_scan_info info; - - KASSERT(pmap->pm_active == 0, - ("pmap still active! %016jx", (uintmax_t)pmap->pm_active)); -#if defined(DIAGNOSTIC) - if (object->ref_count != 1) - panic("pmap_release: pteobj reference count != 1"); -#endif - - info.pmap = pmap; - info.object = object; - - spin_lock(&pmap_spin); - TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode); - spin_unlock(&pmap_spin); - - info.desired = 0; - vm_object_hold(object); - do { - info.error = 0; - info.mpte = NULL; - info.limit = object->generation; - - vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, - pmap_release_callback, &info); - if (info.error == 0 && info.mpte) { - if (!pmap_release_free_page(pmap, info.mpte)) - info.error = 1; - } - } while (info.error); - vm_object_drop(object); - - while (pmap->pm_hold) - tsleep(pmap, 0, "pmapx", 1); -} - -static -int -pmap_release_callback(struct vm_page *p, void *data) -{ - struct rb_vm_page_scan_info *info = data; - - if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { - info->mpte = p; - return(0); - } - if (!pmap_release_free_page(info->pmap, p)) { - info->error = 1; - pmap_auto_yield(info); - return(-1); - } - if (info->object->generation != info->limit) { - info->error = 1; - pmap_auto_yield(info); - return(-1); - } - return(0); + if (m->object) + atomic_add_int(&m->object->agg_pv_list_count, -1); + */ + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); + vm_page_spin_unlock(m); + if (holdpg) + return(m); + return(NULL); } /* @@ -1907,19 +1963,17 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) vm_paddr_t paddr; vm_offset_t ptppaddr; vm_page_t nkpg; - pd_entry_t *pde, newpdir; - pdp_entry_t newpdp; + pd_entry_t *pt, newpt; + pdp_entry_t newpd; int update_kernel_vm_end; - vm_object_hold(kptobj); - /* * bootstrap kernel_vm_end on first real VM use */ if (kernel_vm_end == 0) { kernel_vm_end = VM_MIN_KERNEL_ADDRESS; nkpt = 0; - while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & PG_V) != 0) { + while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; @@ -1951,10 +2005,10 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) kend = kernel_map.max_offset; while (kstart < kend) { - pde = pmap_pde(&kernel_pmap, kstart); - if (pde == NULL) { + pt = pmap_pt(&kernel_pmap, kstart); + if (pt == NULL) { 
/* We need a new PDP entry */ - nkpg = vm_page_alloc(kptobj, nkpt, + nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT); @@ -1966,13 +2020,13 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(paddr); vm_page_flag_clear(nkpg, PG_ZERO); - newpdp = (pdp_entry_t) + newpd = (pdp_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); - *pmap_pdpe(&kernel_pmap, kstart) = newpdp; + *pmap_pd(&kernel_pmap, kstart) = newpd; nkpt++; continue; /* try again */ } - if ((*pde & PG_V) != 0) { + if ((*pt & PG_V) != 0) { kstart = (kstart + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kstart - 1 >= kernel_map.max_offset) { @@ -1985,7 +2039,7 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) /* * This index is bogus, but out of the way */ - nkpg = vm_page_alloc(kptobj, nkpt, + nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT); @@ -1996,8 +2050,8 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); vm_page_flag_clear(nkpg, PG_ZERO); - newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); - *pmap_pde(&kernel_pmap, kstart) = newpdir; + newpt = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); + *pmap_pt(&kernel_pmap, kstart) = newpt; nkpt++; kstart = (kstart + PAGE_SIZE * NPTEPG) & @@ -2014,8 +2068,6 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) */ if (update_kernel_vm_end && kernel_vm_end < kstart) kernel_vm_end = kstart; - - vm_object_drop(kptobj); } /* @@ -2054,302 +2106,546 @@ pmap_reference(pmap_t pmap) } /*************************************************** -* page management routines. + * page management routines. ***************************************************/ /* - * free the pv_entry back to the free list. This function may be - * called from an interrupt. + * Hold a pv without locking it */ -static __inline -void -free_pv_entry(pv_entry_t pv) +static void +pv_hold(pv_entry_t pv) { - atomic_add_int(&pv_entry_count, -1); - KKASSERT(pv_entry_count >= 0); - zfree(pvzone, pv); + u_int count; + + if (atomic_cmpset_int(&pv->pv_hold, 0, 1)) + return; + + for (;;) { + count = pv->pv_hold; + cpu_ccfence(); + if (atomic_cmpset_int(&pv->pv_hold, count, count + 1)) + return; + /* retry */ + } } /* - * get a new pv_entry, allocating a block from the system - * when needed. This function may be called from an interrupt. + * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv + * was successfully locked, FALSE if it wasn't. The caller must dispose of + * the pv properly. + * + * Either the pmap->pm_spin or the related vm_page_spin (if traversing a + * pv list via its page) must be held by the caller. 
*/ -static -pv_entry_t -get_pv_entry(void) +static int +_pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) { - atomic_add_int(&pv_entry_count, 1); - if (pv_entry_high_water && - (pv_entry_count > pv_entry_high_water) && - (pmap_pagedaemon_waken == 0)) { - pmap_pagedaemon_waken = 1; - wakeup(&vm_pages_needed); + u_int count; + + if (atomic_cmpset_int(&pv->pv_hold, 0, PV_HOLD_LOCKED | 1)) { +#ifdef PMAP_DEBUG + pv->pv_func = func; + pv->pv_line = lineno; +#endif + return TRUE; + } + + for (;;) { + count = pv->pv_hold; + cpu_ccfence(); + if ((count & PV_HOLD_LOCKED) == 0) { + if (atomic_cmpset_int(&pv->pv_hold, count, + (count + 1) | PV_HOLD_LOCKED)) { +#ifdef PMAP_DEBUG + pv->pv_func = func; + pv->pv_line = lineno; +#endif + return TRUE; + } + } else { + if (atomic_cmpset_int(&pv->pv_hold, count, count + 1)) + return FALSE; + } + /* retry */ } - return zalloc(pvzone); } /* - * This routine is very drastic, but can save the system - * in a pinch. + * Drop a previously held pv_entry which could not be locked, allowing its + * destruction. + * + * Must not be called with a spinlock held as we might zfree() the pv if it + * is no longer associated with a pmap and this was the last hold count. */ -void -pmap_collect(void) +static void +pv_drop(pv_entry_t pv) { - int i; - vm_page_t m; - static int warningdone=0; + u_int count; - if (pmap_pagedaemon_waken == 0) + if (atomic_cmpset_int(&pv->pv_hold, 1, 0)) { + if (pv->pv_pmap == NULL) + zfree(pvzone, pv); return; - pmap_pagedaemon_waken = 0; - if (warningdone < 5) { - kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); - warningdone++; } - for (i = 0; i < vm_page_array_size; i++) { - m = &vm_page_array[i]; - if (m->wire_count || m->hold_count) - continue; - if (vm_page_busy_try(m, TRUE) == 0) { - if (m->wire_count == 0 && m->hold_count == 0) { - pmap_remove_all(m); - } - vm_page_wakeup(m); + for (;;) { + count = pv->pv_hold; + cpu_ccfence(); + KKASSERT((count & PV_HOLD_MASK) > 0); + KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != + (PV_HOLD_LOCKED | 1)); + if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { + if (count == 1 && pv->pv_pmap == NULL) + zfree(pvzone, pv); + return; } + /* retry */ } } - /* - * If it is the first entry on the list, it is actually in the header and - * we must copy the following entry up to the header. - * - * Otherwise we must search the list for the entry. In either case we - * free the now unused entry. 
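
The pv_hold()/_pv_hold_try()/pv_drop() routines above all drive a single
word that packs a reference (hold) count together with a lock bit, using
compare-and-swap loops so the common hold/drop paths need no spinlock at
all.  A self-contained C11 sketch of the same protocol; the struct, the
names and the bit layout are made up for illustration and are not the
kernel's definitions:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define HOLD_LOCKED 0x80000000u     /* bit layout chosen for the sketch */
    #define HOLD_MASK   0x3fffffffu     /* reference (hold) count */

    struct obj {
        _Atomic unsigned hold;          /* count + lock bit in one word */
    };

    /* Take a reference; never blocks and never touches the lock bit. */
    static void
    obj_hold(struct obj *o)
    {
        unsigned cur = atomic_load(&o->hold);

        while (!atomic_compare_exchange_weak(&o->hold, &cur, cur + 1))
            ;       /* cur was reloaded by the failed CAS; retry */
    }

    /* Take a reference and try to grab the lock bit in the same CAS.
     * Returns false, but still holding a reference, if someone else owns
     * the lock, so the caller can back off and retry safely. */
    static bool
    obj_hold_try(struct obj *o)
    {
        unsigned cur = atomic_load(&o->hold);

        for (;;) {
            if (cur & HOLD_LOCKED) {
                if (atomic_compare_exchange_weak(&o->hold, &cur, cur + 1))
                    return false;
            } else {
                if (atomic_compare_exchange_weak(&o->hold, &cur,
                    (cur + 1) | HOLD_LOCKED))
                    return true;
            }
        }
    }

    /* Drop a reference; returns true when the last reference went away so
     * the caller can free the object.  (pv_drop() folds the free in when
     * the pv is no longer associated with a pmap.) */
    static bool
    obj_drop(struct obj *o)
    {
        unsigned cur = atomic_load(&o->hold);

        while (!atomic_compare_exchange_weak(&o->hold, &cur, cur - 1))
            ;
        return (cur & HOLD_MASK) == 1;
    }

The kernel versions additionally record the calling function and line when
PMAP_DEBUG is enabled, which is what the PMAP_DEBUG_DECL/PMAP_DEBUG_COPY
arguments are for.
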
- * - * Caller must hold pmap->pm_token + * Find or allocate the requested PV entry, returning a locked pv */ static -int -pmap_remove_entry(struct pmap *pmap, vm_page_t m, - vm_offset_t va, pmap_inval_info_t info) +pv_entry_t +_pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) { pv_entry_t pv; - int rtval; + pv_entry_t pnew = NULL; - spin_lock(&pmap_spin); - if (m->md.pv_list_count < pmap->pm_stats.resident_count) { - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - if (pmap == pv->pv_pmap && va == pv->pv_va) - break; + spin_lock(&pmap->pm_spin); + for (;;) { + if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) { + pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, + pindex); } - } else { - TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { - if (va == pv->pv_va) - break; - } - } + if (pv == NULL) { + if (pnew == NULL) { + spin_unlock(&pmap->pm_spin); + pnew = zalloc(pvzone); + spin_lock(&pmap->pm_spin); + continue; + } + pnew->pv_pmap = pmap; + pnew->pv_pindex = pindex; + pnew->pv_hold = PV_HOLD_LOCKED | 1; +#ifdef PMAP_DEBUG + pnew->pv_func = func; + pnew->pv_line = lineno; +#endif + pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); + atomic_add_long(&pmap->pm_stats.resident_count, 1); + spin_unlock(&pmap->pm_spin); + *isnew = 1; + return(pnew); + } + if (pnew) { + spin_unlock(&pmap->pm_spin); + zfree(pvzone, pnew); + pnew = NULL; + spin_lock(&pmap->pm_spin); + continue; + } + if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { + spin_unlock(&pmap->pm_spin); + *isnew = 0; + return(pv); + } + spin_unlock(&pmap->pm_spin); + _pv_lock(pv PMAP_DEBUG_COPY); + if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) { + *isnew = 0; + return(pv); + } + pv_put(pv); + spin_lock(&pmap->pm_spin); + } - rtval = 0; - KKASSERT(pv); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - m->md.pv_generation++; - m->md.pv_list_count--; - vm_page_spin_lock(m); - if (m->object) - atomic_add_int(&m->object->agg_pv_list_count, -1); - vm_page_spin_unlock(m); - KKASSERT(m->md.pv_list_count >= 0); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); - TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); - ++pmap->pm_generation; - spin_unlock(&pmap_spin); +} - rtval = pmap_unwire_pte_hold(pmap, va, pv->pv_ptem, info); - free_pv_entry(pv); +/* + * Find the requested PV entry, returning a locked+held pv or NULL + */ +static +pv_entry_t +_pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL) +{ + pv_entry_t pv; - return rtval; + spin_lock(&pmap->pm_spin); + for (;;) { + /* + * Shortcut cache + */ + if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) { + pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, + pindex); + } + if (pv == NULL) { + spin_unlock(&pmap->pm_spin); + return NULL; + } + if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { + pv_cache(pv, pindex); + spin_unlock(&pmap->pm_spin); + return(pv); + } + spin_unlock(&pmap->pm_spin); + _pv_lock(pv PMAP_DEBUG_COPY); + if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) + return(pv); + pv_put(pv); + spin_lock(&pmap->pm_spin); + } } /* - * Create a pv entry for page at pa for (pmap, va). + * Lookup, hold, and attempt to lock (pmap,pindex). + * + * If the entry does not exist NULL is returned and *errorp is set to 0 * - * Caller must hold pmap token + * If the entry exists and could be successfully locked it is returned and + * errorp is set to 0. + * + * If the entry exists but could NOT be successfully locked it is returned + * held and *errorp is set to 1. 
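
_pv_alloc() has to allocate from the pv zone, which can block, while the
RB tree it inserts into is protected by the pmap spinlock.  The loop
therefore drops the lock around the allocation, retakes it, and repeats
the lookup because the tree may have changed in the meantime; a spare
allocation that lost the race is simply freed.  A toy stand-alone version
of that shape, using a linked list instead of an RB tree and an
atomic_flag instead of a kernel spinlock (none of these names are the
kernel's):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct node {
        unsigned long  index;
        struct node   *next;
    };

    static struct node *head;
    static atomic_flag  table_lock = ATOMIC_FLAG_INIT;

    static void lock(void)   { while (atomic_flag_test_and_set(&table_lock)) ; }
    static void unlock(void) { atomic_flag_clear(&table_lock); }

    static struct node *
    lookup(unsigned long index)
    {
        struct node *n;

        for (n = head; n != NULL; n = n->next)
            if (n->index == index)
                return n;
        return NULL;
    }

    /* Find or insert the node for 'index'.  The allocation may block, so
     * it is never made while the lock is held; after retaking the lock the
     * lookup must be repeated because another thread may have raced the
     * insert.  (malloc() failure handling is elided.) */
    static struct node *
    find_or_alloc(unsigned long index)
    {
        struct node *n, *pnew = NULL;

        lock();
        for (;;) {
            n = lookup(index);
            if (n != NULL)
                break;                  /* found; discard any spare */
            if (pnew == NULL) {
                unlock();
                pnew = malloc(sizeof(*pnew));   /* may block */
                lock();
                continue;               /* table may have changed */
            }
            pnew->index = index;
            pnew->next = head;
            head = pnew;
            n = pnew;
            pnew = NULL;
            break;
        }
        unlock();
        free(pnew);     /* NULL, or the spare that lost the race */
        return n;
    }

_pv_alloc() additionally has to return the pv locked: when the
non-blocking lock attempt on a found entry fails it drops the spinlock,
takes the pv lock blocking, revalidates (pmap, pindex), and starts over,
which is the same pattern _pv_get() uses.
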
*/ static -void -pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) +pv_entry_t +pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp) { pv_entry_t pv; - pv = get_pv_entry(); - pv->pv_va = va; - pv->pv_pmap = pmap; - pv->pv_ptem = mpte; - - spin_lock(&pmap_spin); - TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); - m->md.pv_generation++; - m->md.pv_list_count++; - vm_page_spin_lock(m); - if (m->object) - atomic_add_int(&m->object->agg_pv_list_count, 1); - vm_page_spin_unlock(m); - pmap->pm_generation++; - spin_unlock(&pmap_spin); + spin_lock(&pmap->pm_spin); + if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) + pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); + if (pv == NULL) { + spin_unlock(&pmap->pm_spin); + *errorp = 0; + return NULL; + } + if (pv_hold_try(pv)) { + pv_cache(pv, pindex); + spin_unlock(&pmap->pm_spin); + *errorp = 0; + return(pv); /* lock succeeded */ + } + spin_unlock(&pmap->pm_spin); + *errorp = 1; + return (pv); /* lock failed */ } /* - * pmap_remove_pte: do the things to unmap a page in a process - * - * Caller must hold pmap token - * Caller must hold pmap object + * Find the requested PV entry, returning a held pv or NULL */ static -int -pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, - pmap_inval_info_t info) +pv_entry_t +pv_find(pmap_t pmap, vm_pindex_t pindex) { - pt_entry_t oldpte; - vm_page_t m; - vm_page_t mpte; - vm_pindex_t ptepindex; + pv_entry_t pv; - ASSERT_LWKT_TOKEN_HELD(&pmap->pm_token); + spin_lock(&pmap->pm_spin); - pmap_inval_interlock(info, pmap, va); - oldpte = pte_load_clear(ptq); - pmap_inval_deinterlock(info, pmap); - if (oldpte & PG_W) - pmap->pm_stats.wired_count -= 1; - /* - * Machines that don't support invlpg, also don't support - * PG_G. XXX PG_G is disabled for SMP so don't worry about - * the SMP case. - */ - if (oldpte & PG_G) - cpu_invlpg((void *)va); - KKASSERT(pmap->pm_stats.resident_count > 0); - --pmap->pm_stats.resident_count; - if (oldpte & PG_MANAGED) { - m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); - if (oldpte & PG_M) { -#if defined(PMAP_DIAGNOSTIC) - if (pmap_nw_modified((pt_entry_t) oldpte)) { - kprintf( - "pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", - va, oldpte); + if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) + pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); + if (pv == NULL) { + spin_unlock(&pmap->pm_spin); + return NULL; + } + pv_hold(pv); + pv_cache(pv, pindex); + spin_unlock(&pmap->pm_spin); + return(pv); +} + +/* + * Lock a held pv, keeping the hold count + */ +static +void +_pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) +{ + u_int count; + + for (;;) { + count = pv->pv_hold; + cpu_ccfence(); + if ((count & PV_HOLD_LOCKED) == 0) { + if (atomic_cmpset_int(&pv->pv_hold, count, + count | PV_HOLD_LOCKED)) { +#ifdef PMAP_DEBUG + pv->pv_func = func; + pv->pv_line = lineno; +#endif + return; } + continue; + } + tsleep_interlock(pv, 0); + if (atomic_cmpset_int(&pv->pv_hold, count, + count | PV_HOLD_WAITING)) { +#ifdef PMAP_DEBUG + kprintf("pv waiting on %s:%d\n", + pv->pv_func, pv->pv_line); #endif - if (pmap_track_modified(va)) - vm_page_dirty(m); + tsleep(pv, PINTERLOCKED, "pvwait", hz); } - if (oldpte & PG_A) - vm_page_flag_set(m, PG_REFERENCED); - return pmap_remove_entry(pmap, m, va, info); + /* retry */ } +} - /* - * Unmanaged pages in userspace still wire the PT page, we have - * to look up the mpte for the PDE page and pass it in. 
- */ - if (va < VM_MAX_USER_ADDRESS) { - ptepindex = pmap_pde_pindex(va); - mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex); - KKASSERT(mpte); - } else { - mpte = NULL; +/* + * Unlock a held and locked pv, keeping the hold count. + */ +static +void +pv_unlock(pv_entry_t pv) +{ + u_int count; + + if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 1, 1)) + return; + + for (;;) { + count = pv->pv_hold; + cpu_ccfence(); + KKASSERT((count & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >= + (PV_HOLD_LOCKED | 1)); + if (atomic_cmpset_int(&pv->pv_hold, count, + count & + ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { + if (count & PV_HOLD_WAITING) + wakeup(pv); + break; + } } - return pmap_unwire_pte_hold(pmap, va, mpte, info); } /* - * Remove a single page from a process address space. - * - * This function may not be called from an interrupt if the pmap is - * not kernel_pmap. + * Unlock and drop a pv. If the pv is no longer associated with a pmap + * and the hold count drops to zero we will free it. * - * Caller must hold pmap->pm_token - * Caller must hold pmap object + * Caller should not hold any spin locks. We are protected from hold races + * by virtue of holds only occuring only with a pmap_spin or vm_page_spin + * lock held. A pv cannot be located otherwise. */ static void -pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info) +pv_put(pv_entry_t pv) { - pt_entry_t *pte; + if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 1, 0)) { + if (pv->pv_pmap == NULL) + zfree(pvzone, pv); + return; + } + pv_unlock(pv); + pv_drop(pv); +} - ASSERT_LWKT_TOKEN_HELD(&pmap->pm_token); +/* + * Unlock, drop, and free a pv, destroying it. The pv is removed from its + * pmap. Any pte operations must have already been completed. + */ +static +void +pv_free(pv_entry_t pv) +{ + pmap_t pmap; - pte = pmap_pte(pmap, va); - if (pte == NULL) - return; - if ((*pte & PG_V) == 0) + KKASSERT(pv->pv_m == NULL); + if ((pmap = pv->pv_pmap) != NULL) { + spin_lock(&pmap->pm_spin); + pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); + if (pmap->pm_pvhint == pv) + pmap->pm_pvhint = NULL; + atomic_add_long(&pmap->pm_stats.resident_count, -1); + pv->pv_pmap = NULL; + pv->pv_pindex = 0; + spin_unlock(&pmap->pm_spin); + } + pv_put(pv); +} + +/* + * This routine is very drastic, but can save the system + * in a pinch. + */ +void +pmap_collect(void) +{ + int i; + vm_page_t m; + static int warningdone=0; + + if (pmap_pagedaemon_waken == 0) return; - pmap_remove_pte(pmap, pte, va, info); + pmap_pagedaemon_waken = 0; + if (warningdone < 5) { + kprintf("pmap_collect: collecting pv entries -- " + "suggest increasing PMAP_SHPGPERPROC\n"); + warningdone++; + } + + for (i = 0; i < vm_page_array_size; i++) { + m = &vm_page_array[i]; + if (m->wire_count || m->hold_count) + continue; + if (vm_page_busy_try(m, TRUE) == 0) { + if (m->wire_count == 0 && m->hold_count == 0) { + pmap_remove_all(m); + } + vm_page_wakeup(m); + } + } } /* - * Remove the given range of addresses from the specified map. + * Scan the pmap for active page table entries and issue a callback. + * The callback must dispose of pte_pv. * - * It is assumed that the start and end are properly rounded to the page size. + * NOTE: Unmanaged page table entries will not have a pte_pv * - * This function may not be called from an interrupt if the pmap is not - * kernel_pmap. + * NOTE: Kernel page table entries will not have a pt_pv. That is, wiring + * counts are not tracked in kernel page table pages. 
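
_pv_lock() and pv_unlock() above extend the same hold word with a WAITING
bit: a contender that cannot take the lock advertises itself before going
to sleep, and the unlock path only pays for a wakeup when that bit was
set.  A stand-alone approximation in C11, with a pthread mutex/condvar
standing in for the kernel's tsleep_interlock()/tsleep()/wakeup() channel
(bit values and names are again invented for the sketch):

    #include <pthread.h>
    #include <stdatomic.h>

    #define LOCKBIT 0x80000000u
    #define WAITBIT 0x40000000u

    /* Stand-in for the kernel sleep/wakeup channel keyed on the pv. */
    static pthread_mutex_t sleepq_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  sleepq_cv  = PTHREAD_COND_INITIALIZER;

    struct obj { _Atomic unsigned hold; };

    /* Block until the lock bit can be taken.  Holding sleepq_mtx across
     * "set WAITBIT, then wait" is what prevents a lost wakeup, the same
     * role tsleep_interlock() plays in the kernel code. */
    static void
    obj_lock(struct obj *o)
    {
        unsigned cur;

        for (;;) {
            cur = atomic_load(&o->hold);
            if ((cur & LOCKBIT) == 0) {
                if (atomic_compare_exchange_weak(&o->hold, &cur,
                    cur | LOCKBIT))
                    return;
                continue;
            }
            pthread_mutex_lock(&sleepq_mtx);
            if (atomic_compare_exchange_strong(&o->hold, &cur,
                cur | WAITBIT))
                pthread_cond_wait(&sleepq_cv, &sleepq_mtx);
            pthread_mutex_unlock(&sleepq_mtx);
        }
    }

    /* Release the lock bit, keeping the hold count, and wake waiters only
     * if one advertised itself while we held the lock. */
    static void
    obj_unlock(struct obj *o)
    {
        unsigned cur = atomic_load(&o->hold);

        while (!atomic_compare_exchange_weak(&o->hold, &cur,
            cur & ~(LOCKBIT | WAITBIT)))
            ;
        if (cur & WAITBIT) {
            pthread_mutex_lock(&sleepq_mtx);
            pthread_cond_broadcast(&sleepq_cv);
            pthread_mutex_unlock(&sleepq_mtx);
        }
    }

pv_put() is then just the fast-path combination: a single CAS from
"locked, one hold" straight to zero, falling back to pv_unlock() plus
pv_drop() when that fails.
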
+ * + * It is assumed that the start and end are properly rounded to the page size. */ -void -pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) -{ +static void +pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva, + void (*func)(pmap_t, struct pmap_inval_info *, + pv_entry_t, pv_entry_t, vm_offset_t, + pt_entry_t *, void *), + void *arg) +{ + pv_entry_t pdp_pv; /* A page directory page PV */ + pv_entry_t pd_pv; /* A page directory PV */ + pv_entry_t pt_pv; /* A page table PV */ + pv_entry_t pte_pv; /* A page table entry PV */ + pt_entry_t *ptep; vm_offset_t va_next; - pml4_entry_t *pml4e; - pdp_entry_t *pdpe; - pd_entry_t ptpaddr, *pde; - pt_entry_t *pte; struct pmap_inval_info info; + int error; if (pmap == NULL) return; - vm_object_hold(pmap->pm_pteobj); + /* + * Hold the token for stability; if the pmap is empty we have nothing + * to do. + */ lwkt_gettoken(&pmap->pm_token); +#if 0 if (pmap->pm_stats.resident_count == 0) { lwkt_reltoken(&pmap->pm_token); - vm_object_drop(pmap->pm_pteobj); return; } +#endif pmap_inval_init(&info); /* - * special handling of removing one page. a very - * common operation and easy to short circuit some - * code. + * Special handling for removing one page, which is a very common + * operation (it is?). + * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 */ if (sva + PAGE_SIZE == eva) { - pde = pmap_pde(pmap, sva); - if (pde && (*pde & PG_PS) == 0) { - pmap_remove_page(pmap, sva, &info); - pmap_inval_done(&info); - lwkt_reltoken(&pmap->pm_token); - vm_object_drop(pmap->pm_pteobj); - return; + if (sva >= VM_MAX_USER_ADDRESS) { + /* + * Kernel mappings do not track wire counts on + * page table pages. + */ + pt_pv = NULL; + pte_pv = pv_get(pmap, pmap_pte_pindex(sva)); + ptep = vtopte(sva); + } else { + /* + * User mappings may or may not have a pte_pv but + * will always have a pt_pv if the page is present. + */ + pte_pv = pv_get(pmap, pmap_pte_pindex(sva)); + pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); + if (pt_pv == NULL) { + KKASSERT(pte_pv == NULL); + goto fast_skip; + } + ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); } + if (*ptep == 0) { + KKASSERT(pte_pv == NULL); + } else if (pte_pv) { + KKASSERT((*ptep & (PG_MANAGED|PG_V)) == + (PG_MANAGED|PG_V)); + func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg); + } else { + KKASSERT((*ptep & (PG_MANAGED|PG_V)) == + PG_V); + func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg); + } + if (pt_pv) + pv_put(pt_pv); +fast_skip: + pmap_inval_done(&info); + lwkt_reltoken(&pmap->pm_token); + return; } + /* + * NOTE: kernel mappings do not track page table pages, only + * terminal pages. + * + * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. + * However, for the scan to be efficient we try to + * cache items top-down. 
+ */ + pdp_pv = NULL; + pd_pv = NULL; + pt_pv = NULL; + for (; sva < eva; sva = va_next) { - pml4e = pmap_pml4e(pmap, sva); - if ((*pml4e & PG_V) == 0) { + lwkt_yield(); + if (sva >= VM_MAX_USER_ADDRESS) { + if (pt_pv) { + pv_put(pt_pv); + pt_pv = NULL; + } + goto kernel_skip; + } + + /* + * PDP cache + */ + if (pdp_pv == NULL) { + pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva)); + } else if (pdp_pv->pv_pindex != pmap_pdp_pindex(sva)) { + pv_put(pdp_pv); + pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva)); + } + if (pdp_pv == NULL) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } - pdpe = pmap_pml4e_to_pdpe(pml4e, sva); - if ((*pdpe & PG_V) == 0) { + /* + * PD cache + */ + if (pd_pv == NULL) { + if (pdp_pv) { + pv_put(pdp_pv); + pdp_pv = NULL; + } + pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); + } else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) { + pv_put(pd_pv); + if (pdp_pv) { + pv_put(pdp_pv); + pdp_pv = NULL; + } + pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); + } + if (pd_pv == NULL) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; @@ -2357,55 +2653,186 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) } /* - * Calculate index for next page table. + * PT cache */ - va_next = (sva + NBPDR) & ~PDRMASK; - if (va_next < sva) - va_next = eva; - - pde = pmap_pdpe_to_pde(pdpe, sva); - ptpaddr = *pde; + if (pt_pv == NULL) { + if (pdp_pv) { + pv_put(pdp_pv); + pdp_pv = NULL; + } + if (pd_pv) { + pv_put(pd_pv); + pd_pv = NULL; + } + pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); + } else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) { + if (pdp_pv) { + pv_put(pdp_pv); + pdp_pv = NULL; + } + if (pd_pv) { + pv_put(pd_pv); + pd_pv = NULL; + } + pv_put(pt_pv); + pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); + } /* - * Weed out invalid mappings. + * We will scan or skip a page table page so adjust va_next + * either way. */ - if (ptpaddr == 0) + if (pt_pv == NULL) { + va_next = (sva + NBPDR) & ~PDRMASK; + if (va_next < sva) + va_next = eva; continue; + } /* - * Check for large page. + * From this point in the loop testing pt_pv for non-NULL + * means we are in UVM, else if it is NULL we are in KVM. */ - if ((ptpaddr & PG_PS) != 0) { - /* JG FreeBSD has more complex treatment here */ - pmap_inval_interlock(&info, pmap, -1); - *pde = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - pmap_inval_deinterlock(&info, pmap); - continue; - } +kernel_skip: + va_next = (sva + NBPDR) & ~PDRMASK; + if (va_next < sva) + va_next = eva; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. + * + * Scan the page table for pages. Some pages may not be + * managed (might not have a pv_entry). + * + * There is no page table management for kernel pages so + * pt_pv will be NULL in that case, but otherwise pt_pv + * is non-NULL, locked, and referenced. */ if (va_next > eva) va_next = eva; - /* - * NOTE: pmap_remove_pte() can block. 
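
The PDP/PD/PT caches above pay off because deciding whether the scan has
crossed into a new page-table page is pure address arithmetic: each level
of the 4-level x86-64 page tables consumes nine bits of the virtual
address.  A small sketch of the per-level index extraction for 4KB pages
(the kernel further biases these values into a single global pindex space
per pmap, which is not shown here):

    #include <stdint.h>

    #define LVL_MASK 0x1ffUL                /* 9 bits per level */

    static inline uint64_t pte_idx(uint64_t va) { return (va >> 12) & LVL_MASK; }
    static inline uint64_t pt_idx(uint64_t va)  { return (va >> 21) & LVL_MASK; }
    static inline uint64_t pd_idx(uint64_t va)  { return (va >> 30) & LVL_MASK; }
    static inline uint64_t pdp_idx(uint64_t va) { return (va >> 39) & LVL_MASK; }

    /* Two addresses are served by the same page-table page, and hence by
     * the same cached pt_pv, iff they agree on bit 21 and above. */
    static inline int
    same_pt_page(uint64_t va1, uint64_t va2)
    {
        return (va1 >> 21) == (va2 >> 21);
    }

The recurring va_next computation, (sva + NBPDR) & ~PDRMASK, is the same
arithmetic phrased as "round up to the start of the next 2MB page-table
page".
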
- */ - for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, - sva += PAGE_SIZE) { - if (*pte == 0) + if (pt_pv) + ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); + else + ptep = vtopte(sva); + + while (sva < va_next) { + if (*ptep == 0) { + /* XXX remove me */ + pte_pv = pv_find(pmap, pmap_pte_pindex(sva)); + KKASSERT(pte_pv == NULL); + sva += PAGE_SIZE; + ++ptep; continue; - if (pmap_remove_pte(pmap, pte, sva, &info)) - break; + } + + /* + * We need a locked pte_pv as well and may have to + * loop to retry if we can't get it non-blocking + * while pt_pv is held locked. + * + * This is a bit complicated because once we release + * the pt_pv our ptep is no longer valid, so we have + * to cycle the whole thing. + */ + if (pt_pv) { + pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), + &error); + if (error) { + kprintf("x"); + if (pdp_pv) { + pv_put(pdp_pv); + pdp_pv = NULL; + } + if (pd_pv) { + pv_put(pd_pv); + pd_pv = NULL; + } + pv_put(pt_pv); /* must be non-NULL */ + pt_pv = NULL; + pv_lock(pte_pv); /* safe to block now */ + pv_put(pte_pv); + pte_pv = NULL; + pt_pv = pv_get(pmap, + pmap_pt_pindex(sva)); + continue; + } + } else { + pte_pv = pv_get(pmap, pmap_pte_pindex(sva)); + } + + /* + * Ready for the callback + */ + if (pte_pv) { + KKASSERT((*ptep & (PG_MANAGED|PG_V)) == + (PG_MANAGED|PG_V)); + func(pmap, &info, pte_pv, pt_pv, sva, + ptep, arg); + } else { + KKASSERT((*ptep & (PG_MANAGED|PG_V)) == + PG_V); + func(pmap, &info, pte_pv, pt_pv, sva, + ptep, arg); + } + pte_pv = NULL; /* eaten by callback */ + sva += PAGE_SIZE; + ++ptep; } } + if (pdp_pv) { + pv_put(pdp_pv); + pdp_pv = NULL; + } + if (pd_pv) { + pv_put(pd_pv); + pd_pv = NULL; + } + if (pt_pv) { + pv_put(pt_pv); + pt_pv = NULL; + } pmap_inval_done(&info); lwkt_reltoken(&pmap->pm_token); - vm_object_drop(pmap->pm_pteobj); +} + +void +pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) +{ + pmap_scan(pmap, sva, eva, pmap_remove_callback, NULL); +} + +static void +pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info, + pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va, + pt_entry_t *ptep, void *arg __unused) +{ + pt_entry_t pte; + + if (pte_pv) { + /* + * This will also drop pt_pv's wire_count. Note that + * terminal pages are not wired based on mmu presence. + */ + pmap_remove_pv_pte(pte_pv, pt_pv, info); + pmap_remove_pv_page(pte_pv, 0); + pv_free(pte_pv); + } else { + /* + * pt_pv's wire_count is still bumped by unmanaged pages + * so we must decrement it manually. + */ + pmap_inval_interlock(info, pmap, va); + pte = pte_load_clear(ptep); + pmap_inval_deinterlock(info, pmap); + if (pte & PG_W) + atomic_add_long(&pmap->pm_stats.wired_count, -1); + atomic_add_long(&pmap->pm_stats.resident_count, -1); + if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m)) + panic("pmap_remove: insufficient wirecount"); + } } /* @@ -2419,91 +2846,35 @@ void pmap_remove_all(vm_page_t m) { struct pmap_inval_info info; - pt_entry_t *pte, tpte; pv_entry_t pv; - struct pmap *pmap; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return; pmap_inval_init(&info); - spin_lock(&pmap_spin); + vm_page_spin_lock(m); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { - /* - * We have to be holding the pmap token to interlock - * the pte destruction and pv removal. XXX need hold on - * pmap. 
- */ - pmap = pv->pv_pmap; - spin_unlock(&pmap_spin); - lwkt_gettoken(&pmap->pm_token); /* XXX hold race */ - spin_lock(&pmap_spin); - if (pv != TAILQ_FIRST(&m->md.pv_list)) { - spin_unlock(&pmap_spin); - lwkt_reltoken(&pmap->pm_token); - spin_lock(&pmap_spin); - continue; + KKASSERT(pv->pv_m == m); + if (pv_hold_try(pv)) { + vm_page_spin_unlock(m); + } else { + vm_page_spin_unlock(m); + pv_lock(pv); + if (pv->pv_m != m) { + pv_put(pv); + vm_page_spin_lock(m); + continue; + } } - /* - * Remove the pv + * Holding no spinlocks, pv is locked. */ - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); - m->md.pv_generation++; - m->md.pv_list_count--; + pmap_remove_pv_pte(pv, NULL, &info); + pmap_remove_pv_page(pv, 0); + pv_free(pv); vm_page_spin_lock(m); - if (m->object) - atomic_add_int(&m->object->agg_pv_list_count, -1); - vm_page_spin_unlock(m); - KKASSERT(m->md.pv_list_count >= 0); - ++pv->pv_pmap->pm_generation; - spin_unlock(&pmap_spin); - - /* - * pv is now isolated - */ - KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0); - --pv->pv_pmap->pm_stats.resident_count; - - pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); - pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); - tpte = pte_load_clear(pte); - KKASSERT(tpte & PG_MANAGED); - if (tpte & PG_W) - pv->pv_pmap->pm_stats.wired_count--; - pmap_inval_deinterlock(&info, pv->pv_pmap); - if (tpte & PG_A) - vm_page_flag_set(m, PG_REFERENCED); - - /* - * Update the vm_page_t clean and reference bits. - */ - if (tpte & PG_M) { -#if defined(PMAP_DIAGNOSTIC) - if (pmap_nw_modified(tpte)) { - kprintf("pmap_remove_all: modified page not " - "writable: va: 0x%lx, pte: 0x%lx\n", - pv->pv_va, tpte); - } -#endif - if (pmap_track_modified(pv->pv_va)) - vm_page_dirty(m); /* XXX races(m) */ - } - - spin_lock(&pmap_spin); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); - spin_unlock(&pmap_spin); - - pmap_unwire_pte_hold(pv->pv_pmap, pv->pv_va, - pv->pv_ptem, &info); - lwkt_reltoken(&pv->pv_pmap->pm_token); - - free_pv_entry(pv); - spin_lock(&pmap_spin); } - spin_unlock(&pmap_spin); + vm_page_spin_unlock(m); KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); pmap_inval_done(&info); } @@ -2520,192 +2891,156 @@ pmap_remove_all(vm_page_t m) void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { - vm_offset_t va_next; - pml4_entry_t *pml4e; - pdp_entry_t *pdpe; - pd_entry_t ptpaddr, *pde; - pt_entry_t *pte; - pmap_inval_info info; - /* JG review for NX */ if (pmap == NULL) return; - if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } - if (prot & VM_PROT_WRITE) return; + pmap_scan(pmap, sva, eva, pmap_protect_callback, &prot); +} - lwkt_gettoken(&pmap->pm_token); - pmap_inval_init(&info); - - for (; sva < eva; sva = va_next) { - pml4e = pmap_pml4e(pmap, sva); - if ((*pml4e & PG_V) == 0) { - va_next = (sva + NBPML4) & ~PML4MASK; - if (va_next < sva) - va_next = eva; - continue; - } - - pdpe = pmap_pml4e_to_pdpe(pml4e, sva); - if ((*pdpe & PG_V) == 0) { - va_next = (sva + NBPDP) & ~PDPMASK; - if (va_next < sva) - va_next = eva; - continue; - } - - va_next = (sva + NBPDR) & ~PDRMASK; - if (va_next < sva) - va_next = eva; - - pde = pmap_pdpe_to_pde(pdpe, sva); - ptpaddr = *pde; - - /* - * Check for large page. 
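
The loop above is the lock-ordering idiom used throughout the new code:
while the vm_page spinlock is held the pv may only be try-locked; if that
fails the spinlock must be dropped, the pv lock taken blocking, and the
association revalidated, because the pv can be torn down or retargeted
while neither lock is held.  A compact pthread rendition of the idiom
(the names and types are illustrative, not kernel APIs):

    #include <pthread.h>
    #include <stdbool.h>

    struct item {
        pthread_mutex_t  lock;      /* plays the role of the pv lock */
        void            *owner;     /* plays the role of pv->pv_m */
    };

    /* Called with *list_lock held (the analogue of the vm_page spinlock).
     * Returns true with the item locked and *list_lock released, or false
     * with *list_lock re-held after a failed revalidation, in which case
     * the caller simply restarts its scan of the list. */
    static bool
    lock_item_against(pthread_mutex_t *list_lock, struct item *it,
        void *expect)
    {
        if (pthread_mutex_trylock(&it->lock) == 0) {
            pthread_mutex_unlock(list_lock);
            return true;            /* list_lock was never dropped */
        }
        pthread_mutex_unlock(list_lock);
        pthread_mutex_lock(&it->lock);      /* safe to block now */
        if (it->owner != expect) {
            /* Raced: the item changed while neither lock was held. */
            pthread_mutex_unlock(&it->lock);
            pthread_mutex_lock(list_lock);
            return false;
        }
        return true;
    }

pmap_scan()'s inner loop handles the same situation the other way around:
when pv_get_try() on the pte_pv fails it releases the higher-level PVs,
blocks on the pte_pv, throws it away, and re-fetches pt_pv before
continuing, since the cached ptep is no longer trustworthy.
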
- */ - if ((ptpaddr & PG_PS) != 0) { - pmap_inval_interlock(&info, pmap, -1); - *pde &= ~(PG_M|PG_RW); - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - pmap_inval_deinterlock(&info, pmap); - continue; - } - - /* - * Weed out invalid mappings. Note: we assume that the page - * directory table is always allocated, and in kernel virtual. - */ - if (ptpaddr == 0) - continue; - - if (va_next > eva) - va_next = eva; - - for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, - sva += PAGE_SIZE) { - pt_entry_t pbits; - pt_entry_t cbits; - vm_page_t m; +static +void +pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info, + pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va, + pt_entry_t *ptep, void *arg __unused) +{ + pt_entry_t pbits; + pt_entry_t cbits; + vm_page_t m; - /* - * XXX non-optimal. - */ - pmap_inval_interlock(&info, pmap, sva); + /* + * XXX non-optimal. + */ + pmap_inval_interlock(info, pmap, va); again: - pbits = *pte; - cbits = pbits; - if ((pbits & PG_V) == 0) { - pmap_inval_deinterlock(&info, pmap); - continue; - } - if (pbits & PG_MANAGED) { - m = NULL; - if (pbits & PG_A) { + pbits = *ptep; + cbits = pbits; + if (pte_pv) { + m = NULL; + if (pbits & PG_A) { + m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); + KKASSERT(m == pte_pv->pv_m); + vm_page_flag_set(m, PG_REFERENCED); + cbits &= ~PG_A; + } + if (pbits & PG_M) { + if (pmap_track_modified(pte_pv->pv_pindex)) { + if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); - vm_page_flag_set(m, PG_REFERENCED); - cbits &= ~PG_A; - } - if (pbits & PG_M) { - if (pmap_track_modified(sva)) { - if (m == NULL) - m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); - vm_page_dirty(m); - cbits &= ~PG_M; - } - } - } - cbits &= ~PG_RW; - if (pbits != cbits && - !atomic_cmpset_long(pte, pbits, cbits)) { - goto again; + vm_page_dirty(m); + cbits &= ~PG_M; } - pmap_inval_deinterlock(&info, pmap); } } - pmap_inval_done(&info); - lwkt_reltoken(&pmap->pm_token); + cbits &= ~PG_RW; + if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) { + goto again; + } + pmap_inval_deinterlock(info, pmap); + if (pte_pv) + pv_put(pte_pv); } /* - * Insert the given physical page (p) at - * the specified virtual address (v) in the - * target physical map with the protection requested. - * - * If specified, the page will be wired down, meaning - * that the related pte can not be reclaimed. + * Insert the vm_page (m) at the virtual address (va), replacing any prior + * mapping at that address. Set protection and wiring as requested. * - * NB: This is the only routine which MAY NOT lazy-evaluate - * or lose information. That is, this routine must actually - * insert this page into the given map NOW. + * NOTE: This routine MUST insert the page into the pmap now, it cannot + * lazy-evaluate. 
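
pmap_protect_callback() above demotes a mapping to read-only with a
compare-and-swap retry loop so that accessed/dirty bits the MMU may be
setting concurrently are never lost, harvesting those bits into the
vm_page on the way out.  The core of that loop reduced to portable C11;
the bit values are the standard x86 PTE bits, the function name is made
up, and the real callback additionally runs under the pmap invalidation
interlock and only clears the dirty bit for mappings whose modifications
are tracked:

    #include <stdatomic.h>
    #include <stdint.h>

    #define X86_PG_RW 0x002UL       /* writable */
    #define X86_PG_A  0x020UL       /* accessed */
    #define X86_PG_M  0x040UL       /* modified (dirty) */

    /* Clear RW (and the A/M bits being harvested) with a CAS loop and
     * return whichever A/M bits were set so the caller can fold them into
     * the page's referenced/dirty state. */
    static uint64_t
    pte_make_readonly(_Atomic uint64_t *ptep)
    {
        uint64_t pbits, cbits;

        pbits = atomic_load(ptep);
        do {
            cbits = pbits & ~(X86_PG_RW | X86_PG_A | X86_PG_M);
        } while (!atomic_compare_exchange_weak(ptep, &pbits, cbits));
        return pbits & (X86_PG_A | X86_PG_M);
    }
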
*/ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) -{ - vm_paddr_t pa; - pd_entry_t *pde; - pt_entry_t *pte; +{ + pmap_inval_info info; + pv_entry_t pt_pv; /* page table */ + pv_entry_t pte_pv; /* page table entry */ + pt_entry_t *ptep; vm_paddr_t opa; pt_entry_t origpte, newpte; - vm_page_t mpte; - pmap_inval_info info; + vm_paddr_t pa; if (pmap == NULL) return; - va = trunc_page(va); #ifdef PMAP_DIAGNOSTIC if (va >= KvaEnd) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) - panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); + panic("pmap_enter: invalid to pmap_enter page table " + "pages (va: 0x%lx)", va); #endif if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { - kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n"); + kprintf("Warning: pmap_enter called on UVA with " + "kernel_pmap\n"); #ifdef DDB db_print_backtrace(); #endif } if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { - kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n"); + kprintf("Warning: pmap_enter called on KVA without" + "kernel_pmap\n"); #ifdef DDB db_print_backtrace(); #endif } - vm_object_hold(pmap->pm_pteobj); - lwkt_gettoken(&pmap->pm_token); - /* - * In the case that a page table page is not - * resident, we are creating it here. - */ - if (va < VM_MAX_USER_ADDRESS) - mpte = pmap_allocpte(pmap, va); - else - mpte = NULL; + * Get locked PV entries for our new page table entry (pte_pv) + * and for its parent page table (pt_pv). We need the parent + * so we can resolve the location of the ptep. + * + * Only hardware MMU actions can modify the ptep out from + * under us. + * + * if (m) is fictitious or unmanaged we do not create a managing + * pte_pv for it. Any pre-existing page's management state must + * match (avoiding code complexity). + * + * If the pmap is still being initialized we assume existing + * page tables. + * + * Kernel mapppings do not track page table pages (i.e. pt_pv). + * pmap_allocpte() checks the + */ + if (pmap_initialized == FALSE) { + pte_pv = NULL; + pt_pv = NULL; + ptep = vtopte(va); + } else if (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) { + pte_pv = NULL; + if (va >= VM_MAX_USER_ADDRESS) { + pt_pv = NULL; + ptep = vtopte(va); + } else { + pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL); + ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); + } + KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED) == 0); + } else { + if (va >= VM_MAX_USER_ADDRESS) { + pt_pv = NULL; + pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL); + ptep = vtopte(va); + } else { + pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), + &pt_pv); + ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); + } + KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED)); + } if ((prot & VM_PROT_NOSYNC) == 0) pmap_inval_init(&info); - pde = pmap_pde(pmap, va); - if (pde != NULL && (*pde & PG_V) != 0) { - if ((*pde & PG_PS) != 0) - panic("pmap_enter: attempted pmap_enter on 2MB page"); - pte = pmap_pde_to_pte(pde, va); - } else { - panic("pmap_enter: invalid page directory va=%#lx", va); - } - KKASSERT(pte != NULL); pa = VM_PAGE_TO_PHYS(m); - origpte = *pte; + origpte = *ptep; opa = origpte & PG_FRAME; /* @@ -2718,87 +3053,94 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. 
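
The pte_pv/pt_pv selection above encodes a small policy: early in boot
everything goes through vtopte(), unmanaged or fictitious pages never get
a managing pte_pv, and only user addresses carry a pt_pv because kernel
page-table pages are not wire-counted through PVs.  The same policy as a
tiny illustrative helper (not a kernel function):

    #include <stdbool.h>

    struct enter_pvs {
        bool want_pte_pv;   /* managing PV for the terminal page */
        bool want_pt_pv;    /* PV for the page-table page holding the pte */
    };

    static struct enter_pvs
    enter_pv_policy(bool pmap_ready, bool page_managed, bool kernel_va)
    {
        struct enter_pvs p = { false, false };

        if (!pmap_ready)
            return p;               /* early boot: raw vtopte() access */
        if (page_managed)
            p.want_pte_pv = true;   /* track the terminal mapping */
        if (!kernel_va)
            p.want_pt_pv = true;    /* user PTs are wired via their PV */
        return p;
    }

When a pt_pv is wanted the pte pointer comes from pv_pte_lookup() on the
page-table page's PV; otherwise vtopte() is used directly.
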
*/ + KKASSERT(pte_pv == NULL || m == pte_pv->pv_m); if (wired && ((origpte & PG_W) == 0)) - pmap->pm_stats.wired_count++; + atomic_add_long(&pmap->pm_stats.wired_count, 1); else if (!wired && (origpte & PG_W)) - pmap->pm_stats.wired_count--; + atomic_add_long(&pmap->pm_stats.wired_count, -1); #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified(origpte)) { - kprintf( - "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", - va, origpte); + kprintf("pmap_enter: modified page not writable: " + "va: 0x%lx, pte: 0x%lx\n", va, origpte); } #endif - /* - * Remove the extra pte reference. Note that we cannot - * optimize the RO->RW case because we have adjusted the - * wiring count above and may need to adjust the wiring - * bits below. - */ - if (mpte) - vm_page_unwire_quick(mpte); - /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ - if (origpte & PG_MANAGED) { - if ((origpte & PG_M) && pmap_track_modified(va)) { + if (pte_pv) { + if ((origpte & PG_M) && + pmap_track_modified(pte_pv->pv_pindex)) { vm_page_t om; - om = PHYS_TO_VM_PAGE(opa); + om = pte_pv->pv_m; + KKASSERT(PHYS_TO_VM_PAGE(opa) == om); vm_page_dirty(om); } pa |= PG_MANAGED; - KKASSERT(m->flags & PG_MAPPED); } goto validate; } + /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. - */ - while (opa) { - int err; - err = pmap_remove_pte(pmap, pte, va, &info); - if (err) - panic("pmap_enter: pte vanished, va: 0x%lx", va); - origpte = *pte; - opa = origpte & PG_FRAME; - if (opa) { - kprintf("pmap_enter: Warning, raced pmap %p va %p\n", - pmap, (void *)va); + * + * We always interlock pte removals. + */ + if (opa) { + if (pte_pv) { + /* XXX pmap_remove_pv_pte() unwires pt_pv */ + vm_page_wire_quick(pt_pv->pv_m); + if (prot & VM_PROT_NOSYNC) + pmap_remove_pv_pte(pte_pv, pt_pv, NULL); + else + pmap_remove_pv_pte(pte_pv, pt_pv, &info); + if (pte_pv->pv_m) + pmap_remove_pv_page(pte_pv, 0); + } else if (prot & VM_PROT_NOSYNC) { + *ptep = 0; + cpu_invlpg((void *)va); + atomic_add_long(&pmap->pm_stats.resident_count, -1); + } else { + pmap_inval_interlock(&info, pmap, va); + *ptep = 0; + pmap_inval_deinterlock(&info, pmap); + atomic_add_long(&pmap->pm_stats.resident_count, -1); } + KKASSERT(*ptep == 0); } /* - * Enter on the PV list if part of our managed memory. Note that we - * raise IPL while manipulating pv_table since pmap_enter can be - * called at interrupt time. - * - * The new mapping covers mpte's new wiring count so we don't - * unwire it. + * Enter on the PV list if part of our managed memory. Wiring is + * handled automatically. */ - if (pmap_initialized && - (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { - pmap_insert_entry(pmap, va, mpte, m); - pa |= PG_MANAGED; + if (pte_pv) { + KKASSERT(pte_pv->pv_m == NULL); + vm_page_spin_lock(m); + pte_pv->pv_m = m; + TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list); + /* + if (m->object) + atomic_add_int(&m->object->agg_pv_list_count, 1); + */ vm_page_flag_set(m, PG_MAPPED); + vm_page_spin_unlock(m); + pa |= PG_MANAGED; } /* * Increment counters */ - ++pmap->pm_stats.resident_count; if (wired) - pmap->pm_stats.wired_count++; + atomic_add_long(&pmap->pm_stats.wired_count, 1); validate: /* * Now validate mapping with desired protection/wiring. 
*/ - newpte = (pt_entry_t) (pa | pte_prot(pmap, prot) | PG_V); + newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; @@ -2808,25 +3150,41 @@ validate: newpte |= pgeflag; /* - * if the mapping or permission bits are different, we need + * If the mapping or permission bits are different, we need * to update the pte. + * + * We do not have to interlock pte insertions as no other + * cpu will have a TLB entry. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { +#if 0 if ((prot & VM_PROT_NOSYNC) == 0) pmap_inval_interlock(&info, pmap, va); - *pte = newpte | PG_A; +#endif + *ptep = newpte | PG_A; + cpu_invlpg((void *)va); +#if 0 if (prot & VM_PROT_NOSYNC) cpu_invlpg((void *)va); else pmap_inval_deinterlock(&info, pmap); +#endif if (newpte & PG_RW) vm_page_flag_set(m, PG_WRITEABLE); + if (pte_pv == NULL) + atomic_add_long(&pmap->pm_stats.resident_count, 1); } KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED)); if ((prot & VM_PROT_NOSYNC) == 0) pmap_inval_done(&info); - lwkt_reltoken(&pmap->pm_token); - vm_object_drop(pmap->pm_pteobj); + + /* + * Cleanup the pv entry, allowing other accessors. + */ + if (pte_pv) + pv_put(pte_pv); + if (pt_pv) + pv_put(pt_pv); } /* @@ -2839,91 +3197,7 @@ validate: void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) { - pt_entry_t *pte; - vm_paddr_t pa; - vm_page_t mpte; - pmap_inval_info info; - - lwkt_gettoken(&pmap->pm_token); - vm_object_hold(pmap->pm_pteobj); - pmap_inval_init(&info); - - if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { - kprintf("Warning: pmap_enter_quick called on UVA with" - "kernel_pmap\n"); -#ifdef DDB - db_print_backtrace(); -#endif - } - if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { - kprintf("Warning: pmap_enter_quick called on KVA without" - "kernel_pmap\n"); -#ifdef DDB - db_print_backtrace(); -#endif - } - - KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */ - - /* - * Calculate the page table page (mpte), allocating it if necessary. - * - * A wired page table page (mpte), or NULL, is passed onto the - * section following. - */ - if (va < VM_MAX_USER_ADDRESS) { - mpte = pmap_allocpte(pmap, va); - } else { - mpte = NULL; - /* this code path is not yet used */ - } - - /* - * With a valid (and held) page directory page, we can just use - * vtopte() to get to the pte. If the pte is already present - * we do not disturb it. - */ - pte = vtopte(va); - if (*pte & PG_V) { - pa = VM_PAGE_TO_PHYS(m); - KKASSERT(((*pte ^ pa) & PG_FRAME) == 0); - pmap_inval_done(&info); - if (mpte) - pmap_unwire_pte_hold(pmap, va, mpte, &info); - vm_object_drop(pmap->pm_pteobj); - lwkt_reltoken(&pmap->pm_token); - return; - } - - /* - * Enter on the PV list if part of our managed memory. - * - * The new mapping covers mpte's new wiring count so we don't - * unwire it. 
- */ - if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { - pmap_insert_entry(pmap, va, mpte, m); - vm_page_flag_set(m, PG_MAPPED); - } - - /* - * Increment counters - */ - ++pmap->pm_stats.resident_count; - - pa = VM_PAGE_TO_PHYS(m); - - /* - * Now validate mapping with RO protection - */ - if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) - *pte = pa | PG_V | PG_U; - else - *pte = pa | PG_V | PG_U | PG_MANAGED; -/* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */ - pmap_inval_done(&info); - vm_object_drop(pmap->pm_pteobj); - lwkt_reltoken(&pmap->pm_token); + pmap_enter(pmap, va, m, VM_PROT_READ, FALSE); } /* @@ -2979,7 +3253,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, return; } - if (psize + pindex > object->size) { + if (pindex + psize > object->size) { if (object->size < pindex) return; psize = object->size - pindex; @@ -3001,7 +3275,6 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, info.mpte = NULL; info.addr = addr; info.pmap = pmap; - info.desired = 0; vm_object_hold(object); vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, @@ -3035,58 +3308,52 @@ pmap_object_init_pt_callback(vm_page_t p, void *data) info->addr + x86_64_ptob(rel_index), p); } vm_page_wakeup(p); - pmap_auto_yield(info); return(0); } /* - * Return TRUE if the pmap is in shape to trivially - * pre-fault the specified address. + * Return TRUE if the pmap is in shape to trivially pre-fault the specified + * address. * - * Returns FALSE if it would be non-trivial or if a - * pte is already loaded into the slot. + * Returns FALSE if it would be non-trivial or if a pte is already loaded + * into the slot. */ int pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; - pd_entry_t *pde; - int ret; - lwkt_gettoken(&pmap->pm_token); - pde = pmap_pde(pmap, addr); - if (pde == NULL || *pde == 0) { - ret = 0; - } else { - pte = vtopte(addr); - ret = (*pte) ? 0 : 1; + spin_lock(&pmap->pm_spin); + if ((pte = pmap_pte(pmap, addr)) != NULL) { + if (*pte & PG_V) { + spin_unlock(&pmap->pm_spin); + return FALSE; + } } - lwkt_reltoken(&pmap->pm_token); - return(ret); + spin_unlock(&pmap->pm_spin); + return TRUE; } /* - * Routine: pmap_change_wiring - * Function: Change the wiring attribute for a map/virtual-address - * pair. - * In/out conditions: - * The mapping must already exist in the pmap. + * Change the wiring attribute for a pmap/va pair. The mapping must already + * exist in the pmap. The mapping may or may not be managed. 
*/ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { - pt_entry_t *pte; + pt_entry_t *ptep; + pv_entry_t pv; if (pmap == NULL) return; - lwkt_gettoken(&pmap->pm_token); - pte = pmap_pte(pmap, va); + pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL); + ptep = pv_pte_lookup(pv, pmap_pte_index(va)); - if (wired && !pmap_pte_w(pte)) - pmap->pm_stats.wired_count++; - else if (!wired && pmap_pte_w(pte)) - pmap->pm_stats.wired_count--; + if (wired && !pmap_pte_w(ptep)) + atomic_add_long(&pmap->pm_stats.wired_count, 1); + else if (!wired && pmap_pte_w(ptep)) + atomic_add_long(&pmap->pm_stats.wired_count, -1); /* * Wiring is not a hardware characteristic so there is no need to @@ -3097,15 +3364,16 @@ pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) */ #ifdef SMP if (wired) - atomic_set_long(pte, PG_W); + atomic_set_long(ptep, PG_W); else - atomic_clear_long(pte, PG_W); + atomic_clear_long(ptep, PG_W); #else if (wired) - atomic_set_long_nonlocked(pte, PG_W); + atomic_set_long_nonlocked(ptep, PG_W); else - atomic_clear_long_nonlocked(pte, PG_W); + atomic_clear_long_nonlocked(ptep, PG_W); #endif + pv_put(pv); lwkt_reltoken(&pmap->pm_token); } @@ -3121,147 +3389,6 @@ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { - return; -#if 0 - pmap_inval_info info; - vm_offset_t addr; - vm_offset_t end_addr = src_addr + len; - vm_offset_t pdnxt; - pd_entry_t src_frame, dst_frame; - vm_page_t m; - - if (dst_addr != src_addr) - return; -#if JGPMAP32 - src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME; - if (src_frame != (PTDpde & PG_FRAME)) { - return; - } - - dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME; - if (dst_frame != (APTDpde & PG_FRAME)) { - APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); - /* The page directory is not shared between CPUs */ - cpu_invltlb(); - } -#endif - pmap_inval_init(&info); - pmap_inval_add(&info, dst_pmap, -1); - pmap_inval_add(&info, src_pmap, -1); - - lwkt_gettoken(&src_pmap->pm_token); - lwkt_gettoken(&dst_pmap->pm_token); - for (addr = src_addr; addr < end_addr; addr = pdnxt) { - pt_entry_t *src_pte, *dst_pte; - vm_page_t dstmpte, srcmpte; - vm_offset_t srcptepaddr; - vm_pindex_t ptepindex; - - if (addr >= UPT_MIN_ADDRESS) - panic("pmap_copy: invalid to pmap_copy page tables\n"); - - /* - * Don't let optional prefaulting of pages make us go - * way below the low water mark of free pages or way - * above high water mark of used pv entries. 
- */ - if (vmstats.v_free_count < vmstats.v_free_reserved || - pv_entry_count > pv_entry_high_water) - break; - - pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); - ptepindex = addr >> PDRSHIFT; - -#if JGPMAP32 - srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; -#endif - if (srcptepaddr == 0) - continue; - - if (srcptepaddr & PG_PS) { -#if JGPMAP32 - if (dst_pmap->pm_pdir[ptepindex] == 0) { - dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; - dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; - } -#endif - continue; - } - - /* - * - */ - srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); - if (srcmpte == NULL || srcmpte->wire_count == 1 || - (srcmpte->flags & PG_BUSY)) { - continue; - } - - if (pdnxt > end_addr) - pdnxt = end_addr; - - src_pte = vtopte(addr); -#if JGPMAP32 - dst_pte = avtopte(addr); -#endif - while (addr < pdnxt) { - pt_entry_t ptetemp; - - ptetemp = *src_pte; - /* - * we only virtual copy managed pages - */ - if ((ptetemp & PG_MANAGED) != 0) { - /* - * We have to check after allocpte for the - * pte still being around... allocpte can - * block. - * - * pmap_allocpte() can block. If we lose - * our page directory mappings we stop. - */ - dstmpte = pmap_allocpte(dst_pmap, addr); - -#if JGPMAP32 - if (src_frame != (PTDpde & PG_FRAME) || - dst_frame != (APTDpde & PG_FRAME) - ) { - kprintf("WARNING: pmap_copy: detected and corrected race\n"); - pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); - goto failed; - } else if ((*dst_pte == 0) && - (ptetemp = *src_pte) != 0 && - (ptetemp & PG_MANAGED)) { - /* - * Clear the modified and - * accessed (referenced) bits - * during the copy. - */ - m = PHYS_TO_VM_PAGE(ptetemp); - *dst_pte = ptetemp & ~(PG_M | PG_A); - ++dst_pmap->pm_stats.resident_count; - pmap_insert_entry(dst_pmap, addr, - dstmpte, m); - KKASSERT(m->flags & PG_MAPPED); - } else { - kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n"); - pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); - goto failed; - } -#endif - if (dstmpte->hold_count >= srcmpte->hold_count) - break; - } - addr += PAGE_SIZE; - src_pte++; - dst_pte++; - } - } -failed: - lwkt_reltoken(&dst_pmap->pm_token); - lwkt_reltoken(&src_pmap->pm_token); - pmap_inval_done(&info); -#endif } /* @@ -3353,11 +3480,10 @@ pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) } /* - * Returns true if the pmap's pv is one of the first - * 16 pvs linked to from this page. This count may - * be changed upwards or downwards in the future; it - * is only necessary that true be returned for a small - * subset of pmaps for proper page aging. + * Returns true if the pmap's pv is one of the first 16 pvs linked to from + * this page. This count may be changed upwards or downwards in the future; + * it is only necessary that true be returned for a small subset of pmaps + * for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) @@ -3368,167 +3494,34 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; - spin_lock(&pmap_spin); + vm_page_spin_lock(m); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { - spin_unlock(&pmap_spin); + vm_page_spin_unlock(m); return TRUE; } loops++; if (loops >= 16) break; } - spin_unlock(&pmap_spin); + vm_page_spin_unlock(m); return (FALSE); } /* * Remove all pages from specified address space this aids process exit - * speeds. 
Also, this code is special cased for current process only, but - * can have the more generic (and slightly slower) mode enabled. This - * is much faster than pmap_remove in the case of running down an entire - * address space. + * speeds. Also, this code may be special cased for the current process + * only. */ void pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { - struct lwp *lp; - pt_entry_t *pte, tpte; - pv_entry_t pv, npv; - vm_page_t m; - vm_offset_t va; - pmap_inval_info info; - int iscurrentpmap; - int save_generation; - - lp = curthread->td_lwp; - if (lp && pmap == vmspace_pmap(lp->lwp_vmspace)) - iscurrentpmap = 1; - else - iscurrentpmap = 0; - - if (pmap->pm_pteobj) - vm_object_hold(pmap->pm_pteobj); - lwkt_gettoken(&pmap->pm_token); - pmap_inval_init(&info); - - spin_lock(&pmap_spin); - for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { - /* - * Validate the pv. We have to interlock the address with - * pmap_spin unlocked. - */ - if (pv->pv_va >= eva || pv->pv_va < sva) { - npv = TAILQ_NEXT(pv, pv_plist); - continue; - } - - KKASSERT(pmap == pv->pv_pmap); - if (iscurrentpmap) - pte = vtopte(pv->pv_va); - else - pte = pmap_pte_quick(pmap, pv->pv_va); - - /* - * We cannot remove wired pages from a process' mapping - * at this time. This does not require an invaldiation - * interlock as PG_W cannot be set by the MMU. - */ - if (*pte & PG_W) { - npv = TAILQ_NEXT(pv, pv_plist); - continue; - } - - /* - * Interlock the pte so we can safely remove it - */ - save_generation = pmap->pm_generation; - va = pv->pv_va; - spin_unlock(&pmap_spin); - - pmap_inval_interlock(&info, pmap, va); - - /* - * Restart the scan if the pv list changed out from under us. - */ - spin_lock(&pmap_spin); - if (save_generation != pmap->pm_generation) { - spin_unlock(&pmap_spin); - pmap_inval_deinterlock(&info, pmap); - kprintf("Warning: pmap_remove_pages race-A avoided\n"); - spin_lock(&pmap_spin); - npv = TAILQ_FIRST(&pmap->pm_pvlist); - continue; - } - KKASSERT(pmap == pv->pv_pmap && va == pv->pv_va); - - /* - * Extract the pte and clear its memory - */ - tpte = pte_load_clear(pte); - KKASSERT(tpte & PG_MANAGED); - - m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); - KASSERT(m < &vm_page_array[vm_page_array_size], - ("pmap_remove_pages: bad tpte %lx", tpte)); - - /* - * Remove the entry, set npv - */ - npv = TAILQ_NEXT(pv, pv_plist); - TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); - m->md.pv_generation++; - m->md.pv_list_count--; - vm_page_spin_lock(m); - if (m->object) - atomic_add_int(&m->object->agg_pv_list_count, -1); - vm_page_spin_unlock(m); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); - save_generation = ++pmap->pm_generation; - - spin_unlock(&pmap_spin); - - /* - * Adjust the pmap and cleanup the tpte and related vm_page - */ - KKASSERT(pmap->pm_stats.resident_count > 0); - --pmap->pm_stats.resident_count; - pmap_inval_deinterlock(&info, pmap); - - /* - * Update the vm_page_t clean and reference bits. - */ - if (tpte & PG_M) { - vm_page_dirty(m); - } - - pmap_unwire_pte_hold(pmap, pv->pv_va, pv->pv_ptem, &info); - free_pv_entry(pv); - - /* - * Restart the scan if we blocked during the unuse or free - * calls and other removals were made. 
- */ - spin_lock(&pmap_spin); - if (save_generation != pmap->pm_generation) { - kprintf("Warning: pmap_remove_pages race-A avoided\n"); - npv = TAILQ_FIRST(&pmap->pm_pvlist); - } - } - spin_unlock(&pmap_spin); - pmap_inval_done(&info); - lwkt_reltoken(&pmap->pm_token); - if (pmap->pm_pteobj) - vm_object_drop(pmap->pm_pteobj); + pmap_remove(pmap, sva, eva); } /* * pmap_testbit tests bits in pte's note that the testbit/clearbit * routines are inline, and a lot of things compile-time evaluate. - * - * Caller must hold pmap_spin */ static boolean_t @@ -3542,6 +3535,11 @@ pmap_testbit(vm_page_t m, int bit) if (TAILQ_FIRST(&m->md.pv_list) == NULL) return FALSE; + vm_page_spin_lock(m); + if (TAILQ_FIRST(&m->md.pv_list) == NULL) { + vm_page_spin_unlock(m); + return FALSE; + } TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* @@ -3550,7 +3548,7 @@ pmap_testbit(vm_page_t m, int bit) * modified. */ if (bit & (PG_A|PG_M)) { - if (!pmap_track_modified(pv->pv_va)) + if (!pmap_track_modified(pv->pv_pindex)) continue; } @@ -3560,29 +3558,31 @@ pmap_testbit(vm_page_t m, int bit) continue; } #endif - pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); - if (*pte & bit) + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); + if (*pte & bit) { + vm_page_spin_unlock(m); return TRUE; + } } + vm_page_spin_unlock(m); return (FALSE); } /* * This routine is used to modify bits in ptes * - * Caller must NOT hold pmap_spin + * Caller must NOT hold any spin locks */ static __inline void pmap_clearbit(vm_page_t m, int bit) { struct pmap_inval_info info; - int save_generation; - vm_offset_t save_va; - struct pmap *save_pmap; pv_entry_t pv; pt_entry_t *pte; pt_entry_t pbits; + vm_pindex_t save_pindex; + pmap_t save_pmap; if (bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); @@ -3596,14 +3596,14 @@ pmap_clearbit(vm_page_t m, int bit) * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ - spin_lock(&pmap_spin); + vm_page_spin_lock(m); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (bit == PG_RW) { - if (!pmap_track_modified(pv->pv_va)) + if (!pmap_track_modified(pv->pv_pindex)) continue; } @@ -3623,21 +3623,22 @@ restart: * PG_M even for PTEs generated via virtual memory maps, * because the virtual kernel will invalidate the pmap * entry when/if it needs to resynchronize the Modify bit. - * - * We have to restart our scan if m->md.pv_generation changes - * on us. 
*/ if (bit & PG_RW) { - save_generation = m->md.pv_generation; save_pmap = pv->pv_pmap; - save_va = pv->pv_va; - spin_unlock(&pmap_spin); - pmap_inval_interlock(&info, save_pmap, save_va); - spin_lock(&pmap_spin); - if (save_generation != m->md.pv_generation) + save_pindex = pv->pv_pindex; + pv_hold(pv); + vm_page_spin_unlock(m); + pmap_inval_interlock(&info, save_pmap, + (vm_offset_t)save_pindex << PAGE_SHIFT); + vm_page_spin_lock(m); + if (pv->pv_pmap == NULL) { + pv_drop(pv); goto restart; + } + pv_drop(pv); } - pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); again: pbits = *pte; if (pbits & bit) { @@ -3670,16 +3671,19 @@ again: } } if (bit & PG_RW) { - save_generation = m->md.pv_generation; save_pmap = pv->pv_pmap; - spin_unlock(&pmap_spin); + pv_hold(pv); + vm_page_spin_unlock(m); pmap_inval_deinterlock(&info, save_pmap); - spin_lock(&pmap_spin); - if (save_generation != m->md.pv_generation) + vm_page_spin_lock(m); + if (pv->pv_pmap == NULL) { + pv_drop(pv); goto restart; + } + pv_drop(pv); } } - spin_unlock(&pmap_spin); + vm_page_spin_unlock(m); pmap_inval_done(&info); } @@ -3726,43 +3730,30 @@ pmap_phys_address(vm_pindex_t ppn) int pmap_ts_referenced(vm_page_t m) { - pv_entry_t pv, pvf, pvn; + pv_entry_t pv; pt_entry_t *pte; int rtval = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rtval); - spin_lock(&pmap_spin); - if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { - pvf = pv; - do { - pvn = TAILQ_NEXT(pv, pv_list); - - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); - /*++pv->pv_pmap->pm_generation; not needed */ - - if (!pmap_track_modified(pv->pv_va)) - continue; - - pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); - - if (pte && (*pte & PG_A)) { + vm_page_spin_lock(m); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (!pmap_track_modified(pv->pv_pindex)) + continue; + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); + if (pte && (*pte & PG_A)) { #ifdef SMP - atomic_clear_long(pte, PG_A); + atomic_clear_long(pte, PG_A); #else - atomic_clear_long_nonlocked(pte, PG_A); + atomic_clear_long_nonlocked(pte, PG_A); #endif - rtval++; - if (rtval > 4) { - break; - } - } - } while ((pv = pvn) != NULL && pv != pvf); + rtval++; + if (rtval > 4) + break; + } } - spin_unlock(&pmap_spin); - + vm_page_spin_unlock(m); return (rtval); } @@ -3777,9 +3768,7 @@ pmap_is_modified(vm_page_t m) { boolean_t res; - spin_lock(&pmap_spin); res = pmap_testbit(m, PG_M); - spin_unlock(&pmap_spin); return (res); } -- 2.41.0