* Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
* Copyright (c) 2008, 2009 The DragonFly Project.
* Copyright (c) 2008, 2009 Jordan Gordeev.
+ * Copyright (c) 2011 Matthew Dillon
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
#define MINPV 2048
/*
+ * pmap debugging will report who owns a pv lock when blocking.
+ */
+#ifdef PMAP_DEBUG
+
+#define PMAP_DEBUG_DECL ,const char *func, int lineno
+#define PMAP_DEBUG_ARGS , __func__, __LINE__
+#define PMAP_DEBUG_COPY , func, lineno
+
+#define pv_get(pmap, pindex) _pv_get(pmap, pindex \
+ PMAP_DEBUG_ARGS)
+#define pv_lock(pv) _pv_lock(pv \
+ PMAP_DEBUG_ARGS)
+#define pv_hold_try(pv) _pv_hold_try(pv \
+ PMAP_DEBUG_ARGS)
+#define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp \
+ PMAP_DEBUG_ARGS)
+
+#else
+
+#define PMAP_DEBUG_DECL
+#define PMAP_DEBUG_ARGS
+#define PMAP_DEBUG_COPY
+
+#define pv_get(pmap, pindex) _pv_get(pmap, pindex)
+#define pv_lock(pv) _pv_lock(pv)
+#define pv_hold_try(pv) _pv_hold_try(pv)
+#define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp)
+
+#endif
+
+/*
* Get PDEs and PTEs for user/kernel address space
*/
-static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va);
#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
#define pmap_pde_v(pte) ((*(pd_entry_t *)pte & PG_V) != 0)
#define pmap_pte_u(pte) ((*(pt_entry_t *)pte & PG_A) != 0)
#define pmap_pte_v(pte) ((*(pt_entry_t *)pte & PG_V) != 0)
-
/*
* Given a map and a machine independent protection code,
* convert to a vax protection code.
static int pgeflag; /* PG_G or-in */
static int pseflag; /* PG_PS or-in */
-static vm_object_t kptobj;
-
static int ndmpdp;
static vm_paddr_t dmaplimit;
static int nkpt;
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
-static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
+static int pv_entry_max=0, pv_entry_high_water=0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;
#define DISABLE_PSE
-static pv_entry_t get_pv_entry (void);
+static void pv_hold(pv_entry_t pv);
+static int _pv_hold_try(pv_entry_t pv
+ PMAP_DEBUG_DECL);
+static void pv_drop(pv_entry_t pv);
+static void _pv_lock(pv_entry_t pv
+ PMAP_DEBUG_DECL);
+static void pv_unlock(pv_entry_t pv);
+static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
+ PMAP_DEBUG_DECL);
+static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex
+ PMAP_DEBUG_DECL);
+static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp);
+static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex);
+static void pv_put(pv_entry_t pv);
+static void pv_free(pv_entry_t pv);
+static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
+static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
+ pv_entry_t *pvpp);
+static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
+ struct pmap_inval_info *info);
+static vm_page_t pmap_remove_pv_page(pv_entry_t pv, int holdpg);
+
+static void pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
+ pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
+ pt_entry_t *ptep, void *arg __unused);
+static void pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
+ pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
+ pt_entry_t *ptep, void *arg __unused);
+
static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
-static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
- vm_offset_t sva, pmap_inval_info_t info);
-static void pmap_remove_page (struct pmap *pmap,
- vm_offset_t va, pmap_inval_info_t info);
-static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
- vm_offset_t va, pmap_inval_info_t info);
static boolean_t pmap_testbit (vm_page_t m, int bit);
-static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
- vm_page_t mpte, vm_page_t m);
-
-static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);
-static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
-static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
-static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
-static int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
- pmap_inval_info_t info);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
static unsigned pdir4mb;
+/*
+ * RB tree comparator for pv_entry's, keyed on pv_pindex.
+ *
+ * Returns 0 when both entries carry the same pv_pindex, -1 when pv1
+ * sorts before pv2 and 1 when it sorts after.
+ */
+static int
+pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
+{
+ if (pv1->pv_pindex == pv2->pv_pindex)
+ return(0);
+ return((pv1->pv_pindex < pv2->pv_pindex) ? -1 : 1);
+}
+
+RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
+ pv_entry_compare, vm_pindex_t, pv_pindex);
+
/*
* Move the kernel virtual free pointer to the next
* 2MB. This is used to help improve performance
return pmap_pte(pmap, va);
}
-/* Return a non-clipped PD index for a given VA */
+/*
+ * Returns the pindex of a page table entry (representing a terminal page).
+ * There are NUPTE_TOTAL page table entries possible (a huge number)
+ *
+ * x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
+ * We want to properly translate negative KVAs.
+ *
+ * The pv pindex space is laid out as consecutive ranges: terminal PTEs
+ * start at 0, followed by the PTs (base NUPTE_TOTAL), the PDs (base
+ * NUPTE_TOTAL + NUPT_TOTAL), the PDPs (base NUPTE_TOTAL + NUPT_TOTAL +
+ * NUPD_TOTAL) and finally the single PML4; see the other
+ * pmap_*_pindex() helpers.
+ *
+ * The mask discards the sign-extended upper bits of va. This assumes
+ * NUPTE_TOTAL is a power of two -- TODO confirm against its definition.
+ */
static __inline
vm_pindex_t
-pmap_pde_pindex(vm_offset_t va)
+pmap_pte_pindex(vm_offset_t va)
{
- return va >> PDRSHIFT;
+ return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
}
-/* Return various clipped indexes for a given VA */
+/*
+ * Returns the pindex of a page table.
+ */
static __inline
vm_pindex_t
-pmap_pte_index(vm_offset_t va)
+pmap_pt_pindex(vm_offset_t va)
{
+ return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
+}
- return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
+/*
+ * Returns the pindex of a page directory.
+ */
+static __inline
+vm_pindex_t
+pmap_pd_pindex(vm_offset_t va)
+{
+ return (NUPTE_TOTAL + NUPT_TOTAL +
+ ((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
}
static __inline
vm_pindex_t
-pmap_pde_index(vm_offset_t va)
+pmap_pdp_pindex(vm_offset_t va)
{
+ return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
+ ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
+}
- return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
+static __inline
+vm_pindex_t
+pmap_pml4_pindex(void)
+{
+ return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
}
+/*
+ * Return various clipped indexes for a given VA
+ *
+ * Returns the index of a pte in a page table, representing a terminal
+ * page.
+ */
static __inline
vm_pindex_t
-pmap_pdpe_index(vm_offset_t va)
+pmap_pte_index(vm_offset_t va)
{
+ return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
+}
- return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
+/*
+ * Returns the index of a pt in a page directory, representing a page
+ * table.
+ */
+static __inline
+vm_pindex_t
+pmap_pt_index(vm_offset_t va)
+{
+ return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}
+/*
+ * Returns the index of a pd in a page directory page, representing a page
+ * directory.
+ */
static __inline
vm_pindex_t
-pmap_pml4e_index(vm_offset_t va)
+pmap_pd_index(vm_offset_t va)
{
+ return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
+}
+/*
+ * Returns the index of a pdp in the pml4 table, representing a page
+ * directory page.
+ */
+static __inline
+vm_pindex_t
+pmap_pdp_index(vm_offset_t va)
+{
return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}
-/* Return a pointer to the PML4 slot that corresponds to a VA */
+/*
+ * Generic procedure to index a pte from a pt, pd, or pdp.
+ *
+ * The physical page backing pv (pv->pv_m) is accessed through the
+ * direct map and treated as an array of pt_entry_t. The address of
+ * slot 'pindex' within that page is returned.
+ *
+ * pindex is the index of the slot inside the page (not a pv pindex)
+ * and is not range checked here. pv->pv_m is used unconditionally.
+ */
+static
+void *
+pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
+{
+ pt_entry_t *pte;
+
+ pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
+ return(&pte[pindex]);
+}
+
+/*
+ * Return pointer to PDP slot in the PML4
+ *
+ * Simply indexes pmap->pm_pml4[] with pmap_pdp_index(va). The slot
+ * contents are not examined, so the result is never NULL and the entry
+ * it points at may or may not have PG_V set.
+ */
static __inline
pml4_entry_t *
-pmap_pml4e(pmap_t pmap, vm_offset_t va)
+pmap_pdp(pmap_t pmap, vm_offset_t va)
{
-
- return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
+ return (&pmap->pm_pml4[pmap_pdp_index(va)]);
}
-/* Return a pointer to the PDP slot that corresponds to a VA */
+/*
+ * Return pointer to PD slot in the PDP given a pointer to the PDP
+ *
+ * 'pdp' points at the PML4 entry. Its PG_FRAME bits give the physical
+ * address of the next-level page, which is accessed via the direct map
+ * and indexed with pmap_pd_index(va).
+ *
+ * *pdp is not tested for PG_V here; pmap_pd() performs that test
+ * before calling this function.
+ */
static __inline
pdp_entry_t *
-pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
+pmap_pdp_to_pd(pml4_entry_t *pdp, vm_offset_t va)
{
- pdp_entry_t *pdpe;
+ pdp_entry_t *pd;
- pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
- return (&pdpe[pmap_pdpe_index(va)]);
+ pd = (pdp_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
+ return (&pd[pmap_pd_index(va)]);
}
-/* Return a pointer to the PDP slot that corresponds to a VA */
+/*
+ * Return pointer to PD slot in the PDP
+ *
+ * Returns NULL if the PML4 entry covering va does not have PG_V set.
+ * The entry the returned pointer refers to is not itself tested for
+ * validity; that is left to the caller.
+ */
static __inline
pdp_entry_t *
-pmap_pdpe(pmap_t pmap, vm_offset_t va)
+pmap_pd(pmap_t pmap, vm_offset_t va)
{
- pml4_entry_t *pml4e;
+ pml4_entry_t *pdp;
- pml4e = pmap_pml4e(pmap, va);
- if ((*pml4e & PG_V) == 0)
+ pdp = pmap_pdp(pmap, va);
+ if ((*pdp & PG_V) == 0)
return NULL;
- return (pmap_pml4e_to_pdpe(pml4e, va));
+ return (pmap_pdp_to_pd(pdp, va));
}
-/* Return a pointer to the PD slot that corresponds to a VA */
+/*
+ * Return pointer to PT slot in the PD given a pointer to the PD
+ *
+ * 'pd' points at the PDP entry. Its PG_FRAME bits give the physical
+ * address of the next-level page, which is accessed via the direct map
+ * and indexed with pmap_pt_index(va).
+ *
+ * *pd is not tested for PG_V here; pmap_pt() performs that test
+ * before calling this function.
+ */
static __inline
pd_entry_t *
-pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
+pmap_pd_to_pt(pdp_entry_t *pd, vm_offset_t va)
{
- pd_entry_t *pde;
+ pd_entry_t *pt;
- pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
- return (&pde[pmap_pde_index(va)]);
+ pt = (pd_entry_t *)PHYS_TO_DMAP(*pd & PG_FRAME);
+ return (&pt[pmap_pt_index(va)]);
}
-/* Return a pointer to the PD slot that corresponds to a VA */
+/*
+ * Return pointer to PT slot in the PD
+ *
+ * Returns NULL if pmap_pd() returns NULL (invalid PML4 entry) or if
+ * the PDP entry it returns does not have PG_V set. The entry the
+ * returned pointer refers to is not itself tested for validity.
+ */
static __inline
pd_entry_t *
-pmap_pde(pmap_t pmap, vm_offset_t va)
+pmap_pt(pmap_t pmap, vm_offset_t va)
{
- pdp_entry_t *pdpe;
+ pdp_entry_t *pd;
- pdpe = pmap_pdpe(pmap, va);
- if (pdpe == NULL || (*pdpe & PG_V) == 0)
+ pd = pmap_pd(pmap, va);
+ if (pd == NULL || (*pd & PG_V) == 0)
return NULL;
- return (pmap_pdpe_to_pde(pdpe, va));
+ return (pmap_pd_to_pt(pd, va));
}
-/* Return a pointer to the PT slot that corresponds to a VA */
+/*
+ * Return pointer to PTE slot in the PT given a pointer to the PT
+ *
+ * 'pt' points at the PD entry. Its PG_FRAME bits give the physical
+ * address of the page table page, which is accessed via the direct map
+ * and indexed with pmap_pte_index(va).
+ *
+ * Neither PG_V nor PG_PS is tested on *pt here; pmap_pte() performs
+ * both tests before calling this function.
+ */
static __inline
pt_entry_t *
-pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
+pmap_pt_to_pte(pd_entry_t *pt, vm_offset_t va)
{
pt_entry_t *pte;
- pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
+ pte = (pt_entry_t *)PHYS_TO_DMAP(*pt & PG_FRAME);
return (&pte[pmap_pte_index(va)]);
}
-/* Return a pointer to the PT slot that corresponds to a VA */
+/*
+ * Return pointer to PTE slot in the PT
+ *
+ * Returns NULL if pmap_pt() returns NULL or if the PD entry it returns
+ * does not have PG_V set.
+ *
+ * If the PD entry has PG_PS set, the pointer to the PD entry itself is
+ * returned (cast to pt_entry_t *) instead of descending another level.
+ * Otherwise the PTE slot for va is returned; that PTE is not tested
+ * for validity here.
+ */
static __inline
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
- pd_entry_t *pde;
+ pd_entry_t *pt;
- pde = pmap_pde(pmap, va);
- if (pde == NULL || (*pde & PG_V) == 0)
- return NULL;
- if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */
- return ((pt_entry_t *)pde);
- return (pmap_pde_to_pte(pde, va));
+ pt = pmap_pt(pmap, va);
+ if (pt == NULL || (*pt & PG_V) == 0)
+ return NULL;
+ if ((*pt & PG_PS) != 0) /* PG_PS: hand back the PD entry itself */
+ return ((pt_entry_t *)pt);
+ return (pmap_pt_to_pte(pt, va));
}
+/*
+ * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
+ * the PT layer. This will speed up core pmap operations considerably.
+ *
+ * If pindex identifies a PT, record pv as the pmap's pm_pvhint.
+ * PT pindexes occupy the half-open range
+ * [pmap_pt_pindex(0), pmap_pd_pindex(0)); pmap_pd_pindex(0) itself is
+ * the pindex of the first PD and must not be cached here, so the upper
+ * bound is exclusive.
+ */
static __inline
-pt_entry_t *
-vtopte(vm_offset_t va)
+void
+pv_cache(pv_entry_t pv, vm_pindex_t pindex)
{
- uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
- NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
-
- return (PTmap + ((va >> PAGE_SHIFT) & mask));
+ if (pindex >= pmap_pt_pindex(0) && pindex < pmap_pd_pindex(0))
+ pv->pv_pmap->pm_pvhint = pv;
}
+
+/*
+ * KVM - return address of PT slot in PD
+ */
static __inline
pd_entry_t *
-vtopde(vm_offset_t va)
+vtopt(vm_offset_t va)
{
uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
NPML4EPGSHIFT)) - 1);
return (PDmap + ((va >> PDRSHIFT) & mask));
}
+/*
+ * KVM - return address of PTE slot in PT
+ */
+static __inline
+pt_entry_t *
+vtopte(vm_offset_t va)
+{
+ uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
+ NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+
+ return (PTmap + ((va >> PAGE_SHIFT) & mask));
+}
+
static uint64_t
allocpages(vm_paddr_t *firstaddr, long n)
{
* The kernel's pmap is statically allocated so we don't have to use
* pmap_create, which is unlikely to work correctly at this part of
* the boot sequence (XXX and which no longer exists).
- *
- * The kernel_pmap's pm_pteobj is used only for locking and not
- * for mmu pages.
*/
kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
kernel_pmap.pm_count = 1;
kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
- kernel_pmap.pm_pteobj = &kernel_object;
- TAILQ_INIT(&kernel_pmap.pm_pvlist);
- TAILQ_INIT(&kernel_pmap.pm_pvlist_free);
- kernel_pmap.pm_hold = 0;
+ RB_INIT(&kernel_pmap.pm_pvroot);
spin_init(&kernel_pmap.pm_spin);
lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
int initial_pvs;
/*
- * object for kernel page table pages
- */
- /* JG I think the number can be arbitrary */
- kptobj = vm_object_allocate(OBJT_DEFAULT, 5);
-
- /*
* Allocate memory for random pmap data structures. Includes the
* pv_head_table.
*/
- for(i = 0; i < vm_page_array_size; i++) {
+ for (i = 0; i < vm_page_array_size; i++) {
vm_page_t m;
m = &vm_page_array[i];
TAILQ_INIT(&m->md.pv_list);
- m->md.pv_list_count = 0;
}
/*
*/
static __inline
int
-pmap_track_modified(vm_offset_t va)
+pmap_track_modified(vm_pindex_t pindex)
{
+ vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT;
if ((va < clean_sva) || (va >= clean_eva))
return 1;
else
/*
* Extract the physical page address associated with the map/VA pair.
+ * The page must be wired for this to work reliably.
*
- * The caller must hold pmap->pm_token if non-blocking operation is desired.
+ * XXX for the moment we're using pv_find() instead of pv_get(), as
+ * callers might be expecting non-blocking operation.
+ *
+ * Returns 0 if no valid translation for va is found in pmap.
+ *
+ * Kernel VAs (va >= VM_MAX_USER_ADDRESS) are resolved by walking the
+ * page tables directly, honoring PG_PS entries in the PD. User VAs
+ * are resolved through the PT's pv_entry, which is dropped again
+ * before returning.
*/
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
vm_paddr_t rtval;
- pt_entry_t *pte;
- pd_entry_t pde, *pdep;
+ pv_entry_t pt_pv;
+ pt_entry_t *ptep;
- lwkt_gettoken(&pmap->pm_token);
rtval = 0;
- pdep = pmap_pde(pmap, va);
- if (pdep != NULL) {
- pde = *pdep;
- if (pde) {
- if ((pde & PG_PS) != 0) {
- rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
+ if (va >= VM_MAX_USER_ADDRESS) {
+ /*
+ * Kernel page directories might be direct-mapped and
+ * there is typically no PV tracking of pte's
+ */
+ pd_entry_t *pt;
+
+ pt = pmap_pt(pmap, va);
+ if (pt && (*pt & PG_V)) {
+ if (*pt & PG_PS) {
+ rtval = *pt & PG_PS_FRAME;
+ rtval |= va & PDRMASK;
} else {
- pte = pmap_pde_to_pte(pdep, va);
- rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
+ ptep = pmap_pt_to_pte(pt, va);
+ /*
+ * Validate the pte itself; *pt was already
+ * checked above. An invalid pte must leave
+ * rtval at 0.
+ */
+ if (*ptep & PG_V) {
+ rtval = *ptep & PG_FRAME;
+ rtval |= va & PAGE_MASK;
+ }
+ }
+ }
+ } else {
+ /*
+ * User pages currently do not direct-map the page directory
+ * and some pages might not use managed PVs. But all PT's
+ * will have a PV.
+ */
+ pt_pv = pv_find(pmap, pmap_pt_pindex(va));
+ if (pt_pv) {
+ ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
+ if (*ptep & PG_V) {
+ rtval = *ptep & PG_FRAME;
+ rtval |= va & PAGE_MASK;
}
+ pv_drop(pt_pv);
}
}
- lwkt_reltoken(&pmap->pm_token);
return rtval;
}
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
- pd_entry_t pde;
+ pd_entry_t pt; /* pt entry in pd */
vm_paddr_t pa;
if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
pa = DMAP_TO_PHYS(va);
} else {
- pde = *vtopde(va);
- if (pde & PG_PS) {
- pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
+ pt = *vtopt(va);
+ if (pt & PG_PS) {
+ pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
} else {
/*
* Beware of a concurrent promotion that changes the
* because the page table page is preserved by the
* promotion.
*/
- pa = *pmap_pde_to_pte(&pde, va);
+ pa = *pmap_pt_to_pte(&pt, va);
pa = (pa & PG_FRAME) | (va & PAGE_MASK);
}
}
pt_entry_t npte;
pmap_inval_info info;
- pmap_inval_init(&info);
+ pmap_inval_init(&info); /* XXX remove */
npte = pa | PG_RW | PG_V | pgeflag;
pte = vtopte(va);
- pmap_inval_interlock(&info, &kernel_pmap, va);
+ pmap_inval_interlock(&info, &kernel_pmap, va); /* XXX remove */
*pte = npte;
- pmap_inval_deinterlock(&info, &kernel_pmap);
- pmap_inval_done(&info);
+ pmap_inval_deinterlock(&info, &kernel_pmap); /* XXX remove */
+ pmap_inval_done(&info); /* XXX remove */
}
/*
void
pmap_kmodify_rw(vm_offset_t va)
{
- *vtopte(va) |= PG_RW;
+ atomic_set_long(vtopte(va), PG_RW);
cpu_invlpg((void *)va);
}
void
pmap_kmodify_nc(vm_offset_t va)
{
- *vtopte(va) |= PG_N;
+ atomic_set_long(vtopte(va), PG_N);
cpu_invlpg((void *)va);
}
}
/*
- * This routine works like vm_page_lookup() but also blocks as long as the
- * page is busy. This routine does not busy the page it returns.
- *
- * The call should be made with the governing object held so the page's
- * object association remains valid on return.
- *
- * This function can block!
- */
-static
-vm_page_t
-pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
-{
- vm_page_t m;
-
- ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
- m = vm_page_lookup_busy_wait(object, pindex, FALSE, "pplookp");
-
- return(m);
-}
-
-/*
* Create a new thread and optionally associate it with a (new) process.
* NOTE! the new thread's cpu may not equal the current cpu.
*/
KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
}
-/***************************************************
- * Page table page management routines.....
- ***************************************************/
-
-/*
- * After removing a page table entry, this routine is used to
- * conditionally free the page, and manage the hold/wire counts.
- *
- * This routine reduces the wire_count on a page. If the wire_count
- * would drop to zero we remove the PT, PD, or PDP from its parent page
- * table. Under normal operation this only occurs with PT pages.
- *
- * mpte is never NULL for a user va, even for unmanaged pages. mpte should
- * always be NULL for a kernel va.
- */
-static __inline
-int
-pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
- pmap_inval_info_t info)
-{
- if (mpte == NULL)
- return 0;
- if (!vm_page_unwire_quick(mpte))
- return 0;
-
- /*
- * Wait until we can busy the page ourselves. We cannot have
- * any active flushes if we block. We own one hold count on the
- * page so it cannot be freed out from under us.
- */
- vm_page_busy_wait(mpte, FALSE, "pmuwpt");
- KASSERT(mpte->queue == PQ_NONE,
- ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", mpte));
-
- /*
- * New references can bump the wire_count while we were blocked,
- * try to unwire quickly again (e.g. 2->1).
- */
- if (vm_page_unwire_quick(mpte) == 0) {
- vm_page_wakeup(mpte);
- return 0;
- }
-
- /*
- * Unmap the page table page
- */
- KKASSERT(mpte->wire_count == 1);
- pmap_inval_interlock(info, pmap, -1);
-
- if (mpte->pindex >= (NUPDE + NUPDPE)) {
- /* PDP page */
- pml4_entry_t *pml4;
- pml4 = pmap_pml4e(pmap, va);
- KKASSERT(*pml4);
- *pml4 = 0;
- } else if (mpte->pindex >= NUPDE) {
- /* PD page */
- pdp_entry_t *pdp;
- pdp = pmap_pdpe(pmap, va);
- KKASSERT(*pdp);
- *pdp = 0;
- } else {
- /* PT page */
- pd_entry_t *pd;
- pd = pmap_pde(pmap, va);
- KKASSERT(*pd);
- *pd = 0;
- }
-
- KKASSERT(pmap->pm_stats.resident_count > 0);
- --pmap->pm_stats.resident_count;
-
- if (pmap->pm_ptphint == mpte)
- pmap->pm_ptphint = NULL;
- pmap_inval_deinterlock(info, pmap);
-
- if (mpte->pindex < NUPDE) {
- /* We just released a PT, unhold the matching PD */
- vm_page_t pdpg;
-
- pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
- pmap_unwire_pte_hold(pmap, va, pdpg, info);
- }
- if (mpte->pindex >= NUPDE && mpte->pindex < (NUPDE + NUPDPE)) {
- /* We just released a PD, unhold the matching PDP */
- vm_page_t pdppg;
-
- pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
- pmap_unwire_pte_hold(pmap, va, pdppg, info);
- }
-
- /*
- * This was our wiring.
- */
- KKASSERT(mpte->flags & PG_UNMANAGED);
- vm_page_unwire(mpte, 0);
- KKASSERT(mpte->wire_count == 0);
- vm_page_flag_clear(mpte, PG_MAPPED | PG_WRITEABLE);
- vm_page_flash(mpte);
- vm_page_free_zero(mpte);
-
- return 1;
-}
-
/*
* Initialize pmap0/vmspace0. This pmap is not added to pmap_list because
* it, and IdlePTD, represents the template used to update all other pmaps.
pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
pmap->pm_count = 1;
pmap->pm_active = 0;
- pmap->pm_ptphint = NULL;
- TAILQ_INIT(&pmap->pm_pvlist);
- TAILQ_INIT(&pmap->pm_pvlist_free);
- pmap->pm_hold = 0;
+ pmap->pm_pvhint = NULL;
+ RB_INIT(&pmap->pm_pvroot);
spin_init(&pmap->pm_spin);
lwkt_token_init(&pmap->pm_token, "pmap_tok");
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
void
pmap_pinit(struct pmap *pmap)
{
- vm_page_t pml4pg;
+ pv_entry_t pv;
+
+ /*
+ * Misc initialization
+ */
+ pmap->pm_count = 1;
+ pmap->pm_active = 0;
+ pmap->pm_pvhint = NULL;
+ if (pmap->pm_pmlpv == NULL) {
+ RB_INIT(&pmap->pm_pvroot);
+ bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+ spin_init(&pmap->pm_spin);
+ lwkt_token_init(&pmap->pm_token, "pmap_tok");
+ }
/*
* No need to allocate page table space yet but we do need a valid
}
/*
- * Allocate an object for the ptes
- */
- if (pmap->pm_pteobj == NULL) {
- pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT,
- NUPDE + NUPDPE + PML4PML4I + 1);
- }
-
- /*
- * Allocate the page directory page, unless we already have
- * one cached. If we used the cached page the wire_count will
- * already be set appropriately.
+ * Allocate the page directory page, which wires it even though
+ * it isn't being entered into some higher level page table (it
+ * being the highest level). If one is already cached we don't
+ * have to do anything.
*/
- if ((pml4pg = pmap->pm_pdirm) == NULL) {
- pml4pg = vm_page_grab(pmap->pm_pteobj,
- NUPDE + NUPDPE + PML4PML4I,
- VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
- pmap->pm_pdirm = pml4pg;
- vm_page_unmanage(pml4pg);
- vm_page_flag_clear(pml4pg, PG_MAPPED);
- pml4pg->valid = VM_PAGE_BITS_ALL;
- vm_page_wire(pml4pg);
- vm_page_wakeup(pml4pg);
+ if ((pv = pmap->pm_pmlpv) == NULL) {
+ pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
+ pmap->pm_pmlpv = pv;
pmap_kenter((vm_offset_t)pmap->pm_pml4,
- VM_PAGE_TO_PHYS(pml4pg));
+ VM_PAGE_TO_PHYS(pv->pv_m));
+ pv_put(pv);
+ pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
+ pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
+
+ /* install self-referential address mapping entry */
+ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
+ PG_V | PG_RW | PG_A | PG_M;
+ } else {
+ KKASSERT(pv->pv_m->flags & PG_MAPPED);
+ KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
}
- if ((pml4pg->flags & PG_ZERO) == 0)
- bzero(pmap->pm_pml4, PAGE_SIZE);
-#ifdef PMAP_DEBUG
- else
- pmap_page_assertzero(VM_PAGE_TO_PHYS(pml4pg));
-#endif
- vm_page_flag_clear(pml4pg, PG_ZERO);
-
- pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
- pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
-
- /* install self-referential address mapping entry */
- pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
- PG_V | PG_RW | PG_A | PG_M;
-
- pmap->pm_count = 1;
- pmap->pm_active = 0;
- pmap->pm_ptphint = NULL;
- TAILQ_INIT(&pmap->pm_pvlist);
- TAILQ_INIT(&pmap->pm_pvlist_free);
- pmap->pm_hold = 0;
- spin_init(&pmap->pm_spin);
- lwkt_token_init(&pmap->pm_token, "pmap_tok");
- bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
- pmap->pm_stats.resident_count = 1;
}
/*
void
pmap_puninit(pmap_t pmap)
{
+ pv_entry_t pv;
vm_page_t p;
KKASSERT(pmap->pm_active == 0);
- if ((p = pmap->pm_pdirm) != NULL) {
- KKASSERT(pmap->pm_pml4 != NULL);
- KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
+ if ((pv = pmap->pm_pmlpv) != NULL) {
+ if (pv_hold_try(pv) == 0) /* NOTE(review): presumably 0 means the lock was not obtained -- confirm */
+ pv_lock(pv);
+ p = pmap_remove_pv_page(pv, 1); /* second argument is holdpg; see vm_page_unhold() below */
+ pv_free(pv);
pmap_kremove((vm_offset_t)pmap->pm_pml4);
vm_page_busy_wait(p, FALSE, "pgpun");
- KKASSERT(p->flags & PG_UNMANAGED);
+ vm_page_unhold(p);
+ KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED));
vm_page_unwire(p, 0);
- vm_page_free_zero(p);
- pmap->pm_pdirm = NULL;
+ vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
+
+ /*
+ * XXX eventually clean out PML4 static entries and
+ * use vm_page_free_zero()
+ *
+ * Until then the page is released with plain vm_page_free()
+ * and the cached PML4 pv reference is cleared.
+ */
+ vm_page_free(p);
+ pmap->pm_pmlpv = NULL;
}
if (pmap->pm_pml4) {
KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
pmap->pm_pml4 = NULL;
}
- if (pmap->pm_pteobj) {
- vm_object_deallocate(pmap->pm_pteobj);
- pmap->pm_pteobj = NULL;
- }
+ KKASSERT(pmap->pm_stats.resident_count == 0); /* nothing may remain resident once the PML4 is gone */
+ KKASSERT(pmap->pm_stats.wired_count == 0);
}
/*
}
/*
- * Attempt to release and free a vm_page in a pmap. Returns 1 on success,
- * 0 on failure (if the procedure had to sleep).
+ * This routine is called when various levels in the page table need to
+ * be populated. This routine cannot fail.
*
- * When asked to remove the page directory page itself, we actually just
- * leave it cached so we do not have to incur the SMP inval overhead of
- * removing the kernel mapping. pmap_puninit() will take care of it.
+ * This function returns two locked pv_entry's, one representing the
+ * requested pv and one representing the requested pv's parent pv. If
+ * the pv did not previously exist it will be mapped into its parent
+ * and wired, otherwise no additional wire count will be added.
*/
static
-int
-pmap_release_free_page(struct pmap *pmap, vm_page_t p)
+pv_entry_t
+pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
{
+ pt_entry_t *ptep;
+ pv_entry_t pv;
+ pv_entry_t pvp;
+ vm_pindex_t pt_pindex;
+ vm_page_t m;
+ int isnew;
+
/*
- * This code optimizes the case of freeing non-busy
- * page-table pages. Those pages are zero now, and
- * might as well be placed directly into the zero queue.
+ * If the pv already exists and we aren't being asked for the
+ * parent page table page we can just return it. A locked+held pv
+ * is returned.
*/
- if (vm_page_busy_try(p, FALSE)) {
- vm_page_sleep_busy(p, FALSE, "pmaprl");
- return 0;
+ pv = pv_alloc(pmap, ptepindex, &isnew);
+ if (isnew == 0 && pvpp == NULL)
+ return(pv);
+
+ /*
+ * This is a new PV, we have to resolve its parent page table and
+ * add an additional wiring to the page if necessary.
+ */
+
+ /*
+ * Special case terminal PVs. These are not page table pages so
+ * no vm_page is allocated (the caller supplied the vm_page). If
+ * pvpp is non-NULL we are being asked to also return the pt_pv
+ * for this pv.
+ *
+ * Note that pt_pv's are only returned for user VAs. We assert that
+ * a pt_pv is not being requested for kernel VAs.
+ */
+ if (ptepindex < pmap_pt_pindex(0)) {
+ if (ptepindex >= NUPTE_USER)
+ KKASSERT(pvpp == NULL);
+ else
+ KKASSERT(pvpp != NULL);
+ if (pvpp) {
+ pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
+ pvp = pmap_allocpte(pmap, pt_pindex, NULL);
+ if (isnew)
+ vm_page_wire_quick(pvp->pv_m);
+ *pvpp = pvp;
+ } else {
+ pvp = NULL;
+ }
+ return(pv);
}
/*
- * Remove the page table page from the processes address space.
+ * Non-terminal PVs allocate a VM page to represent the page table,
+ * so we have to resolve pvp and calculate ptepindex for the pvp
+ * and then for the page table entry index in the pvp for
+ * fall-through.
*/
- if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
+ if (ptepindex < pmap_pd_pindex(0)) {
/*
- * We are the pml4 table itself.
+ * pv is PT, pvp is PD
*/
- /* XXX anything to do here? */
- } else if (p->pindex >= (NUPDE + NUPDPE)) {
+ ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
+ ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
+ pvp = pmap_allocpte(pmap, ptepindex, NULL);
+ if (!isnew)
+ goto notnew;
+
/*
- * Remove a PDP page from the PML4. We do not maintain
- * wire counts on the PML4 page.
+ * PT index in PD
*/
- pml4_entry_t *pml4;
- vm_page_t m4;
- int idx;
-
- m4 = vm_page_lookup(pmap->pm_pteobj,
- NUPDE + NUPDPE + PML4PML4I);
- KKASSERT(m4 != NULL);
- pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4));
- idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG;
- KKASSERT(pml4[idx] != 0);
- pml4[idx] = 0;
- } else if (p->pindex >= NUPDE) {
+ ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
+ ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
+ } else if (ptepindex < pmap_pdp_pindex(0)) {
/*
- * Remove a PD page from the PDP and drop the wire count
- * on the PDP. The PDP has a wire_count just from being
- * mapped so the wire_count should never drop to 0 here.
+ * pv is PD, pvp is PDP
*/
- vm_page_t m3;
- pdp_entry_t *pdp;
- int idx;
-
- m3 = vm_page_lookup(pmap->pm_pteobj,
- NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG);
- KKASSERT(m3 != NULL);
- pdp = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3));
- idx = (p->pindex - NUPDE) % NPDPEPG;
- KKASSERT(pdp[idx] != 0);
- pdp[idx] = 0;
- if (vm_page_unwire_quick(m3))
- panic("pmap_release_free_page: m3 wire_count 1->0");
- } else {
+ ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
+ ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
+ pvp = pmap_allocpte(pmap, ptepindex, NULL);
+ if (!isnew)
+ goto notnew;
+
+ /*
+ * PD index in PDP
+ */
+ ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
+ ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
+ } else if (ptepindex < pmap_pml4_pindex()) {
/*
- * Remove a PT page from the PD and drop the wire count
- * on the PD. The PD has a wire_count just from being
- * mapped so the wire_count should never drop to 0 here.
+ * pv is PDP, pvp is the root pml4 table
*/
- vm_page_t m2;
- pd_entry_t *pd;
- int idx;
+ pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
+ if (!isnew)
+ goto notnew;
- m2 = vm_page_lookup(pmap->pm_pteobj,
- NUPDE + p->pindex / NPDEPG);
- KKASSERT(m2 != NULL);
- pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2));
- idx = p->pindex % NPDEPG;
- pd[idx] = 0;
- if (vm_page_unwire_quick(m2))
- panic("pmap_release_free_page: m2 wire_count 1->0");
+ /*
+ * PDP index in PML4
+ */
+ ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
+ ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
+ } else {
+ /*
+ * pv represents the top-level PML4, there is no parent.
+ */
+ pvp = NULL;
+ if (!isnew)
+ goto notnew;
}
/*
- * p's wire_count should be transitioning from 1 to 0 here.
- */
- KKASSERT(p->wire_count == 1);
- KKASSERT(p->flags & PG_UNMANAGED);
- KKASSERT(pmap->pm_stats.resident_count > 0);
- vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
- --pmap->pm_stats.resident_count;
- if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
- pmap->pm_ptphint = NULL;
-
- /*
- * We leave the top-level page table page cached, wired, and mapped in
- * the pmap until the dtor function (pmap_puninit()) gets called.
- * However, still clean it up so we can set PG_ZERO.
+ * This code is only reached if isnew is TRUE and this is not a
+ * terminal PV. We need to allocate a vm_page for the page table
+ * at this level and enter it into the parent page table.
+ *
+ * page table pages are marked PG_WRITEABLE and PG_MAPPED.
*/
- if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
- bzero(pmap->pm_pml4, PAGE_SIZE);
- vm_page_flag_set(p, PG_ZERO);
- vm_page_wakeup(p);
- } else {
- vm_page_unwire(p, 0);
- KKASSERT(p->wire_count == 0);
- /* JG eventually revert to using vm_page_free_zero() */
- vm_page_free(p);
+ for (;;) {
+ m = vm_page_alloc(NULL, pv->pv_pindex,
+ VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
+ VM_ALLOC_INTERRUPT);
+ if (m)
+ break;
+ vm_wait(0);
}
- return 1;
-}
+ vm_page_spin_lock(m);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ pv->pv_m = m;
+ vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
+ vm_page_spin_unlock(m);
+ vm_page_unmanage(m); /* m must be spinunlocked */
+
+ if ((m->flags & PG_ZERO) == 0) {
+ pmap_zero_page(VM_PAGE_TO_PHYS(m));
+ }
+#ifdef PMAP_DEBUG
+ else {
+ pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
+ }
+#endif
+ m->valid = VM_PAGE_BITS_ALL;
+ vm_page_flag_clear(m, PG_ZERO);
+ vm_page_wire(m); /* wire for mapping in parent */
+
+ /*
+ * Wire the page into pvp, bump the wire-count for pvp's page table
+ * page. Bump the resident_count for the pmap. There is no pvp
+ * for the top level, address the pm_pml4[] array directly.
+ *
+ * If the caller wants the parent we return it, otherwise
+ * we just put it away.
+ *
+ * No interlock is needed for pte 0 -> non-zero.
+ */
+ if (pvp) {
+ vm_page_wire_quick(pvp->pv_m);
+ ptep = pv_pte_lookup(pvp, ptepindex);
+ KKASSERT((*ptep & PG_V) == 0);
+ *ptep = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
+ PG_A | PG_M);
+ }
+ vm_page_wakeup(m);
+notnew:
+ if (pvpp)
+ *pvpp = pvp;
+ else if (pvp)
+ pv_put(pvp);
+ return (pv);
+}
/*
- * This routine is called when various levels in the page table need to
- * be populated. This routine cannot fail.
+ * Release any resources held by the given physical map.
+ *
+ * Called when a pmap initialized by pmap_pinit is being released. Should
+ * only be called if the map contains no valid mappings.
+ *
+ * The pmap is unlinked from pmap_list and every pv in pm_pvroot is then
+ * handed to pmap_release_callback() via RB_SCAN. The scan is repeated
+ * for as long as the callback sets info.retry. On completion exactly one
+ * resident page (the pml4 page) and no wired pages may remain; both
+ * conditions are asserted.
*
- * We returned a page wired for the caller. If we had to map the page into
- * a parent page table it will receive an additional wire_count. For example,
- * an empty page table directory which is still mapped into its pdp will
- * retain a wire_count of 1.
+ * Caller must hold pmap->pm_token
*/
-static
-vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
+struct pmap_release_info {
+ pmap_t pmap; /* pmap being released */
+ int retry; /* set by the callback to request another scan */
+};
+
+static int pmap_release_callback(pv_entry_t pv, void *data);
+
+void
+pmap_release(struct pmap *pmap)
{
- vm_page_t m;
+ struct pmap_release_info info;
+
+ /*
+ * The old DIAGNOSTIC check of object->ref_count was dropped here:
+ * this function no longer has an 'object' variable, so the check
+ * could not compile when DIAGNOSTIC was defined.
+ */
+ KASSERT(pmap->pm_active == 0,
+ ("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
+
+ spin_lock(&pmap_spin);
+ TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
+ spin_unlock(&pmap_spin);
/*
- * Find or fabricate a new pagetable page. This will busy the page.
+ * Pull pv's off the RB tree in order from low to high and release
+ * each page. The callback runs with pm_spin held on entry and
+ * on return.
*/
- m = vm_page_grab(pmap->pm_pteobj, ptepindex,
- VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
+ info.pmap = pmap;
+ do {
+ info.retry = 0;
+ spin_lock(&pmap->pm_spin);
+ RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL,
+ pmap_release_callback, &info);
+ spin_unlock(&pmap->pm_spin);
+ } while (info.retry);
+
/*
- * The grab may have blocked and raced another thread populating
- * the same page table. m->valid will be 0 on a newly allocated page
- * so use this to determine if we have to zero it out or not. We
- * don't want to zero-out a raced page as this would desynchronize
- * the pv_entry's for the related pte's and cause pmap_remove_all()
- * to panic.
- *
- * Page table pages are unmanaged (do not use the normal PQ_s)
+ * One resident page (the pml4 page) should remain.
+ * No wired pages should remain.
*/
- if (m->valid == 0) {
- vm_page_unmanage(m);
- if ((m->flags & PG_ZERO) == 0) {
- pmap_zero_page(VM_PAGE_TO_PHYS(m));
- }
-#ifdef PMAP_DEBUG
- else {
- pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
+ KKASSERT(pmap->pm_stats.resident_count == 1);
+ KKASSERT(pmap->pm_stats.wired_count == 0);
+}
+/*
+ * RB_SCAN callback used by pmap_release() to tear down a single pv.
+ *
+ * Entered with pmap->pm_spin held (pmap_release() takes it around the
+ * scan). The spinlock is dropped while the pv is processed and is
+ * re-acquired before every return.
+ *
+ * Returns 0 after the pv has been freed. Returns -1 when the pv was found
+ * to belong to a different pmap after blocking on its lock (info->retry
+ * is set so pmap_release() rescans), or when the top-level pml4 pv is
+ * reached.
+ * NOTE(review): -1 presumably terminates the current RB_SCAN pass --
+ * confirm against the RB_SCAN implementation.
+ */
+static int
+pmap_release_callback(pv_entry_t pv, void *data)
+{
+ struct pmap_release_info *info = data;
+ pmap_t pmap = info->pmap;
+ vm_page_t p;
+
+ if (pv_hold_try(pv)) {
+ spin_unlock(&pmap->pm_spin);
+ } else {
+ spin_unlock(&pmap->pm_spin);
+ pv_lock(pv); /* may block; the pv can change identity meanwhile */
+ if (pv->pv_pmap != pmap) {
+ pv_put(pv);
+ spin_lock(&pmap->pm_spin);
+ info->retry = 1;
+ return(-1);
}
-#endif
- m->valid = VM_PAGE_BITS_ALL;
- vm_page_flag_clear(m, PG_ZERO);
}
-#ifdef PMAP_DEBUG
- else {
- KKASSERT((m->flags & PG_ZERO) == 0);
- }
-#endif
- KASSERT(m->queue == PQ_NONE,
- ("_pmap_allocpte: %p->queue != PQ_NONE", m));
+ /*
+ * The pmap is currently not spinlocked, pv is held+locked.
+ * Remove the pv's page from its parent's page table. The
+ * parent's page table page's wire_count will be decremented.
+ */
+ pmap_remove_pv_pte(pv, NULL, NULL);
/*
- * Increment the wire_count for the page we will be returning to
- * the caller.
+ * Terminal pvs are unhooked from their vm_pages. Because
+ * terminal pages aren't page table pages they aren't wired
+ * by us, so we have to be sure not to unwire them either.
*/
- vm_page_wire(m);
+ if (pv->pv_pindex < pmap_pt_pindex(0)) {
+ pmap_remove_pv_page(pv, 0);
+ goto skip;
+ }
/*
- * Map the pagetable page into the process address space, if
- * it isn't already there.
+ * We leave the top-level page table page cached, wired, and
+ * mapped in the pmap until the dtor function (pmap_puninit())
+ * gets called.
*
- * It is possible that someone else got in and mapped the page
- * directory page while we were blocked, if so just unbusy and
- * return the held page.
+ * Since we are leaving the top-level pv intact we need
+ * to break out of what would otherwise be an infinite loop.
*/
- if (ptepindex >= (NUPDE + NUPDPE)) {
- /*
- * Wire up a new PDP page in the PML4.
- *
- * (m) is busied so we cannot race another thread trying
- * to map the PDP entry in the PML4.
- */
- vm_pindex_t pml4index;
- pml4_entry_t *pml4;
-
- pml4index = ptepindex - (NUPDE + NUPDPE);
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
- *pml4 = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
- PG_A | PG_M);
- ++pmap->pm_stats.resident_count;
- vm_page_wire_quick(m); /* wire for mapping */
- }
- /* return (m) wired for the caller */
- } else if (ptepindex >= NUPDE) {
- /*
- * Wire up a new PD page in the PDP
- */
- vm_pindex_t pml4index;
- vm_pindex_t pdpindex;
- vm_page_t pdppg;
- pml4_entry_t *pml4;
- pdp_entry_t *pdp;
+ if (pv->pv_pindex == pmap_pml4_pindex()) {
+ pv_put(pv);
+ spin_lock(&pmap->pm_spin);
+ return(-1);
+ }
+
+ /*
+ * For page table pages (other than the top-level page),
+ * remove and free the vm_page. The representative mapping
+ * removed above by pmap_remove_pv_pte() did not undo the
+ * last wire_count so we have to do that as well.
+ */
+ p = pmap_remove_pv_page(pv, 1); /* holdpg=1: page comes back held */
+ vm_page_busy_wait(p, FALSE, "pmaprl");
+ vm_page_unhold(p); /* drop the hold taken by pmap_remove_pv_page() */
+ if (p->wire_count != 1) {
+ kprintf("p->wire_count was %016lx %d\n",
+ pv->pv_pindex, p->wire_count);
+ }
+ KKASSERT(p->wire_count == 1);
+ KKASSERT(p->flags & PG_UNMANAGED);
+
+ vm_page_unwire(p, 0);
+ KKASSERT(p->wire_count == 0);
+ /* JG eventually revert to using vm_page_free_zero() */
+ vm_page_free(p);
+skip:
+ pv_free(pv);
+ spin_lock(&pmap->pm_spin); /* restore the lock state RB_SCAN was entered with */
+ return(0);
+}
+
+/*
+ * This function will remove the pte associated with a pv from its parent.
+ * Terminal pv's are supported. The removal will be interlocked if info
+ * is non-NULL. The caller must dispose of pv instead of just unlocking
+ * it.
+ *
+ * The wire count will be dropped on the parent page table. The wire
+ * count on the page being removed (pv->pv_m) from the parent page table
+ * is NOT touched. Note that terminal pages will not have any additional
+ * wire counts while page table pages will have at least one representing
+ * the mapping, plus others representing sub-mappings.
+ *
+ * pvp is the locked parent pv, or NULL. When it is NULL and the level
+ * has a parent pv, the parent is looked up with pv_get() and released
+ * with pv_put() before returning. info is only used for terminal ptes;
+ * it is asserted to be NULL when a page table page is being removed.
+ *
+ * NOTE: Cannot be called on kernel page table pages, only KVM terminal
+ * pages and user page table and terminal pages.
+ *
+ * The pv must be locked.
+ *
+ * XXX must lock parent pv's if they exist to remove pte XXX
+ */
+static
+void
+pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
+{
+ vm_pindex_t ptepindex = pv->pv_pindex;
+ pmap_t pmap = pv->pv_pmap;
+ vm_page_t p;
+ int gotpvp = 0;
- pdpindex = ptepindex - NUPDE;
- pml4index = pdpindex >> NPML4EPGSHIFT;
+ KKASSERT(pmap);
+ if (ptepindex == pmap_pml4_pindex()) {
/*
- * Once mapped the PDP is not unmapped during normal operation
- * so we only need to handle races in the unmapped case.
- *
- * Mapping a PD into the PDP requires an additional wiring
- * of the PDP.
+ * We are the top level pml4 table, there is no parent.
*/
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
- pdppg = _pmap_allocpte(pmap,
- NUPDE + NUPDPE + pml4index);
- /* pdppg wired for the map and also wired for return */
- } else {
- pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
- vm_page_wire_quick(pdppg);
- }
- /* we have an extra ref on pdppg now for our use */
-
+ p = pmap->pm_pmlpv->pv_m;
+ } else if (ptepindex >= pmap_pdp_pindex(0)) {
/*
- * Now find the PD entry in the PDP and map it.
- *
- * (m) is busied so we cannot race another thread trying
- * to map the PD entry in the PDP.
- *
- * If the PD entry is already mapped we have to drop one
- * wire count on the pdppg that we had bumped above.
+ * Remove a PDP page from the pml4e. This can only occur
+ * with user page tables. We do not have to lock the
+ * pml4 PV so just ignore pvp.
+ *
+ * NOTE(review): the code below does look up and lock the
+ * pml4 pv when pvp is NULL, which disagrees with the
+ * sentence above -- confirm which is intended.
*/
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
-
- if ((*pdp & PG_V) == 0) {
- *pdp = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
- PG_A | PG_M);
- vm_page_wire_quick(m); /* wire for mapping */
- ++pmap->pm_stats.resident_count;
- /* eat extra pdppg wiring for mapping */
- } else {
- if (vm_page_unwire_quick(pdppg))
- panic("pmap_allocpte: unwire case 1");
+ vm_pindex_t pml4_pindex;
+ vm_pindex_t pdp_index;
+ pml4_entry_t *pdp;
+
+ pdp_index = ptepindex - pmap_pdp_pindex(0);
+ if (pvp == NULL) {
+ pml4_pindex = pmap_pml4_pindex();
+ pvp = pv_get(pv->pv_pmap, pml4_pindex);
+ gotpvp = 1;
}
- /* return (m) wired for the caller */
- } else {
+ pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
+ KKASSERT((*pdp & PG_V) != 0);
+ p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
+ *pdp = 0;
+ KKASSERT(info == NULL);
+ } else if (ptepindex >= pmap_pd_pindex(0)) {
/*
- * Wire up the new PT page in the PD
+ * Remove a PD page from the pdp
*/
- vm_pindex_t pml4index;
- vm_pindex_t pdpindex;
- pml4_entry_t *pml4;
- pdp_entry_t *pdp;
- pd_entry_t *pd;
- vm_page_t pdppg;
- vm_page_t pdpg;
+ vm_pindex_t pdp_pindex;
+ vm_pindex_t pd_index;
+ pdp_entry_t *pd;
- pdpindex = ptepindex >> NPDPEPGSHIFT;
- pml4index = pdpindex >> NPML4EPGSHIFT;
+ pd_index = ptepindex - pmap_pd_pindex(0);
+ if (pvp == NULL) {
+ pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
+ (pd_index >> NPML4EPGSHIFT);
+ pvp = pv_get(pv->pv_pmap, pdp_pindex);
+ gotpvp = 1;
+ }
+ pd = pv_pte_lookup(pvp, pd_index & ((1ul << NPDPEPGSHIFT) - 1));
+ KKASSERT((*pd & PG_V) != 0);
+ p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
+ *pd = 0;
+ KKASSERT(info == NULL);
+ } else if (ptepindex >= pmap_pt_pindex(0)) {
/*
- * Locate the PDP page in the PML4
- *
- * Once mapped the PDP is not unmapped during normal operation
- * so we only need to handle races in the unmapped case.
+ * Remove a PT page from the pd
*/
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
- pdppg = _pmap_allocpte(pmap, NUPDE + pdpindex);
- } else {
- pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
- vm_page_wire_quick(pdppg);
- }
- /* we have an extra ref on pdppg now for our use */
+ vm_pindex_t pd_pindex;
+ vm_pindex_t pt_index;
+ pd_entry_t *pt;
+ pt_index = ptepindex - pmap_pt_pindex(0);
+
+ if (pvp == NULL) {
+ pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
+ (pt_index >> NPDPEPGSHIFT);
+ pvp = pv_get(pv->pv_pmap, pd_pindex);
+ gotpvp = 1;
+ }
+ pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
+ KKASSERT((*pt & PG_V) != 0);
+ p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
+ *pt = 0;
+ KKASSERT(info == NULL);
+ } else {
/*
- * Locate the PD page in the PDP
- *
- * Once mapped the PDP is not unmapped during normal operation
- * so we only need to handle races in the unmapped case.
+ * Remove a PTE from the PT page
*
- * We can scrap the extra reference on pdppg not needed if
- * *pdp is already mapped and also not needed if it wasn't
- * because the _pmap_allocpte() picked up the case for us.
+ * NOTE: pv's must be locked bottom-up to avoid deadlocking.
+ * pv is a pte_pv so we can safely lock pt_pv.
*/
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+ vm_pindex_t pt_pindex;
+ pt_entry_t *ptep;
+ pt_entry_t pte;
+ vm_offset_t va;
- if ((*pdp & PG_V) == 0) {
- pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex);
+ pt_pindex = ptepindex >> NPTEPGSHIFT; /* NOTE(review): this value is never read; it is only reassigned below */
+ va = (vm_offset_t)ptepindex << PAGE_SHIFT;
+
+ if (ptepindex >= NUPTE_USER) {
+ ptep = vtopte(ptepindex << PAGE_SHIFT);
+ KKASSERT(pvp == NULL);
} else {
- pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
- vm_page_wire_quick(pdpg);
+ if (pvp == NULL) {
+ pt_pindex = NUPTE_TOTAL +
+ (ptepindex >> NPDPEPGSHIFT);
+ pvp = pv_get(pv->pv_pmap, pt_pindex);
+ gotpvp = 1;
+ }
+ ptep = pv_pte_lookup(pvp, ptepindex &
+ ((1ul << NPDPEPGSHIFT) - 1));
}
- vm_page_unwire_quick(pdppg);
- /* we have an extra ref on pdpg now for our use */
+
+ if (info)
+ pmap_inval_interlock(info, pmap, va);
+ pte = pte_load_clear(ptep);
+ if (info)
+ pmap_inval_deinterlock(info, pmap);
/*
- * Locate the PT page in the PD.
- *
- * (m) is busied so we cannot race another thread trying
- * to map the PT page in the PD.
+ * Now update the vm_page_t
*/
- pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
- pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
- if ((*pd & PG_V) == 0) {
- *pd = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
- PG_A | PG_M);
- ++pmap->pm_stats.resident_count;
- vm_page_wire_quick(m); /* wire for mapping */
- /* eat extra pdpg wiring for mapping */
- } else {
- if (vm_page_unwire_quick(pdpg))
- panic("pmap_allocpte: unwire case 2");
+ if ((pte & (PG_MANAGED|PG_V)) != (PG_MANAGED|PG_V)) {
+ kprintf("remove_pte badpte %016lx %016lx %d\n",
+ pte, pv->pv_pindex,
+ pv->pv_pindex < pmap_pt_pindex(0));
+ }
+ /*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/
+ p = PHYS_TO_VM_PAGE(pte & PG_FRAME);
+
+ if (pte & PG_M) {
+ if (pmap_track_modified(ptepindex))
+ vm_page_dirty(p);
+ }
+ if (pte & PG_A) {
+ vm_page_flag_set(p, PG_REFERENCED);
}
- /* return (m) wired for the caller */
+ if (pte & PG_W)
+ atomic_add_long(&pmap->pm_stats.wired_count, -1);
+ if (pte & PG_G)
+ cpu_invlpg((void *)va);
}
/*
- * We successfully loaded a PDP, PD, or PTE. Set the page table hint,
- * valid bits, mapped flag, unbusy, and we're done.
+ * Unwire the parent page table page. The wire_count cannot go below
+ * 1 here because the parent page table page is itself still mapped.
+ *
+ * XXX remove the assertions later.
*/
- pmap->pm_ptphint = m;
-
-#if 0
- m->valid = VM_PAGE_BITS_ALL;
- vm_page_flag_clear(m, PG_ZERO);
-#endif
- vm_page_flag_set(m, PG_MAPPED);
- vm_page_wakeup(m);
+ KKASSERT(pv->pv_m == p);
+ if (pvp && vm_page_unwire_quick(pvp->pv_m))
+ panic("pmap_remove_pv_pte: Insufficient wire_count");
- return (m);
+ if (gotpvp) /* only release a parent that was looked up here */
+ pv_put(pvp);
}
static
vm_page_t
-pmap_allocpte(pmap_t pmap, vm_offset_t va)
+pmap_remove_pv_page(pv_entry_t pv, int holdpg)
{
- vm_pindex_t ptepindex;
- pd_entry_t *pd;
vm_page_t m;
- ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
-
- /*
- * Calculate pagetable page index
- */
- ptepindex = pmap_pde_pindex(va);
-
- /*
- * Get the page directory entry
- */
- pd = pmap_pde(pmap, va);
-
- /*
- * This supports switching from a 2MB page to a
- * normal 4K page.
- */
- if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
- panic("no promotion/demotion yet");
- *pd = 0;
- pd = NULL;
- cpu_invltlb();
- smp_invltlb();
- }
-
- /*
- * If the page table page is mapped, we just increment the
- * wire count, and activate it.
- */
- if (pd != NULL && (*pd & PG_V) != 0) {
- m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
- pmap->pm_ptphint = m;
- vm_page_wire_quick(m);
- vm_page_wakeup(m);
- return m;
- }
+ m = pv->pv_m;
+ if (holdpg)
+ vm_page_hold(m);
+ KKASSERT(m);
+ vm_page_spin_lock(m);
+ pv->pv_m = NULL;
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
/*
- * Here if the pte page isn't mapped, or if it has been deallocated.
- */
- return _pmap_allocpte(pmap, ptepindex);
-}
-
-
-/***************************************************
- * Pmap allocation/deallocation routines.
- ***************************************************/
-
-/*
- * Release any resources held by the given physical map.
- * Called when a pmap initialized by pmap_pinit is being released.
- * Should only be called if the map contains no valid mappings.
- *
- * Caller must hold pmap->pm_token
- */
-static int pmap_release_callback(struct vm_page *p, void *data);
-
-static __inline
-void
-pmap_auto_yield(struct rb_vm_page_scan_info *info)
-{
- if (++info->desired >= pmap_yield_count) {
- info->desired = 0;
- lwkt_yield();
- }
-}
-
-void
-pmap_release(struct pmap *pmap)
-{
- vm_object_t object = pmap->pm_pteobj;
- struct rb_vm_page_scan_info info;
-
- KASSERT(pmap->pm_active == 0,
- ("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
-#if defined(DIAGNOSTIC)
- if (object->ref_count != 1)
- panic("pmap_release: pteobj reference count != 1");
-#endif
-
- info.pmap = pmap;
- info.object = object;
-
- spin_lock(&pmap_spin);
- TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
- spin_unlock(&pmap_spin);
-
- info.desired = 0;
- vm_object_hold(object);
- do {
- info.error = 0;
- info.mpte = NULL;
- info.limit = object->generation;
-
- vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
- pmap_release_callback, &info);
- if (info.error == 0 && info.mpte) {
- if (!pmap_release_free_page(pmap, info.mpte))
- info.error = 1;
- }
- } while (info.error);
- vm_object_drop(object);
-
- while (pmap->pm_hold)
- tsleep(pmap, 0, "pmapx", 1);
-}
-
-static
-int
-pmap_release_callback(struct vm_page *p, void *data)
-{
- struct rb_vm_page_scan_info *info = data;
-
- if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
- info->mpte = p;
- return(0);
- }
- if (!pmap_release_free_page(info->pmap, p)) {
- info->error = 1;
- pmap_auto_yield(info);
- return(-1);
- }
- if (info->object->generation != info->limit) {
- info->error = 1;
- pmap_auto_yield(info);
- return(-1);
- }
- return(0);
+ if (m->object)
+ atomic_add_int(&m->object->agg_pv_list_count, -1);
+ */
+ if (TAILQ_EMPTY(&m->md.pv_list))
+ vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+ vm_page_spin_unlock(m);
+ if (holdpg)
+ return(m);
+ return(NULL);
}
/*
vm_paddr_t paddr;
vm_offset_t ptppaddr;
vm_page_t nkpg;
- pd_entry_t *pde, newpdir;
- pdp_entry_t newpdp;
+ pd_entry_t *pt, newpt;
+ pdp_entry_t newpd;
int update_kernel_vm_end;
- vm_object_hold(kptobj);
-
/*
* bootstrap kernel_vm_end on first real VM use
*/
if (kernel_vm_end == 0) {
kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
nkpt = 0;
- while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
+ while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
~(PAGE_SIZE * NPTEPG - 1);
nkpt++;
kend = kernel_map.max_offset;
while (kstart < kend) {
- pde = pmap_pde(&kernel_pmap, kstart);
- if (pde == NULL) {
+ pt = pmap_pt(&kernel_pmap, kstart);
+ if (pt == NULL) {
/* We need a new PDP entry */
- nkpg = vm_page_alloc(kptobj, nkpt,
+ nkpg = vm_page_alloc(NULL, nkpt,
VM_ALLOC_NORMAL |
VM_ALLOC_SYSTEM |
VM_ALLOC_INTERRUPT);
if ((nkpg->flags & PG_ZERO) == 0)
pmap_zero_page(paddr);
vm_page_flag_clear(nkpg, PG_ZERO);
- newpdp = (pdp_entry_t)
+ newpd = (pdp_entry_t)
(paddr | PG_V | PG_RW | PG_A | PG_M);
- *pmap_pdpe(&kernel_pmap, kstart) = newpdp;
+ *pmap_pd(&kernel_pmap, kstart) = newpd;
nkpt++;
continue; /* try again */
}
- if ((*pde & PG_V) != 0) {
+ if ((*pt & PG_V) != 0) {
kstart = (kstart + PAGE_SIZE * NPTEPG) &
~(PAGE_SIZE * NPTEPG - 1);
if (kstart - 1 >= kernel_map.max_offset) {
/*
* This index is bogus, but out of the way
*/
- nkpg = vm_page_alloc(kptobj, nkpt,
+ nkpg = vm_page_alloc(NULL, nkpt,
VM_ALLOC_NORMAL |
VM_ALLOC_SYSTEM |
VM_ALLOC_INTERRUPT);
ptppaddr = VM_PAGE_TO_PHYS(nkpg);
pmap_zero_page(ptppaddr);
vm_page_flag_clear(nkpg, PG_ZERO);
- newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
- *pmap_pde(&kernel_pmap, kstart) = newpdir;
+ newpt = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
+ *pmap_pt(&kernel_pmap, kstart) = newpt;
nkpt++;
kstart = (kstart + PAGE_SIZE * NPTEPG) &
*/
if (update_kernel_vm_end && kernel_vm_end < kstart)
kernel_vm_end = kstart;
-
- vm_object_drop(kptobj);
}
/*
}
/***************************************************
-* page management routines.
+ * page management routines.
***************************************************/
/*
- * free the pv_entry back to the free list. This function may be
- * called from an interrupt.
+ * Hold a pv without locking it
+ *
+ * Increments pv->pv_hold by one with a compare-and-set loop. The
+ * 0 -> 1 transition is tried first as a fast path; otherwise the current
+ * value is re-read and the update retried until it succeeds.
*/
-static __inline
-void
-free_pv_entry(pv_entry_t pv)
+static void
+pv_hold(pv_entry_t pv)
{
- atomic_add_int(&pv_entry_count, -1);
- KKASSERT(pv_entry_count >= 0);
- zfree(pvzone, pv);
+ u_int count;
+
+ if (atomic_cmpset_int(&pv->pv_hold, 0, 1))
+ return;
+
+ for (;;) {
+ count = pv->pv_hold;
+ cpu_ccfence();
+ if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
+ return;
+ /* retry */
+ }
}
/*
- * get a new pv_entry, allocating a block from the system
- * when needed. This function may be called from an interrupt.
+ * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv
+ * was successfully locked, FALSE if it wasn't. The caller must dispose of
+ * the pv properly.
+ *
+ * The hold count is incremented in both cases; only the TRUE return also
+ * sets PV_HOLD_LOCKED. This function never blocks. Under PMAP_DEBUG the
+ * caller's function name and line are recorded in the pv when the lock
+ * is obtained.
+ *
+ * Either the pmap->pm_spin or the related vm_page_spin (if traversing a
+ * pv list via its page) must be held by the caller.
*/
-static
-pv_entry_t
-get_pv_entry(void)
+static int
+_pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL)
{
- atomic_add_int(&pv_entry_count, 1);
- if (pv_entry_high_water &&
- (pv_entry_count > pv_entry_high_water) &&
- (pmap_pagedaemon_waken == 0)) {
- pmap_pagedaemon_waken = 1;
- wakeup(&vm_pages_needed);
+ u_int count;
+
+ if (atomic_cmpset_int(&pv->pv_hold, 0, PV_HOLD_LOCKED | 1)) {
+#ifdef PMAP_DEBUG
+ pv->pv_func = func;
+ pv->pv_line = lineno;
+#endif
+ return TRUE;
+ }
+
+ for (;;) {
+ count = pv->pv_hold;
+ cpu_ccfence();
+ if ((count & PV_HOLD_LOCKED) == 0) {
+ if (atomic_cmpset_int(&pv->pv_hold, count,
+ (count + 1) | PV_HOLD_LOCKED)) {
+#ifdef PMAP_DEBUG
+ pv->pv_func = func;
+ pv->pv_line = lineno;
+#endif
+ return TRUE;
+ }
+ } else {
+ if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
+ return FALSE;
+ }
+ /* retry */
}
- return zalloc(pvzone);
}
/*
- * This routine is very drastic, but can save the system
- * in a pinch.
+ * Drop a previously held pv_entry which could not be locked, allowing its
+ * destruction.
+ *
+ * Decrements pv->pv_hold by one. The pv must not be locked by the caller
+ * when this is its only hold; the loop asserts that the value is not
+ * (PV_HOLD_LOCKED | 1).
+ *
+ * Must not be called with a spinlock held as we might zfree() the pv if it
+ * is no longer associated with a pmap and this was the last hold count.
*/
-void
-pmap_collect(void)
+static void
+pv_drop(pv_entry_t pv)
{
- int i;
- vm_page_t m;
- static int warningdone=0;
+ u_int count;
- if (pmap_pagedaemon_waken == 0)
+ if (atomic_cmpset_int(&pv->pv_hold, 1, 0)) {
+ if (pv->pv_pmap == NULL)
+ zfree(pvzone, pv);
return;
- pmap_pagedaemon_waken = 0;
- if (warningdone < 5) {
- kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
- warningdone++;
}
- for (i = 0; i < vm_page_array_size; i++) {
- m = &vm_page_array[i];
- if (m->wire_count || m->hold_count)
- continue;
- if (vm_page_busy_try(m, TRUE) == 0) {
- if (m->wire_count == 0 && m->hold_count == 0) {
- pmap_remove_all(m);
- }
- vm_page_wakeup(m);
+ for (;;) {
+ count = pv->pv_hold;
+ cpu_ccfence();
+ KKASSERT((count & PV_HOLD_MASK) > 0);
+ KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) !=
+ (PV_HOLD_LOCKED | 1));
+ if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) {
+ if (count == 1 && pv->pv_pmap == NULL)
+ zfree(pvzone, pv);
+ return;
}
+ /* retry */
}
}
-
/*
- * If it is the first entry on the list, it is actually in the header and
- * we must copy the following entry up to the header.
- *
- * Otherwise we must search the list for the entry. In either case we
- * free the now unused entry.
- *
- * Caller must hold pmap->pm_token
+ * Find or allocate the requested PV entry, returning a locked pv
+ *
+ * *isnew is set to 1 when a new pv was allocated and inserted into
+ * pmap->pm_pvroot (resident_count is bumped), and to 0 when an existing
+ * pv for (pmap, pindex) was found and locked.
+ *
+ * zalloc()/zfree() are only called with pm_spin released, so the lookup
+ * is repeated after each of them. May block in _pv_lock() when the
+ * existing pv is locked by someone else; the pv's identity is re-checked
+ * afterwards and the lookup restarted if it changed.
*/
static
-int
-pmap_remove_entry(struct pmap *pmap, vm_page_t m,
- vm_offset_t va, pmap_inval_info_t info)
+pv_entry_t
+_pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
{
pv_entry_t pv;
- int rtval;
+ pv_entry_t pnew = NULL;
- spin_lock(&pmap_spin);
- if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
- if (pmap == pv->pv_pmap && va == pv->pv_va)
- break;
+ spin_lock(&pmap->pm_spin);
+ for (;;) {
+ if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) {
+ pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
+ pindex);
}
- } else {
- TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
- if (va == pv->pv_va)
- break;
- }
- }
+ if (pv == NULL) {
+ if (pnew == NULL) {
+ spin_unlock(&pmap->pm_spin);
+ pnew = zalloc(pvzone);
+ spin_lock(&pmap->pm_spin);
+ continue;
+ }
+ pnew->pv_pmap = pmap;
+ pnew->pv_pindex = pindex;
+ pnew->pv_hold = PV_HOLD_LOCKED | 1;
+#ifdef PMAP_DEBUG
+ pnew->pv_func = func;
+ pnew->pv_line = lineno;
+#endif
+ pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew);
+ atomic_add_long(&pmap->pm_stats.resident_count, 1);
+ spin_unlock(&pmap->pm_spin);
+ *isnew = 1;
+ return(pnew);
+ }
+ if (pnew) { /* an entry appeared while pm_spin was released; discard ours */
+ spin_unlock(&pmap->pm_spin);
+ zfree(pvzone, pnew);
+ pnew = NULL;
+ spin_lock(&pmap->pm_spin);
+ continue;
+ }
+ if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
+ spin_unlock(&pmap->pm_spin);
+ *isnew = 0;
+ return(pv);
+ }
+ spin_unlock(&pmap->pm_spin);
+ _pv_lock(pv PMAP_DEBUG_COPY);
+ if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) {
+ *isnew = 0;
+ return(pv);
+ }
+ pv_put(pv);
+ spin_lock(&pmap->pm_spin);
+ }
- rtval = 0;
- KKASSERT(pv);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- m->md.pv_generation++;
- m->md.pv_list_count--;
- vm_page_spin_lock(m);
- if (m->object)
- atomic_add_int(&m->object->agg_pv_list_count, -1);
- vm_page_spin_unlock(m);
- KKASSERT(m->md.pv_list_count >= 0);
- if (TAILQ_EMPTY(&m->md.pv_list))
- vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
- TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
- ++pmap->pm_generation;
- spin_unlock(&pmap_spin);
+}
- rtval = pmap_unwire_pte_hold(pmap, va, pv->pv_ptem, info);
- free_pv_entry(pv);
+/*
+ * Find the requested PV entry, returning a locked+held pv or NULL
+ *
+ * NULL is returned when no pv exists for (pmap, pindex). If the pv is
+ * already locked by someone else this blocks in _pv_lock(), then
+ * re-checks that the pv still belongs to (pmap, pindex) and restarts the
+ * lookup if it does not.
+ */
+static
+pv_entry_t
+_pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL)
+{
+ pv_entry_t pv;
- return rtval;
+ spin_lock(&pmap->pm_spin);
+ for (;;) {
+ /*
+ * Shortcut cache
+ */
+ if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) {
+ pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
+ pindex);
+ }
+ if (pv == NULL) {
+ spin_unlock(&pmap->pm_spin);
+ return NULL;
+ }
+ if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
+ pv_cache(pv, pindex);
+ spin_unlock(&pmap->pm_spin);
+ return(pv);
+ }
+ spin_unlock(&pmap->pm_spin);
+ _pv_lock(pv PMAP_DEBUG_COPY); /* hold was taken by the failed try above */
+ if (pv->pv_pmap == pmap && pv->pv_pindex == pindex)
+ return(pv);
+ pv_put(pv);
+ spin_lock(&pmap->pm_spin);
+ }
}
/*
- * Create a pv entry for page at pa for (pmap, va).
+ * Lookup, hold, and attempt to lock (pmap,pindex).
+ *
+ * If the entry does not exist NULL is returned and *errorp is set to 0
*
- * Caller must hold pmap token
+ * If the entry exists and could be successfully locked it is returned and
+ * *errorp is set to 0.
+ *
+ * If the entry exists but could NOT be successfully locked it is returned
+ * held and *errorp is set to 1.
+ *
+ * This function does not block waiting for the pv lock.
*/
static
-void
-pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
+pv_entry_t
+pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp)
{
pv_entry_t pv;
- pv = get_pv_entry();
- pv->pv_va = va;
- pv->pv_pmap = pmap;
- pv->pv_ptem = mpte;
-
- spin_lock(&pmap_spin);
- TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
- m->md.pv_generation++;
- m->md.pv_list_count++;
- vm_page_spin_lock(m);
- if (m->object)
- atomic_add_int(&m->object->agg_pv_list_count, 1);
- vm_page_spin_unlock(m);
- pmap->pm_generation++;
- spin_unlock(&pmap_spin);
+ spin_lock(&pmap->pm_spin);
+ if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
+ pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
+ if (pv == NULL) {
+ spin_unlock(&pmap->pm_spin);
+ *errorp = 0;
+ return NULL;
+ }
+ if (pv_hold_try(pv)) {
+ pv_cache(pv, pindex);
+ spin_unlock(&pmap->pm_spin);
+ *errorp = 0;
+ return(pv); /* lock succeeded */
+ }
+ spin_unlock(&pmap->pm_spin);
+ *errorp = 1;
+ return (pv); /* lock failed */
}
/*
- * pmap_remove_pte: do the things to unmap a page in a process
- *
- * Caller must hold pmap token
- * Caller must hold pmap object
+ * Find the requested PV entry, returning a held pv or NULL
+ *
+ * The pv is looked up under pmap->pm_spin (pm_pvhint first, then the RB
+ * tree) and is only held via pv_hold(); no attempt is made to lock it.
*/
static
-int
-pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
- pmap_inval_info_t info)
+pv_entry_t
+pv_find(pmap_t pmap, vm_pindex_t pindex)
{
- pt_entry_t oldpte;
- vm_page_t m;
- vm_page_t mpte;
- vm_pindex_t ptepindex;
+ pv_entry_t pv;
- ASSERT_LWKT_TOKEN_HELD(&pmap->pm_token);
+ spin_lock(&pmap->pm_spin);
- pmap_inval_interlock(info, pmap, va);
- oldpte = pte_load_clear(ptq);
- pmap_inval_deinterlock(info, pmap);
- if (oldpte & PG_W)
- pmap->pm_stats.wired_count -= 1;
- /*
- * Machines that don't support invlpg, also don't support
- * PG_G. XXX PG_G is disabled for SMP so don't worry about
- * the SMP case.
- */
- if (oldpte & PG_G)
- cpu_invlpg((void *)va);
- KKASSERT(pmap->pm_stats.resident_count > 0);
- --pmap->pm_stats.resident_count;
- if (oldpte & PG_MANAGED) {
- m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
- if (oldpte & PG_M) {
-#if defined(PMAP_DIAGNOSTIC)
- if (pmap_nw_modified((pt_entry_t) oldpte)) {
- kprintf(
- "pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
- va, oldpte);
+ if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
+ pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
+ if (pv == NULL) {
+ spin_unlock(&pmap->pm_spin);
+ return NULL;
+ }
+ pv_hold(pv);
+ pv_cache(pv, pindex);
+ spin_unlock(&pmap->pm_spin);
+ return(pv);
+}
+
+/*
+ * Lock a held pv, keeping the hold count
+ *
+ * Sets PV_HOLD_LOCKED in pv->pv_hold once the flag is observed clear.
+ * While another owner has the pv locked, PV_HOLD_WAITING is set and the
+ * caller sleeps on the pv with a timeout of hz ticks, then retries.
+ * pv_unlock() issues the matching wakeup(). May block.
+ *
+ * Under PMAP_DEBUG the current owner's recorded function and line are
+ * printed before sleeping, and the caller's are recorded on success.
+ */
+static
+void
+_pv_lock(pv_entry_t pv PMAP_DEBUG_DECL)
+{
+ u_int count;
+
+ for (;;) {
+ count = pv->pv_hold;
+ cpu_ccfence();
+ if ((count & PV_HOLD_LOCKED) == 0) {
+ if (atomic_cmpset_int(&pv->pv_hold, count,
+ count | PV_HOLD_LOCKED)) {
+#ifdef PMAP_DEBUG
+ pv->pv_func = func;
+ pv->pv_line = lineno;
+#endif
+ return;
}
+ continue;
+ }
+ tsleep_interlock(pv, 0); /* interlock first so a wakeup() before tsleep() is not lost */
+ if (atomic_cmpset_int(&pv->pv_hold, count,
+ count | PV_HOLD_WAITING)) {
+#ifdef PMAP_DEBUG
+ kprintf("pv waiting on %s:%d\n",
+ pv->pv_func, pv->pv_line);
#endif
- if (pmap_track_modified(va))
- vm_page_dirty(m);
+ tsleep(pv, PINTERLOCKED, "pvwait", hz);
}
- if (oldpte & PG_A)
- vm_page_flag_set(m, PG_REFERENCED);
- return pmap_remove_entry(pmap, m, va, info);
+ /* retry */
}
+}
- /*
- * Unmanaged pages in userspace still wire the PT page, we have
- * to look up the mpte for the PDE page and pass it in.
- */
- if (va < VM_MAX_USER_ADDRESS) {
- ptepindex = pmap_pde_pindex(va);
- mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex);
- KKASSERT(mpte);
- } else {
- mpte = NULL;
+/*
+ * Unlock a held and locked pv, keeping the hold count.
+ *
+ * Clears PV_HOLD_LOCKED and PV_HOLD_WAITING in pv->pv_hold. If
+ * PV_HOLD_WAITING was set, threads sleeping on the pv are woken up.
+ * The (PV_HOLD_LOCKED | 1) -> 1 transition is tried first as a fast
+ * path.
+ */
+static
+void
+pv_unlock(pv_entry_t pv)
+{
+ u_int count;
+
+ if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 1, 1))
+ return;
+
+ for (;;) {
+ count = pv->pv_hold;
+ cpu_ccfence();
+ KKASSERT((count & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >=
+ (PV_HOLD_LOCKED | 1));
+ if (atomic_cmpset_int(&pv->pv_hold, count,
+ count &
+ ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) {
+ if (count & PV_HOLD_WAITING)
+ wakeup(pv);
+ break;
+ }
}
- return pmap_unwire_pte_hold(pmap, va, mpte, info);
}
/*
- * Remove a single page from a process address space.
- *
- * This function may not be called from an interrupt if the pmap is
- * not kernel_pmap.
+ * Unlock and drop a pv. If the pv is no longer associated with a pmap
+ * and the hold count drops to zero we will free it.
+ *
+ * The common case of a locked pv with a single hold is handled with one
+ * compare-and-set; otherwise this falls back to pv_unlock() followed by
+ * pv_drop().
*
- * Caller must hold pmap->pm_token
- * Caller must hold pmap object
+ * Caller should not hold any spin locks. We are protected from hold races
+ * by virtue of holds only occurring with a pmap_spin or vm_page_spin
+ * lock held. A pv cannot be located otherwise.
*/
static
void
-pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
+pv_put(pv_entry_t pv)
{
- pt_entry_t *pte;
+ if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 1, 0)) {
+ if (pv->pv_pmap == NULL)
+ zfree(pvzone, pv);
+ return;
+ }
+ pv_unlock(pv);
+ pv_drop(pv);
+}
- ASSERT_LWKT_TOKEN_HELD(&pmap->pm_token);
+/*
+ * Unlock, drop, and free a pv, destroying it. The pv is removed from its
+ * pmap. Any pte operations must have already been completed.
+ *
+ * The pv must already be detached from its vm_page (pv_m == NULL is
+ * asserted). Under pmap->pm_spin the pv is removed from pm_pvroot, the
+ * pm_pvhint is cleared if it points at this pv, and resident_count is
+ * decremented. pv_pmap is set to NULL so that the final pv_put() (or a
+ * later pv_drop() by another holder) returns the pv to pvzone.
+ */
+static
+void
+pv_free(pv_entry_t pv)
+{
+ pmap_t pmap;
- pte = pmap_pte(pmap, va);
- if (pte == NULL)
- return;
- if ((*pte & PG_V) == 0)
+ KKASSERT(pv->pv_m == NULL);
+ if ((pmap = pv->pv_pmap) != NULL) {
+ spin_lock(&pmap->pm_spin);
+ pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
+ if (pmap->pm_pvhint == pv)
+ pmap->pm_pvhint = NULL;
+ atomic_add_long(&pmap->pm_stats.resident_count, -1);
+ pv->pv_pmap = NULL;
+ pv->pv_pindex = 0;
+ spin_unlock(&pmap->pm_spin);
+ }
+ pv_put(pv);
+}
+
+/*
+ * This routine is very drastic, but can save the system
+ * in a pinch.
+ *
+ * Does nothing unless pmap_pagedaemon_waken is set; the flag is cleared
+ * on entry. A warning is printed for the first five invocations. Every
+ * page in vm_page_array that is neither wired nor held has all of its
+ * mappings removed with pmap_remove_all().
+ */
+void
+pmap_collect(void)
+{
+ int i;
+ vm_page_t m;
+ static int warningdone=0;
+
+ if (pmap_pagedaemon_waken == 0)
return;
- pmap_remove_pte(pmap, pte, va, info);
+ pmap_pagedaemon_waken = 0;
+ if (warningdone < 5) {
+ kprintf("pmap_collect: collecting pv entries -- "
+ "suggest increasing PMAP_SHPGPERPROC\n");
+ warningdone++;
+ }
+
+ for (i = 0; i < vm_page_array_size; i++) {
+ m = &vm_page_array[i];
+ if (m->wire_count || m->hold_count)
+ continue;
+ if (vm_page_busy_try(m, TRUE) == 0) { /* presumably 0 means the page was busied -- confirm */
+ if (m->wire_count == 0 && m->hold_count == 0) { /* re-check now that the page is busied */
+ pmap_remove_all(m);
+ }
+ vm_page_wakeup(m);
+ }
+ }
}
/*
- * Remove the given range of addresses from the specified map.
+ * Scan the pmap for active page table entries and issue a callback.
+ * The callback must dispose of pte_pv.
*
- * It is assumed that the start and end are properly rounded to the page size.
+ * NOTE: Unmanaged page table entries will not have a pte_pv
*
- * This function may not be called from an interrupt if the pmap is not
- * kernel_pmap.
+ * NOTE: Kernel page table entries will not have a pt_pv. That is, wiring
+ * counts are not tracked in kernel page table pages.
+ *
+ * It is assumed that the start and end are properly rounded to the page size.
+ *
+ * The callback is invoked as func(pmap, info, pte_pv, pt_pv, va, ptep, arg)
+ * for every non-zero pte found in [sva, eva).  pte_pv is NULL for unmanaged
+ * entries and pt_pv is NULL for kernel addresses.  pt_pv stays owned by the
+ * scan, which releases it itself; arg is passed through untouched.
+ *
+ * The pmap token is held and a pmap_inval_info is active for the duration
+ * of the scan.
*/
-void
-pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
-{
+static void
+pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva,
+ void (*func)(pmap_t, struct pmap_inval_info *,
+ pv_entry_t, pv_entry_t, vm_offset_t,
+ pt_entry_t *, void *),
+ void *arg)
+{
+ pv_entry_t pdp_pv; /* A page directory page PV */
+ pv_entry_t pd_pv; /* A page directory PV */
+ pv_entry_t pt_pv; /* A page table PV */
+ pv_entry_t pte_pv; /* A page table entry PV */
+ pt_entry_t *ptep;
vm_offset_t va_next;
- pml4_entry_t *pml4e;
- pdp_entry_t *pdpe;
- pd_entry_t ptpaddr, *pde;
- pt_entry_t *pte;
struct pmap_inval_info info;
+ int error;
if (pmap == NULL)
return;
- vm_object_hold(pmap->pm_pteobj);
+ /*
+ * Hold the token for stability; if the pmap is empty we have nothing
+ * to do.
+ */
lwkt_gettoken(&pmap->pm_token);
+#if 0
if (pmap->pm_stats.resident_count == 0) {
lwkt_reltoken(&pmap->pm_token);
- vm_object_drop(pmap->pm_pteobj);
return;
}
+#endif
pmap_inval_init(&info);
/*
- * special handling of removing one page. a very
- * common operation and easy to short circuit some
- * code.
+ * Special handling for removing one page, which is a very common
+ * operation (it is?).
+ * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4
*/
if (sva + PAGE_SIZE == eva) {
- pde = pmap_pde(pmap, sva);
- if (pde && (*pde & PG_PS) == 0) {
- pmap_remove_page(pmap, sva, &info);
- pmap_inval_done(&info);
- lwkt_reltoken(&pmap->pm_token);
- vm_object_drop(pmap->pm_pteobj);
- return;
+ if (sva >= VM_MAX_USER_ADDRESS) {
+ /*
+ * Kernel mappings do not track wire counts on
+ * page table pages.
+ */
+ pt_pv = NULL;
+ pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
+ ptep = vtopte(sva);
+ } else {
+ /*
+ * User mappings may or may not have a pte_pv but
+ * will always have a pt_pv if the page is present.
+ */
+ pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
+ pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
+ if (pt_pv == NULL) {
+ KKASSERT(pte_pv == NULL);
+ goto fast_skip;
+ }
+ ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
}
+ if (*ptep == 0) {
+ KKASSERT(pte_pv == NULL);
+ } else if (pte_pv) {
+ KKASSERT((*ptep & (PG_MANAGED|PG_V)) ==
+ (PG_MANAGED|PG_V));
+ func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
+ } else {
+ KKASSERT((*ptep & (PG_MANAGED|PG_V)) ==
+ PG_V);
+ func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
+ }
+ if (pt_pv)
+ pv_put(pt_pv);
+fast_skip:
+ pmap_inval_done(&info);
+ lwkt_reltoken(&pmap->pm_token);
+ return;
}
+ /*
+ * NOTE: kernel mappings do not track page table pages, only
+ * terminal pages.
+ *
+ * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4.
+ * However, for the scan to be efficient we try to
+ * cache items top-down.
+ */
+ pdp_pv = NULL;
+ pd_pv = NULL;
+ pt_pv = NULL;
+
for (; sva < eva; sva = va_next) {
- pml4e = pmap_pml4e(pmap, sva);
- if ((*pml4e & PG_V) == 0) {
+ lwkt_yield();
+ if (sva >= VM_MAX_USER_ADDRESS) {
+ if (pt_pv) {
+ pv_put(pt_pv);
+ pt_pv = NULL;
+ }
+ goto kernel_skip;
+ }
+
+ /*
+ * PDP cache
+ */
+ if (pdp_pv == NULL) {
+ pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva));
+ } else if (pdp_pv->pv_pindex != pmap_pdp_pindex(sva)) {
+ pv_put(pdp_pv);
+ pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva));
+ }
+ if (pdp_pv == NULL) {
va_next = (sva + NBPML4) & ~PML4MASK;
if (va_next < sva)
va_next = eva;
continue;
}
- pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
- if ((*pdpe & PG_V) == 0) {
+ /*
+ * PD cache.  When no page directory exists for sva nothing
+ * beneath it can be mapped, so skip to the next PDP boundary
+ * rather than probing each page table slot in turn.
+ */
+ if (pd_pv == NULL) {
+ if (pdp_pv) {
+ pv_put(pdp_pv);
+ pdp_pv = NULL;
+ }
+ pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
+ } else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) {
+ pv_put(pd_pv);
+ if (pdp_pv) {
+ pv_put(pdp_pv);
+ pdp_pv = NULL;
+ }
+ pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
+ }
+ if (pd_pv == NULL) {
va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
/*
- * Calculate index for next page table.
+ * PT cache
*/
- va_next = (sva + NBPDR) & ~PDRMASK;
- if (va_next < sva)
- va_next = eva;
-
- pde = pmap_pdpe_to_pde(pdpe, sva);
- ptpaddr = *pde;
+ if (pt_pv == NULL) {
+ if (pdp_pv) {
+ pv_put(pdp_pv);
+ pdp_pv = NULL;
+ }
+ if (pd_pv) {
+ pv_put(pd_pv);
+ pd_pv = NULL;
+ }
+ pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
+ } else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) {
+ if (pdp_pv) {
+ pv_put(pdp_pv);
+ pdp_pv = NULL;
+ }
+ if (pd_pv) {
+ pv_put(pd_pv);
+ pd_pv = NULL;
+ }
+ pv_put(pt_pv);
+ pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
+ }
/*
- * Weed out invalid mappings.
+ * We will scan or skip a page table page so adjust va_next
+ * either way.
*/
- if (ptpaddr == 0)
+ if (pt_pv == NULL) {
+ va_next = (sva + NBPDR) & ~PDRMASK;
+ if (va_next < sva)
+ va_next = eva;
continue;
+ }
/*
- * Check for large page.
+ * From this point in the loop testing pt_pv for non-NULL
+ * means we are in UVM, else if it is NULL we are in KVM.
*/
- if ((ptpaddr & PG_PS) != 0) {
- /* JG FreeBSD has more complex treatment here */
- pmap_inval_interlock(&info, pmap, -1);
- *pde = 0;
- pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
- pmap_inval_deinterlock(&info, pmap);
- continue;
- }
+kernel_skip:
+ va_next = (sva + NBPDR) & ~PDRMASK;
+ if (va_next < sva)
+ va_next = eva;
/*
* Limit our scan to either the end of the va represented
* by the current page table page, or to the end of the
* range being removed.
+ *
+ * Scan the page table for pages. Some pages may not be
+ * managed (might not have a pv_entry).
+ *
+ * There is no page table management for kernel pages so
+ * pt_pv will be NULL in that case, but otherwise pt_pv
+ * is non-NULL, locked, and referenced.
*/
if (va_next > eva)
va_next = eva;
- /*
- * NOTE: pmap_remove_pte() can block.
- */
- for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
- sva += PAGE_SIZE) {
- if (*pte == 0)
+ if (pt_pv)
+ ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
+ else
+ ptep = vtopte(sva);
+
+ while (sva < va_next) {
+ if (*ptep == 0) {
+ /* XXX remove me */
+ pte_pv = pv_find(pmap, pmap_pte_pindex(sva));
+ KKASSERT(pte_pv == NULL);
+ sva += PAGE_SIZE;
+ ++ptep;
continue;
- if (pmap_remove_pte(pmap, pte, sva, &info))
- break;
+ }
+
+ /*
+ * We need a locked pte_pv as well and may have to
+ * loop to retry if we can't get it non-blocking
+ * while pt_pv is held locked.
+ *
+ * This is a bit complicated because once we release
+ * the pt_pv our ptep is no longer valid, so we have
+ * to cycle the whole thing.
+ */
+ if (pt_pv) {
+ pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva),
+ &error);
+ if (error) {
+ kprintf("x");
+ if (pdp_pv) {
+ pv_put(pdp_pv);
+ pdp_pv = NULL;
+ }
+ if (pd_pv) {
+ pv_put(pd_pv);
+ pd_pv = NULL;
+ }
+ pv_put(pt_pv); /* must be non-NULL */
+ pt_pv = NULL;
+ pv_lock(pte_pv); /* safe to block now */
+ pv_put(pte_pv);
+ pte_pv = NULL;
+ pt_pv = pv_get(pmap,
+ pmap_pt_pindex(sva));
+ continue;
+ }
+ } else {
+ pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
+ }
+
+ /*
+ * Ready for the callback
+ */
+ if (pte_pv) {
+ KKASSERT((*ptep & (PG_MANAGED|PG_V)) ==
+ (PG_MANAGED|PG_V));
+ func(pmap, &info, pte_pv, pt_pv, sva,
+ ptep, arg);
+ } else {
+ KKASSERT((*ptep & (PG_MANAGED|PG_V)) ==
+ PG_V);
+ func(pmap, &info, pte_pv, pt_pv, sva,
+ ptep, arg);
+ }
+ pte_pv = NULL; /* eaten by callback */
+ sva += PAGE_SIZE;
+ ++ptep;
}
}
+ if (pdp_pv) {
+ pv_put(pdp_pv);
+ pdp_pv = NULL;
+ }
+ if (pd_pv) {
+ pv_put(pd_pv);
+ pd_pv = NULL;
+ }
+ if (pt_pv) {
+ pv_put(pt_pv);
+ pt_pv = NULL;
+ }
pmap_inval_done(&info);
lwkt_reltoken(&pmap->pm_token);
- vm_object_drop(pmap->pm_pteobj);
+}
+
+/*
+ * Remove the mappings in [sva, eva) from the pmap.  All of the work is
+ * done by pmap_scan(), which calls pmap_remove_callback() for each
+ * populated pte; no callback argument is needed so NULL is passed.
+ */
+void
+pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ pmap_scan(pmap, sva, eva, pmap_remove_callback, NULL);
+}
+
+/*
+ * pmap_scan() callback used by pmap_remove().  Tears down the single
+ * mapping at va.  pte_pv is non-NULL for a managed mapping and is
+ * consumed (freed) here; pt_pv may be NULL and is never released here.
+ * arg is unused.
+ */
+static void
+pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
+ pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
+ pt_entry_t *ptep, void *arg __unused)
+{
+ pt_entry_t pte;
+
+ if (pte_pv) {
+ /*
+ * This will also drop pt_pv's wire_count. Note that
+ * terminal pages are not wired based on mmu presence.
+ *
+ * The pv is detached from the pte, then from its page,
+ * and finally freed.
+ */
+ pmap_remove_pv_pte(pte_pv, pt_pv, info);
+ pmap_remove_pv_page(pte_pv, 0);
+ pv_free(pte_pv);
+ } else {
+ /*
+ * pt_pv's wire_count is still bumped by unmanaged pages
+ * so we must decrement it manually.
+ *
+ * The pte is cleared under the invalidation interlock and
+ * the pmap statistics are adjusted from the value that was
+ * loaded (wired_count only if PG_W was set).
+ */
+ pmap_inval_interlock(info, pmap, va);
+ pte = pte_load_clear(ptep);
+ pmap_inval_deinterlock(info, pmap);
+ if (pte & PG_W)
+ atomic_add_long(&pmap->pm_stats.wired_count, -1);
+ atomic_add_long(&pmap->pm_stats.resident_count, -1);
+ if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m))
+ panic("pmap_remove: insufficient wirecount");
+ }
}
/*
pmap_remove_all(vm_page_t m)
{
struct pmap_inval_info info;
- pt_entry_t *pte, tpte;
pv_entry_t pv;
- struct pmap *pmap;
if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
return;
pmap_inval_init(&info);
- spin_lock(&pmap_spin);
+ vm_page_spin_lock(m);
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
- /*
- * We have to be holding the pmap token to interlock
- * the pte destruction and pv removal. XXX need hold on
- * pmap.
- */
- pmap = pv->pv_pmap;
- spin_unlock(&pmap_spin);
- lwkt_gettoken(&pmap->pm_token); /* XXX hold race */
- spin_lock(&pmap_spin);
- if (pv != TAILQ_FIRST(&m->md.pv_list)) {
- spin_unlock(&pmap_spin);
- lwkt_reltoken(&pmap->pm_token);
- spin_lock(&pmap_spin);
- continue;
+ KKASSERT(pv->pv_m == m);
+ if (pv_hold_try(pv)) {
+ vm_page_spin_unlock(m);
+ } else {
+ vm_page_spin_unlock(m);
+ pv_lock(pv);
+ if (pv->pv_m != m) {
+ pv_put(pv);
+ vm_page_spin_lock(m);
+ continue;
+ }
}
-
/*
- * Remove the pv
+ * Holding no spinlocks, pv is locked.
*/
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
- m->md.pv_generation++;
- m->md.pv_list_count--;
+ pmap_remove_pv_pte(pv, NULL, &info);
+ pmap_remove_pv_page(pv, 0);
+ pv_free(pv);
vm_page_spin_lock(m);
- if (m->object)
- atomic_add_int(&m->object->agg_pv_list_count, -1);
- vm_page_spin_unlock(m);
- KKASSERT(m->md.pv_list_count >= 0);
- ++pv->pv_pmap->pm_generation;
- spin_unlock(&pmap_spin);
-
- /*
- * pv is now isolated
- */
- KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
- --pv->pv_pmap->pm_stats.resident_count;
-
- pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
- pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
- tpte = pte_load_clear(pte);
- KKASSERT(tpte & PG_MANAGED);
- if (tpte & PG_W)
- pv->pv_pmap->pm_stats.wired_count--;
- pmap_inval_deinterlock(&info, pv->pv_pmap);
- if (tpte & PG_A)
- vm_page_flag_set(m, PG_REFERENCED);
-
- /*
- * Update the vm_page_t clean and reference bits.
- */
- if (tpte & PG_M) {
-#if defined(PMAP_DIAGNOSTIC)
- if (pmap_nw_modified(tpte)) {
- kprintf("pmap_remove_all: modified page not "
- "writable: va: 0x%lx, pte: 0x%lx\n",
- pv->pv_va, tpte);
- }
-#endif
- if (pmap_track_modified(pv->pv_va))
- vm_page_dirty(m); /* XXX races(m) */
- }
-
- spin_lock(&pmap_spin);
- if (TAILQ_EMPTY(&m->md.pv_list))
- vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
- spin_unlock(&pmap_spin);
-
- pmap_unwire_pte_hold(pv->pv_pmap, pv->pv_va,
- pv->pv_ptem, &info);
- lwkt_reltoken(&pv->pv_pmap->pm_token);
-
- free_pv_entry(pv);
- spin_lock(&pmap_spin);
}
- spin_unlock(&pmap_spin);
+ vm_page_spin_unlock(m);
KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
pmap_inval_done(&info);
}
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
- vm_offset_t va_next;
- pml4_entry_t *pml4e;
- pdp_entry_t *pdpe;
- pd_entry_t ptpaddr, *pde;
- pt_entry_t *pte;
- pmap_inval_info info;
-
/* JG review for NX */
if (pmap == NULL)
return;
-
if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
pmap_remove(pmap, sva, eva);
return;
}
-
if (prot & VM_PROT_WRITE)
return;
+ /*
+ * Read access without write access was requested: walk the range
+ * with pmap_scan() and let pmap_protect_callback() downgrade each
+ * populated pte.  A pointer to prot is handed through as the
+ * callback argument.
+ */
+ pmap_scan(pmap, sva, eva, pmap_protect_callback, &prot);
+}
- lwkt_gettoken(&pmap->pm_token);
- pmap_inval_init(&info);
-
- for (; sva < eva; sva = va_next) {
- pml4e = pmap_pml4e(pmap, sva);
- if ((*pml4e & PG_V) == 0) {
- va_next = (sva + NBPML4) & ~PML4MASK;
- if (va_next < sva)
- va_next = eva;
- continue;
- }
-
- pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
- if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
- if (va_next < sva)
- va_next = eva;
- continue;
- }
-
- va_next = (sva + NBPDR) & ~PDRMASK;
- if (va_next < sva)
- va_next = eva;
-
- pde = pmap_pdpe_to_pde(pdpe, sva);
- ptpaddr = *pde;
-
- /*
- * Check for large page.
- */
- if ((ptpaddr & PG_PS) != 0) {
- pmap_inval_interlock(&info, pmap, -1);
- *pde &= ~(PG_M|PG_RW);
- pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
- pmap_inval_deinterlock(&info, pmap);
- continue;
- }
-
- /*
- * Weed out invalid mappings. Note: we assume that the page
- * directory table is always allocated, and in kernel virtual.
- */
- if (ptpaddr == 0)
- continue;
-
- if (va_next > eva)
- va_next = eva;
-
- for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
- sva += PAGE_SIZE) {
- pt_entry_t pbits;
- pt_entry_t cbits;
- vm_page_t m;
+static
+void
+pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
+ pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
+ pt_entry_t *ptep, void *arg __unused)
+{
+ pt_entry_t pbits;
+ pt_entry_t cbits;
+ vm_page_t m;
- /*
- * XXX non-optimal.
- */
- pmap_inval_interlock(&info, pmap, sva);
+ /*
+ * XXX non-optimal.
+ *
+ * pmap_scan() callback used by pmap_protect(): clears PG_RW in
+ * the pte at va.  For a managed mapping (pte_pv != NULL) PG_A is
+ * folded into PG_REFERENCED on the page and, where
+ * pmap_track_modified() allows it, PG_M is folded into
+ * vm_page_dirty(); both bits are then cleared in the pte.  The
+ * new value is installed with atomic_cmpset_long() and the whole
+ * computation is retried if the pte changed underneath us.
+ * pte_pv, when present, is released with pv_put() before
+ * returning.  pt_pv and arg are not used.
+ */
+ pmap_inval_interlock(info, pmap, va);
again:
- pbits = *pte;
- cbits = pbits;
- if ((pbits & PG_V) == 0) {
- pmap_inval_deinterlock(&info, pmap);
- continue;
- }
- if (pbits & PG_MANAGED) {
- m = NULL;
- if (pbits & PG_A) {
+ pbits = *ptep;
+ cbits = pbits;
+ if (pte_pv) {
+ m = NULL;
+ if (pbits & PG_A) {
+ m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
+ KKASSERT(m == pte_pv->pv_m);
+ vm_page_flag_set(m, PG_REFERENCED);
+ cbits &= ~PG_A;
+ }
+ if (pbits & PG_M) {
+ if (pmap_track_modified(pte_pv->pv_pindex)) {
+ if (m == NULL)
m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
- vm_page_flag_set(m, PG_REFERENCED);
- cbits &= ~PG_A;
- }
- if (pbits & PG_M) {
- if (pmap_track_modified(sva)) {
- if (m == NULL)
- m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
- vm_page_dirty(m);
- cbits &= ~PG_M;
- }
- }
- }
- cbits &= ~PG_RW;
- if (pbits != cbits &&
- !atomic_cmpset_long(pte, pbits, cbits)) {
- goto again;
+ vm_page_dirty(m);
+ cbits &= ~PG_M;
}
- pmap_inval_deinterlock(&info, pmap);
}
}
- pmap_inval_done(&info);
- lwkt_reltoken(&pmap->pm_token);
+ cbits &= ~PG_RW;
+ if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
+ goto again;
+ }
+ pmap_inval_deinterlock(info, pmap);
+ if (pte_pv)
+ pv_put(pte_pv);
}
/*
- * Insert the given physical page (p) at
- * the specified virtual address (v) in the
- * target physical map with the protection requested.
- *
- * If specified, the page will be wired down, meaning
- * that the related pte can not be reclaimed.
+ * Insert the vm_page (m) at the virtual address (va), replacing any prior
+ * mapping at that address. Set protection and wiring as requested.
*
- * NB: This is the only routine which MAY NOT lazy-evaluate
- * or lose information. That is, this routine must actually
- * insert this page into the given map NOW.
+ * NOTE: This routine MUST insert the page into the pmap now, it cannot
+ * lazy-evaluate.
+ *
+ * va is truncated to a page boundary.  A NULL pmap is ignored.  When
+ * VM_PROT_NOSYNC is set in prot the pmap_inval_info is neither
+ * initialized nor used; removal of a previous mapping then bypasses the
+ * invalidation interlock.
*/
void
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
boolean_t wired)
-{
- vm_paddr_t pa;
- pd_entry_t *pde;
- pt_entry_t *pte;
+{
+ pmap_inval_info info;
+ pv_entry_t pt_pv; /* page table */
+ pv_entry_t pte_pv; /* page table entry */
+ pt_entry_t *ptep;
vm_paddr_t opa;
pt_entry_t origpte, newpte;
- vm_page_t mpte;
- pmap_inval_info info;
+ vm_paddr_t pa;
if (pmap == NULL)
return;
-
va = trunc_page(va);
#ifdef PMAP_DIAGNOSTIC
if (va >= KvaEnd)
panic("pmap_enter: toobig");
if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
- panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
+ panic("pmap_enter: invalid to pmap_enter page table "
+ "pages (va: 0x%lx)", va);
#endif
if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
- kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n");
+ kprintf("Warning: pmap_enter called on UVA with "
+ "kernel_pmap\n");
#ifdef DDB
db_print_backtrace();
#endif
}
if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
- kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n");
+ kprintf("Warning: pmap_enter called on KVA without "
+ "kernel_pmap\n");
#ifdef DDB
db_print_backtrace();
#endif
}
- vm_object_hold(pmap->pm_pteobj);
- lwkt_gettoken(&pmap->pm_token);
-
/*
- * In the case that a page table page is not
- * resident, we are creating it here.
- */
- if (va < VM_MAX_USER_ADDRESS)
- mpte = pmap_allocpte(pmap, va);
- else
- mpte = NULL;
+ * Get locked PV entries for our new page table entry (pte_pv)
+ * and for its parent page table (pt_pv). We need the parent
+ * so we can resolve the location of the ptep.
+ *
+ * Only hardware MMU actions can modify the ptep out from
+ * under us.
+ *
+ * if (m) is fictitious or unmanaged we do not create a managing
+ * pte_pv for it. Any pre-existing page's management state must
+ * match (avoiding code complexity).
+ *
+ * If the pmap is still being initialized we assume existing
+ * page tables.
+ *
+ * Kernel mappings do not track page table pages (i.e. pt_pv).
+ * XXX the original note that followed here ("pmap_allocpte()
+ * checks the") was left unfinished by its author.
+ */
+ if (pmap_initialized == FALSE) {
+ pte_pv = NULL;
+ pt_pv = NULL;
+ ptep = vtopte(va);
+ } else if (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) {
+ pte_pv = NULL;
+ if (va >= VM_MAX_USER_ADDRESS) {
+ pt_pv = NULL;
+ ptep = vtopte(va);
+ } else {
+ pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
+ ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
+ }
+ KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED) == 0);
+ } else {
+ if (va >= VM_MAX_USER_ADDRESS) {
+ pt_pv = NULL;
+ pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL);
+ ptep = vtopte(va);
+ } else {
+ pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va),
+ &pt_pv);
+ ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
+ }
+ KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED));
+ }
if ((prot & VM_PROT_NOSYNC) == 0)
pmap_inval_init(&info);
- pde = pmap_pde(pmap, va);
- if (pde != NULL && (*pde & PG_V) != 0) {
- if ((*pde & PG_PS) != 0)
- panic("pmap_enter: attempted pmap_enter on 2MB page");
- pte = pmap_pde_to_pte(pde, va);
- } else {
- panic("pmap_enter: invalid page directory va=%#lx", va);
- }
- KKASSERT(pte != NULL);
pa = VM_PAGE_TO_PHYS(m);
- origpte = *pte;
+ origpte = *ptep;
opa = origpte & PG_FRAME;
/*
* are valid mappings in them. Hence, if a user page is wired,
* the PT page will be also.
*/
+ KKASSERT(pte_pv == NULL || m == pte_pv->pv_m);
if (wired && ((origpte & PG_W) == 0))
- pmap->pm_stats.wired_count++;
+ atomic_add_long(&pmap->pm_stats.wired_count, 1);
else if (!wired && (origpte & PG_W))
- pmap->pm_stats.wired_count--;
+ atomic_add_long(&pmap->pm_stats.wired_count, -1);
#if defined(PMAP_DIAGNOSTIC)
if (pmap_nw_modified(origpte)) {
- kprintf(
- "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
- va, origpte);
+ kprintf("pmap_enter: modified page not writable: "
+ "va: 0x%lx, pte: 0x%lx\n", va, origpte);
}
#endif
/*
- * Remove the extra pte reference. Note that we cannot
- * optimize the RO->RW case because we have adjusted the
- * wiring count above and may need to adjust the wiring
- * bits below.
- */
- if (mpte)
- vm_page_unwire_quick(mpte);
-
- /*
* We might be turning off write access to the page,
* so we go ahead and sense modify status.
*/
- if (origpte & PG_MANAGED) {
- if ((origpte & PG_M) && pmap_track_modified(va)) {
+ if (pte_pv) {
+ if ((origpte & PG_M) &&
+ pmap_track_modified(pte_pv->pv_pindex)) {
vm_page_t om;
- om = PHYS_TO_VM_PAGE(opa);
+ om = pte_pv->pv_m;
+ KKASSERT(PHYS_TO_VM_PAGE(opa) == om);
vm_page_dirty(om);
}
pa |= PG_MANAGED;
- KKASSERT(m->flags & PG_MAPPED);
}
goto validate;
}
+
/*
* Mapping has changed, invalidate old range and fall through to
* handle validating new mapping.
- */
- while (opa) {
- int err;
- err = pmap_remove_pte(pmap, pte, va, &info);
- if (err)
- panic("pmap_enter: pte vanished, va: 0x%lx", va);
- origpte = *pte;
- opa = origpte & PG_FRAME;
- if (opa) {
- kprintf("pmap_enter: Warning, raced pmap %p va %p\n",
- pmap, (void *)va);
+ *
+ * We always interlock pte removals.
+ */
+ if (opa) {
+ if (pte_pv) {
+ /*
+ * XXX pmap_remove_pv_pte() unwires pt_pv.  Managed
+ * kernel mappings have no pt_pv (it was set to NULL
+ * above), so only compensate when one is present.
+ */
+ if (pt_pv)
+ vm_page_wire_quick(pt_pv->pv_m);
+ if (prot & VM_PROT_NOSYNC)
+ pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
+ else
+ pmap_remove_pv_pte(pte_pv, pt_pv, &info);
+ if (pte_pv->pv_m)
+ pmap_remove_pv_page(pte_pv, 0);
+ } else if (prot & VM_PROT_NOSYNC) {
+ *ptep = 0;
+ cpu_invlpg((void *)va);
+ atomic_add_long(&pmap->pm_stats.resident_count, -1);
+ } else {
+ pmap_inval_interlock(&info, pmap, va);
+ *ptep = 0;
+ pmap_inval_deinterlock(&info, pmap);
+ atomic_add_long(&pmap->pm_stats.resident_count, -1);
}
+ KKASSERT(*ptep == 0);
}
/*
- * Enter on the PV list if part of our managed memory. Note that we
- * raise IPL while manipulating pv_table since pmap_enter can be
- * called at interrupt time.
- *
- * The new mapping covers mpte's new wiring count so we don't
- * unwire it.
+ * Enter on the PV list if part of our managed memory. Wiring is
+ * handled automatically.
*/
- if (pmap_initialized &&
- (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
- pmap_insert_entry(pmap, va, mpte, m);
- pa |= PG_MANAGED;
+ if (pte_pv) {
+ KKASSERT(pte_pv->pv_m == NULL);
+ vm_page_spin_lock(m);
+ pte_pv->pv_m = m;
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
+ /*
+ if (m->object)
+ atomic_add_int(&m->object->agg_pv_list_count, 1);
+ */
vm_page_flag_set(m, PG_MAPPED);
+ vm_page_spin_unlock(m);
+ pa |= PG_MANAGED;
}
/*
* Increment counters
*/
- ++pmap->pm_stats.resident_count;
if (wired)
- pmap->pm_stats.wired_count++;
+ atomic_add_long(&pmap->pm_stats.wired_count, 1);
validate:
/*
* Now validate mapping with desired protection/wiring.
*/
- newpte = (pt_entry_t) (pa | pte_prot(pmap, prot) | PG_V);
+ newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V);
if (wired)
newpte |= PG_W;
newpte |= pgeflag;
/*
- * if the mapping or permission bits are different, we need
+ * If the mapping or permission bits are different, we need
* to update the pte.
+ *
+ * We do not have to interlock pte insertions as no other
+ * cpu will have a TLB entry.
*/
if ((origpte & ~(PG_M|PG_A)) != newpte) {
+#if 0
if ((prot & VM_PROT_NOSYNC) == 0)
pmap_inval_interlock(&info, pmap, va);
- *pte = newpte | PG_A;
+#endif
+ *ptep = newpte | PG_A;
+ cpu_invlpg((void *)va);
+#if 0
if (prot & VM_PROT_NOSYNC)
cpu_invlpg((void *)va);
else
pmap_inval_deinterlock(&info, pmap);
+#endif
if (newpte & PG_RW)
vm_page_flag_set(m, PG_WRITEABLE);
+ if (pte_pv == NULL)
+ atomic_add_long(&pmap->pm_stats.resident_count, 1);
}
KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
if ((prot & VM_PROT_NOSYNC) == 0)
pmap_inval_done(&info);
- lwkt_reltoken(&pmap->pm_token);
- vm_object_drop(pmap->pm_pteobj);
+
+ /*
+ * Cleanup the pv entry, allowing other accessors.
+ */
+ if (pte_pv)
+ pv_put(pte_pv);
+ if (pt_pv)
+ pv_put(pt_pv);
}
/*
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
- pt_entry_t *pte;
- vm_paddr_t pa;
- vm_page_t mpte;
- pmap_inval_info info;
-
- lwkt_gettoken(&pmap->pm_token);
- vm_object_hold(pmap->pm_pteobj);
- pmap_inval_init(&info);
-
- if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
- kprintf("Warning: pmap_enter_quick called on UVA with"
- "kernel_pmap\n");
-#ifdef DDB
- db_print_backtrace();
-#endif
- }
- if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
- kprintf("Warning: pmap_enter_quick called on KVA without"
- "kernel_pmap\n");
-#ifdef DDB
- db_print_backtrace();
-#endif
- }
-
- KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */
-
- /*
- * Calculate the page table page (mpte), allocating it if necessary.
- *
- * A wired page table page (mpte), or NULL, is passed onto the
- * section following.
- */
- if (va < VM_MAX_USER_ADDRESS) {
- mpte = pmap_allocpte(pmap, va);
- } else {
- mpte = NULL;
- /* this code path is not yet used */
- }
-
- /*
- * With a valid (and held) page directory page, we can just use
- * vtopte() to get to the pte. If the pte is already present
- * we do not disturb it.
- */
- pte = vtopte(va);
- if (*pte & PG_V) {
- pa = VM_PAGE_TO_PHYS(m);
- KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
- pmap_inval_done(&info);
- if (mpte)
- pmap_unwire_pte_hold(pmap, va, mpte, &info);
- vm_object_drop(pmap->pm_pteobj);
- lwkt_reltoken(&pmap->pm_token);
- return;
- }
-
- /*
- * Enter on the PV list if part of our managed memory.
- *
- * The new mapping covers mpte's new wiring count so we don't
- * unwire it.
- */
- if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
- pmap_insert_entry(pmap, va, mpte, m);
- vm_page_flag_set(m, PG_MAPPED);
- }
-
- /*
- * Increment counters
- */
- ++pmap->pm_stats.resident_count;
-
- pa = VM_PAGE_TO_PHYS(m);
-
- /*
- * Now validate mapping with RO protection
- */
- if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
- *pte = pa | PG_V | PG_U;
- else
- *pte = pa | PG_V | PG_U | PG_MANAGED;
-/* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
- pmap_inval_done(&info);
- vm_object_drop(pmap->pm_pteobj);
- lwkt_reltoken(&pmap->pm_token);
+ pmap_enter(pmap, va, m, VM_PROT_READ, FALSE);
}
/*
return;
}
- if (psize + pindex > object->size) {
+ if (pindex + psize > object->size) {
if (object->size < pindex)
return;
psize = object->size - pindex;
info.mpte = NULL;
info.addr = addr;
info.pmap = pmap;
- info.desired = 0;
vm_object_hold(object);
vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
info->addr + x86_64_ptob(rel_index), p);
}
vm_page_wakeup(p);
- pmap_auto_yield(info);
return(0);
}
/*
- * Return TRUE if the pmap is in shape to trivially
- * pre-fault the specified address.
+ * Return TRUE if the pmap is in shape to trivially pre-fault the specified
+ * address.
*
- * Returns FALSE if it would be non-trivial or if a
- * pte is already loaded into the slot.
+ * Returns FALSE if it would be non-trivial or if a pte is already loaded
+ * into the slot.
+ *
+ * The lookup is performed with pmap->pm_spin held.  As written, FALSE is
+ * returned only when pmap_pte() yields a pte that has PG_V set; every
+ * other outcome, including pmap_pte() returning NULL, returns TRUE.
*/
int
pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
{
pt_entry_t *pte;
- pd_entry_t *pde;
- int ret;
- lwkt_gettoken(&pmap->pm_token);
- pde = pmap_pde(pmap, addr);
- if (pde == NULL || *pde == 0) {
- ret = 0;
- } else {
- pte = vtopte(addr);
- ret = (*pte) ? 0 : 1;
+ spin_lock(&pmap->pm_spin);
+ if ((pte = pmap_pte(pmap, addr)) != NULL) {
+ if (*pte & PG_V) {
+ spin_unlock(&pmap->pm_spin);
+ return FALSE;
+ }
}
- lwkt_reltoken(&pmap->pm_token);
- return(ret);
+ spin_unlock(&pmap->pm_spin);
+ return TRUE;
}
/*
- * Routine: pmap_change_wiring
- * Function: Change the wiring attribute for a map/virtual-address
- * pair.
- * In/out conditions:
- * The mapping must already exist in the pmap.
+ * Change the wiring attribute for a pmap/va pair. The mapping must already
+ * exist in the pmap. The mapping may or may not be managed.
+ *
+ * A NULL pmap is ignored.  The work is done with pmap->pm_token held.
+ * The pv for the page table covering va is obtained via pmap_allocpte()
+ * and released with pv_put() after PG_W has been updated in the pte.
+ * pm_stats.wired_count is adjusted only when the requested state differs
+ * from the pte's current PG_W state.
*/
void
pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
{
- pt_entry_t *pte;
+ pt_entry_t *ptep;
+ pv_entry_t pv;
if (pmap == NULL)
return;
-
lwkt_gettoken(&pmap->pm_token);
- pte = pmap_pte(pmap, va);
+ pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
+ ptep = pv_pte_lookup(pv, pmap_pte_index(va));
- if (wired && !pmap_pte_w(pte))
- pmap->pm_stats.wired_count++;
- else if (!wired && pmap_pte_w(pte))
- pmap->pm_stats.wired_count--;
+ if (wired && !pmap_pte_w(ptep))
+ atomic_add_long(&pmap->pm_stats.wired_count, 1);
+ else if (!wired && pmap_pte_w(ptep))
+ atomic_add_long(&pmap->pm_stats.wired_count, -1);
/*
* Wiring is not a hardware characteristic so there is no need to
*/
#ifdef SMP
if (wired)
- atomic_set_long(pte, PG_W);
+ atomic_set_long(ptep, PG_W);
else
- atomic_clear_long(pte, PG_W);
+ atomic_clear_long(ptep, PG_W);
#else
if (wired)
- atomic_set_long_nonlocked(pte, PG_W);
+ atomic_set_long_nonlocked(ptep, PG_W);
else
- atomic_clear_long_nonlocked(pte, PG_W);
+ atomic_clear_long_nonlocked(ptep, PG_W);
#endif
+ pv_put(pv);
lwkt_reltoken(&pmap->pm_token);
}
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
vm_size_t len, vm_offset_t src_addr)
{
- return;
-#if 0
- pmap_inval_info info;
- vm_offset_t addr;
- vm_offset_t end_addr = src_addr + len;
- vm_offset_t pdnxt;
- pd_entry_t src_frame, dst_frame;
- vm_page_t m;
-
- if (dst_addr != src_addr)
- return;
-#if JGPMAP32
- src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
- if (src_frame != (PTDpde & PG_FRAME)) {
- return;
- }
-
- dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
- if (dst_frame != (APTDpde & PG_FRAME)) {
- APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V);
- /* The page directory is not shared between CPUs */
- cpu_invltlb();
- }
-#endif
- pmap_inval_init(&info);
- pmap_inval_add(&info, dst_pmap, -1);
- pmap_inval_add(&info, src_pmap, -1);
-
- lwkt_gettoken(&src_pmap->pm_token);
- lwkt_gettoken(&dst_pmap->pm_token);
- for (addr = src_addr; addr < end_addr; addr = pdnxt) {
- pt_entry_t *src_pte, *dst_pte;
- vm_page_t dstmpte, srcmpte;
- vm_offset_t srcptepaddr;
- vm_pindex_t ptepindex;
-
- if (addr >= UPT_MIN_ADDRESS)
- panic("pmap_copy: invalid to pmap_copy page tables\n");
-
- /*
- * Don't let optional prefaulting of pages make us go
- * way below the low water mark of free pages or way
- * above high water mark of used pv entries.
- */
- if (vmstats.v_free_count < vmstats.v_free_reserved ||
- pv_entry_count > pv_entry_high_water)
- break;
-
- pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
- ptepindex = addr >> PDRSHIFT;
-
-#if JGPMAP32
- srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
-#endif
- if (srcptepaddr == 0)
- continue;
-
- if (srcptepaddr & PG_PS) {
-#if JGPMAP32
- if (dst_pmap->pm_pdir[ptepindex] == 0) {
- dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
- dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
- }
-#endif
- continue;
- }
-
- /*
- *
- */
- srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
- if (srcmpte == NULL || srcmpte->wire_count == 1 ||
- (srcmpte->flags & PG_BUSY)) {
- continue;
- }
-
- if (pdnxt > end_addr)
- pdnxt = end_addr;
-
- src_pte = vtopte(addr);
-#if JGPMAP32
- dst_pte = avtopte(addr);
-#endif
- while (addr < pdnxt) {
- pt_entry_t ptetemp;
-
- ptetemp = *src_pte;
- /*
- * we only virtual copy managed pages
- */
- if ((ptetemp & PG_MANAGED) != 0) {
- /*
- * We have to check after allocpte for the
- * pte still being around... allocpte can
- * block.
- *
- * pmap_allocpte() can block. If we lose
- * our page directory mappings we stop.
- */
- dstmpte = pmap_allocpte(dst_pmap, addr);
-
-#if JGPMAP32
- if (src_frame != (PTDpde & PG_FRAME) ||
- dst_frame != (APTDpde & PG_FRAME)
- ) {
- kprintf("WARNING: pmap_copy: detected and corrected race\n");
- pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
- goto failed;
- } else if ((*dst_pte == 0) &&
- (ptetemp = *src_pte) != 0 &&
- (ptetemp & PG_MANAGED)) {
- /*
- * Clear the modified and
- * accessed (referenced) bits
- * during the copy.
- */
- m = PHYS_TO_VM_PAGE(ptetemp);
- *dst_pte = ptetemp & ~(PG_M | PG_A);
- ++dst_pmap->pm_stats.resident_count;
- pmap_insert_entry(dst_pmap, addr,
- dstmpte, m);
- KKASSERT(m->flags & PG_MAPPED);
- } else {
- kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n");
- pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
- goto failed;
- }
-#endif
- if (dstmpte->hold_count >= srcmpte->hold_count)
- break;
- }
- addr += PAGE_SIZE;
- src_pte++;
- dst_pte++;
- }
- }
-failed:
- lwkt_reltoken(&dst_pmap->pm_token);
- lwkt_reltoken(&src_pmap->pm_token);
- pmap_inval_done(&info);
-#endif
}
/*
}
/*
- * Returns true if the pmap's pv is one of the first
- * 16 pvs linked to from this page. This count may
- * be changed upwards or downwards in the future; it
- * is only necessary that true be returned for a small
- * subset of pmaps for proper page aging.
+ * Returns true if the pmap's pv is one of the first 16 pvs linked to from
+ * this page. This count may be changed upwards or downwards in the future;
+ * it is only necessary that true be returned for a small subset of pmaps
+ * for proper page aging.
*/
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
return FALSE;
- spin_lock(&pmap_spin);
+ vm_page_spin_lock(m);
TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
if (pv->pv_pmap == pmap) {
- spin_unlock(&pmap_spin);
+ vm_page_spin_unlock(m);
return TRUE;
}
loops++;
if (loops >= 16)
break;
}
- spin_unlock(&pmap_spin);
+ vm_page_spin_unlock(m);
return (FALSE);
}
/*
* Remove all pages from specified address space this aids process exit
- * speeds. Also, this code is special cased for current process only, but
- * can have the more generic (and slightly slower) mode enabled. This
- * is much faster than pmap_remove in the case of running down an entire
- * address space.
+ * speeds. Also, this code may be special cased for the current process
+ * only.
*/
void
pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
- struct lwp *lp;
- pt_entry_t *pte, tpte;
- pv_entry_t pv, npv;
- vm_page_t m;
- vm_offset_t va;
- pmap_inval_info info;
- int iscurrentpmap;
- int save_generation;
-
- lp = curthread->td_lwp;
- if (lp && pmap == vmspace_pmap(lp->lwp_vmspace))
- iscurrentpmap = 1;
- else
- iscurrentpmap = 0;
-
- if (pmap->pm_pteobj)
- vm_object_hold(pmap->pm_pteobj);
- lwkt_gettoken(&pmap->pm_token);
- pmap_inval_init(&info);
-
- spin_lock(&pmap_spin);
- for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
- /*
- * Validate the pv. We have to interlock the address with
- * pmap_spin unlocked.
- */
- if (pv->pv_va >= eva || pv->pv_va < sva) {
- npv = TAILQ_NEXT(pv, pv_plist);
- continue;
- }
-
- KKASSERT(pmap == pv->pv_pmap);
- if (iscurrentpmap)
- pte = vtopte(pv->pv_va);
- else
- pte = pmap_pte_quick(pmap, pv->pv_va);
-
- /*
- * We cannot remove wired pages from a process' mapping
- * at this time. This does not require an invaldiation
- * interlock as PG_W cannot be set by the MMU.
- */
- if (*pte & PG_W) {
- npv = TAILQ_NEXT(pv, pv_plist);
- continue;
- }
-
- /*
- * Interlock the pte so we can safely remove it
- */
- save_generation = pmap->pm_generation;
- va = pv->pv_va;
- spin_unlock(&pmap_spin);
-
- pmap_inval_interlock(&info, pmap, va);
-
- /*
- * Restart the scan if the pv list changed out from under us.
- */
- spin_lock(&pmap_spin);
- if (save_generation != pmap->pm_generation) {
- spin_unlock(&pmap_spin);
- pmap_inval_deinterlock(&info, pmap);
- kprintf("Warning: pmap_remove_pages race-A avoided\n");
- spin_lock(&pmap_spin);
- npv = TAILQ_FIRST(&pmap->pm_pvlist);
- continue;
- }
- KKASSERT(pmap == pv->pv_pmap && va == pv->pv_va);
-
- /*
- * Extract the pte and clear its memory
- */
- tpte = pte_load_clear(pte);
- KKASSERT(tpte & PG_MANAGED);
-
- m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
- KASSERT(m < &vm_page_array[vm_page_array_size],
- ("pmap_remove_pages: bad tpte %lx", tpte));
-
- /*
- * Remove the entry, set npv
- */
- npv = TAILQ_NEXT(pv, pv_plist);
- TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
- m->md.pv_generation++;
- m->md.pv_list_count--;
- vm_page_spin_lock(m);
- if (m->object)
- atomic_add_int(&m->object->agg_pv_list_count, -1);
- vm_page_spin_unlock(m);
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- if (TAILQ_EMPTY(&m->md.pv_list))
- vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
- save_generation = ++pmap->pm_generation;
-
- spin_unlock(&pmap_spin);
-
- /*
- * Adjust the pmap and cleanup the tpte and related vm_page
- */
- KKASSERT(pmap->pm_stats.resident_count > 0);
- --pmap->pm_stats.resident_count;
- pmap_inval_deinterlock(&info, pmap);
-
- /*
- * Update the vm_page_t clean and reference bits.
- */
- if (tpte & PG_M) {
- vm_page_dirty(m);
- }
-
- pmap_unwire_pte_hold(pmap, pv->pv_va, pv->pv_ptem, &info);
- free_pv_entry(pv);
-
- /*
- * Restart the scan if we blocked during the unuse or free
- * calls and other removals were made.
- */
- spin_lock(&pmap_spin);
- if (save_generation != pmap->pm_generation) {
- kprintf("Warning: pmap_remove_pages race-A avoided\n");
- npv = TAILQ_FIRST(&pmap->pm_pvlist);
- }
- }
- spin_unlock(&pmap_spin);
- pmap_inval_done(&info);
- lwkt_reltoken(&pmap->pm_token);
- if (pmap->pm_pteobj)
- vm_object_drop(pmap->pm_pteobj);
+ pmap_remove(pmap, sva, eva);
}
/*
* pmap_testbit tests bits in pte's note that the testbit/clearbit
* routines are inline, and a lot of things compile-time evaluate.
- *
- * Caller must hold pmap_spin
*/
static
boolean_t
if (TAILQ_FIRST(&m->md.pv_list) == NULL)
return FALSE;
+ vm_page_spin_lock(m); /* taken only after the unlocked empty-list test above */
+ if (TAILQ_FIRST(&m->md.pv_list) == NULL) { /* same emptiness test, repeated with the page spin lock held */
+ vm_page_spin_unlock(m);
+ return FALSE;
+ }
TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
/*
* modified.
*/
if (bit & (PG_A|PG_M)) {
- if (!pmap_track_modified(pv->pv_va))
+ if (!pmap_track_modified(pv->pv_pindex)) /* argument is now pv_pindex instead of pv_va */
continue;
}
continue;
}
#endif
- pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
- if (*pte & bit)
+ pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); /* address argument rebuilt as pv_pindex << PAGE_SHIFT */
+ if (*pte & bit) {
+ vm_page_spin_unlock(m); /* drop the page spin lock before the early TRUE return */
return TRUE;
+ }
}
+ vm_page_spin_unlock(m); /* loop finished without finding the bit set in any PTE */
return (FALSE);
}
/*
* This routine is used to modify bits in ptes
*
- * Caller must NOT hold pmap_spin
+ * Caller must NOT hold any spin locks
*/
static __inline
void
pmap_clearbit(vm_page_t m, int bit)
{
struct pmap_inval_info info;
- int save_generation;
- vm_offset_t save_va;
- struct pmap *save_pmap;
pv_entry_t pv;
pt_entry_t *pte;
pt_entry_t pbits;
+ vm_pindex_t save_pindex; /* copy of pv->pv_pindex taken before the page spin lock is released */
+ pmap_t save_pmap; /* copy of pv->pv_pmap taken before the page spin lock is released */
if (bit == PG_RW)
vm_page_flag_clear(m, PG_WRITEABLE);
* Loop over all current mappings setting/clearing as appropos If
* setting RO do we need to clear the VAC?
*/
- spin_lock(&pmap_spin);
+ vm_page_spin_lock(m); /* held while walking m->md.pv_list; dropped only around the inval interlock/deinterlock calls */
restart:
TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
/*
* don't write protect pager mappings
*/
if (bit == PG_RW) {
- if (!pmap_track_modified(pv->pv_va))
+ if (!pmap_track_modified(pv->pv_pindex)) /* argument is now pv_pindex instead of pv_va */
continue;
}
* PG_M even for PTEs generated via virtual memory maps,
* because the virtual kernel will invalidate the pmap
* entry when/if it needs to resynchronize the Modify bit.
- *
- * We have to restart our scan if m->md.pv_generation changes
- * on us.
*/
if (bit & PG_RW) {
- save_generation = m->md.pv_generation;
save_pmap = pv->pv_pmap;
- save_va = pv->pv_va;
- spin_unlock(&pmap_spin);
- pmap_inval_interlock(&info, save_pmap, save_va);
- spin_lock(&pmap_spin);
- if (save_generation != m->md.pv_generation)
+ save_pindex = pv->pv_pindex;
+ pv_hold(pv); /* paired with a pv_drop() on both paths after relocking */
+ vm_page_spin_unlock(m); /* page spin lock is not held across pmap_inval_interlock() */
+ pmap_inval_interlock(&info, save_pmap,
+ (vm_offset_t)save_pindex << PAGE_SHIFT); /* address rebuilt from the saved pindex */
+ vm_page_spin_lock(m);
+ if (pv->pv_pmap == NULL) { /* NOTE(review): presumably the pv was torn down while unlocked -- confirm against the pv removal path */
+ pv_drop(pv);
goto restart;
+ }
+ pv_drop(pv); /* pv still has a pmap: release the hold and continue with this entry */
}
- pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
+ pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); /* address argument rebuilt as pv_pindex << PAGE_SHIFT */
again:
pbits = *pte;
if (pbits & bit) {
}
}
if (bit & PG_RW) {
- save_generation = m->md.pv_generation;
save_pmap = pv->pv_pmap;
- spin_unlock(&pmap_spin);
+ pv_hold(pv); /* same hold / unlock / relock / recheck sequence as before the PTE update */
+ vm_page_spin_unlock(m);
pmap_inval_deinterlock(&info, save_pmap);
- spin_lock(&pmap_spin);
- if (save_generation != m->md.pv_generation)
+ vm_page_spin_lock(m);
+ if (pv->pv_pmap == NULL) { /* replaces the removed pv_generation comparison as the restart trigger */
+ pv_drop(pv);
goto restart;
+ }
+ pv_drop(pv);
}
}
- spin_unlock(&pmap_spin);
+ vm_page_spin_unlock(m); /* released before pmap_inval_done() */
pmap_inval_done(&info);
}
int
pmap_ts_referenced(vm_page_t m)
{
- pv_entry_t pv, pvf, pvn;
+ pv_entry_t pv; /* pvf/pvn are gone: the rewritten loop no longer rotates the list */
pt_entry_t *pte;
int rtval = 0;
if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
return (rtval);
- spin_lock(&pmap_spin);
- if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
- pvf = pv;
- do {
- pvn = TAILQ_NEXT(pv, pv_list);
-
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
- /*++pv->pv_pmap->pm_generation; not needed */
-
- if (!pmap_track_modified(pv->pv_va))
- continue;
-
- pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
-
- if (pte && (*pte & PG_A)) {
+ vm_page_spin_lock(m); /* held for the whole walk of m->md.pv_list; released just before returning */
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* plain in-order walk; entries are not moved to the tail as the removed code did */
+ if (!pmap_track_modified(pv->pv_pindex))
+ continue;
+ pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT); /* address argument rebuilt as pv_pindex << PAGE_SHIFT */
+ if (pte && (*pte & PG_A)) {
#ifdef SMP
- atomic_clear_long(pte, PG_A);
+ atomic_clear_long(pte, PG_A); /* clear the PG_A bit in the PTE */
#else
- atomic_clear_long_nonlocked(pte, PG_A);
+ atomic_clear_long_nonlocked(pte, PG_A); /* non-SMP build uses the nonlocked variant */
#endif
- rtval++;
- if (rtval > 4) {
- break;
- }
- }
- } while ((pv = pvn) != NULL && pv != pvf);
+ rtval++; /* counts PTEs that had PG_A set and were just cleared */
+ if (rtval > 4) /* stop scanning once five such PTEs have been counted */
+ break;
+ }
}
- spin_unlock(&pmap_spin);
-
+ vm_page_spin_unlock(m);
return (rtval);
}
{
boolean_t res;
- spin_lock(&pmap_spin);
res = pmap_testbit(m, PG_M);
- spin_unlock(&pmap_spin);
return (res);
}