kernel - Rewrite the x86-64 pmap code
author     Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 26 Oct 2011 18:42:18 +0000 (11:42 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Wed, 26 Oct 2011 18:42:18 +0000 (11:42 -0700)
* Use unassociated VM pages (without a VM object) for all page table pages.

* Remove kptobj and pmap->pm_pteobj.

* For the moment implement a Red-Black tree for pv_entry_t manipulation.
  Revamp the pindex to include all page table page levels, from terminal
  pages up to the PML4 page.  The hierarchy is now arranged via the PV
  system (see the layout sketch after this list).

* As before, the kernel page tables only use PV entries for terminal pages.

* Refactor the locking to allow blocking operations during deep scans.
  Individual PV entries are now locked and critical PMAP operations do not
  require the pmap->pm_token.  This should greatly improve threaded
  program performance.

* Fix kgdb on the live kernel (pmap_extract() was not handling short-cutted
  page directory pages).
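
Illustration (not from the patch itself): the revamped pindex space stacks
each page table level into its own contiguous range, from terminal PTEs at
the bottom to the single PML4 index at the top, using the NUPTE_TOTAL /
NUPT_TOTAL / NUPD_TOTAL / NUPDP_TOTAL constants added to pmap.h below.  The
pmap_pte_pindex(), pmap_pt_pindex(), pmap_pd_pindex(), pmap_pdp_pindex() and
pmap_pml4_pindex() helpers in the pmap.c diff return values inside these
ranges.  A minimal stand-alone sketch of that layout, assuming the usual
512-entry tables:

#include <stdio.h>

#define NPTEPG          512ULL
#define NPDEPG          512ULL
#define NPDPEPG         512ULL
#define NPML4EPG        512ULL

#define NUPDP_TOTAL     NPML4EPG
#define NUPD_TOTAL      (NPDPEPG * NUPDP_TOTAL)
#define NUPT_TOTAL      (NPDEPG * NUPD_TOTAL)
#define NUPTE_TOTAL     (NPTEPG * NUPT_TOTAL)

int
main(void)
{
        /* Each page table level occupies its own contiguous pindex range. */
        unsigned long long pte_base  = 0;
        unsigned long long pt_base   = NUPTE_TOTAL;
        unsigned long long pd_base   = NUPTE_TOTAL + NUPT_TOTAL;
        unsigned long long pdp_base  = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
        unsigned long long pml4_pidx = pdp_base + NUPDP_TOTAL;

        printf("PTE  pindexes: [%llu, %llu)\n", pte_base, pt_base);
        printf("PT   pindexes: [%llu, %llu)\n", pt_base, pd_base);
        printf("PD   pindexes: [%llu, %llu)\n", pd_base, pdp_base);
        printf("PDP  pindexes: [%llu, %llu)\n", pdp_base, pml4_pidx);
        printf("PML4 pindex:    %llu\n", pml4_pidx);
        return (0);
}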

sys/platform/pc64/include/pmap.h
sys/platform/pc64/include/vmparam.h
sys/platform/pc64/x86_64/machdep.c
sys/platform/pc64/x86_64/pmap.c

diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h
index 8469067..563809e 100644
 #define NKPML4E                1               /* number of kernel PML4 slots */
 /* NKPDPE defined in vmparam.h */
 
-#define        NUPML4E         (NPML4EPG/2)    /* number of userland PML4 pages */
-#define        NUPDPE          (NUPML4E*NPDPEPG)/* number of userland PDP pages */
-#define        NUPDE           (NUPDPE*NPDEPG) /* number of userland PD entries */
+/*
+ * NUPDPs      512 (256 user)          number of PDPs in user page table
+ * NUPDs       512 * 512               number of PDs in user page table
+ * NUPTs       512 * 512 * 512         number of PTs in user page table
+ * NUPTEs      512 * 512 * 512 * 512   number of PTEs in user page table
+ *
+ * NUPDP_USER  number of PDPs reserved for userland
+ * NUPTE_USER  number of PTEs reserved for userland (big number)
+ */
+#define        NUPDP_USER      (NPML4EPG/2)
+#define        NUPDP_TOTAL     (NPML4EPG)
+#define        NUPD_TOTAL      (NPDPEPG * NUPDP_TOTAL)
+#define        NUPT_TOTAL      (NPDEPG * NUPD_TOTAL)
+#define NUPTE_TOTAL    ((vm_pindex_t)NPTEPG * NUPT_TOTAL)
+#define NUPTE_USER     ((vm_pindex_t)NPTEPG * NPDEPG * NPDPEPG * NUPDP_USER)
 
 #define        NDMPML4E        1               /* number of dmap PML4 slots */
 
 #ifndef _SYS_QUEUE_H_
 #include <sys/queue.h>
 #endif
+#ifndef _SYS_TREE_H_
+#include <sys/tree.h>
+#endif
 #ifndef _SYS_SPINLOCK_H_
 #include <sys/spinlock.h>
 #endif
@@ -192,8 +207,6 @@ struct vm_object;
 struct vmspace;
 
 struct md_page {
-       int pv_list_count;
-       int pv_generation;
        TAILQ_HEAD(,pv_entry)   pv_list;
 };
 
@@ -212,20 +225,21 @@ struct pmap_statistics {
 };
 typedef struct pmap_statistics *pmap_statistics_t;
 
+struct pv_entry_rb_tree;
+RB_PROTOTYPE2(pv_entry_rb_tree, pv_entry, pv_entry,
+             pv_entry_compare, vm_pindex_t);
+
 struct pmap {
        pml4_entry_t            *pm_pml4;       /* KVA of level 4 page table */
-       struct vm_page          *pm_pdirm;      /* VM page for pg directory */
-       struct vm_object        *pm_pteobj;     /* Container for pte's */
+       struct pv_entry         *pm_pmlpv;      /* PV entry for pml4 */
        TAILQ_ENTRY(pmap)       pm_pmnode;      /* list of pmaps */
-       TAILQ_HEAD(,pv_entry)   pm_pvlist;      /* list of mappings in pmap */
-       TAILQ_HEAD(,pv_entry)   pm_pvlist_free; /* free mappings */
+       RB_HEAD(pv_entry_rb_tree, pv_entry) pm_pvroot;
        int                     pm_count;       /* reference count */
        cpumask_t               pm_active;      /* active on cpus */
        int                     pm_filler02;    /* (filler sync w/vkernel) */
        struct pmap_statistics  pm_stats;       /* pmap statistics */
-       struct  vm_page         *pm_ptphint;    /* pmap ptp hint */
+       struct pv_entry         *pm_pvhint;     /* pv_entry lookup hint */
        int                     pm_generation;  /* detect pvlist deletions */
-       int                     pm_hold;
        struct spinlock         pm_spin;
        struct lwkt_token       pm_token;
 };
@@ -247,13 +261,23 @@ extern struct pmap        kernel_pmap;
  */
 typedef struct pv_entry {
        pmap_t          pv_pmap;        /* pmap where mapping lies */
-       vm_offset_t     pv_va;          /* virtual address for mapping */
+       vm_pindex_t     pv_pindex;      /* PTE, PT, PD, PDP, or PML4 */
        TAILQ_ENTRY(pv_entry)   pv_list;
-       TAILQ_ENTRY(pv_entry)   pv_plist;
-       struct vm_page  *pv_ptem;       /* VM page for pte */
-       u_int           pv_hold;        /* hold on destruction count */
+       RB_ENTRY(pv_entry)      pv_entry;
+       struct vm_page  *pv_m;          /* page being mapped */
+       u_int           pv_hold;        /* interlock action */
+       u_int           pv_unused01;
+#ifdef PMAP_DEBUG
+       const char      *pv_func;
+       int             pv_line;
+#endif
 } *pv_entry_t;
 
+#define PV_HOLD_LOCKED 0x80000000U
+#define PV_HOLD_WAITING        0x40000000U
+#define PV_HOLD_DELETED        0x20000000U
+#define PV_HOLD_MASK   0x1FFFFFFFU
+
 #ifdef _KERNEL
 
 #define NPPROVMTRR             8
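
Illustrative sketch (not from the patch): the new pv_hold word above packs a
lock bit, a wait bit, a delete bit and a 29-bit hold count into one 32-bit
integer.  The mock-up below, using C11 atomics and hypothetical demo_* names,
shows one plausible way a try-lock can add a hold and attempt the lock in a
single atomic update; the real _pv_hold_try() in pmap.c uses the kernel's
atomic primitives and may differ in detail:

#include <stdatomic.h>
#include <stdio.h>

#define PV_HOLD_LOCKED  0x80000000U
#define PV_HOLD_WAITING 0x40000000U
#define PV_HOLD_DELETED 0x20000000U
#define PV_HOLD_MASK    0x1FFFFFFFU

struct demo_pv {
        _Atomic unsigned int pv_hold;   /* lock/wait/delete bits + hold count */
};

/*
 * Add a hold and try to acquire the lock in one atomic update.
 * Returns 1 if the lock was obtained, 0 if only the hold was added.
 */
static int
demo_pv_hold_try(struct demo_pv *pv)
{
        unsigned int count = atomic_load(&pv->pv_hold);

        for (;;) {
                if ((count & PV_HOLD_LOCKED) == 0) {
                        if (atomic_compare_exchange_weak(&pv->pv_hold, &count,
                                                (count + 1) | PV_HOLD_LOCKED))
                                return (1);     /* held and locked */
                } else {
                        if (atomic_compare_exchange_weak(&pv->pv_hold, &count,
                                                count + 1))
                                return (0);     /* held, lock owned elsewhere */
                }
                /* count was reloaded by the failed compare-exchange; retry */
        }
}

int
main(void)
{
        struct demo_pv pv = { 0 };

        printf("first try:  %d\n", demo_pv_hold_try(&pv));      /* 1 */
        printf("second try: %d\n", demo_pv_hold_try(&pv));      /* 0 */
        printf("hold count: %u\n",
               atomic_load(&pv.pv_hold) & PV_HOLD_MASK);        /* 2 */
        return (0);
}
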
diff --git a/sys/platform/pc64/include/vmparam.h b/sys/platform/pc64/include/vmparam.h
index 35c5938..2e575e6 100644
 #define UPT_MIN_ADDRESS                KVADDR(PML4PML4I, 0, 0, 0)
 
 #define VM_MIN_USER_ADDRESS    ((vm_offset_t)0)
-#define VM_MAX_USER_ADDRESS    UVADDR(NUPML4E, 0, 0, 0)
+#define VM_MAX_USER_ADDRESS    UVADDR(NUPDP_USER, 0, 0, 0)
 
 #define USRSTACK               VM_MAX_USER_ADDRESS
 
diff --git a/sys/platform/pc64/x86_64/machdep.c b/sys/platform/pc64/x86_64/machdep.c
index f662a3c..39361c2 100644
@@ -1010,6 +1010,7 @@ cpu_idle(void)
 
                if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
                    (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
+                       splz(); /* XXX */
                        cpu_mmw_pause_int(&gd->gd_reqflags, reqflags);
                        ++cpu_idle_hltcnt;
                } else if (cpu_idle_hlt) {

diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index ed13709..38a23e0 100644
@@ -8,6 +8,7 @@
  * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
  * Copyright (c) 2008, 2009 The DragonFly Project.
  * Copyright (c) 2008, 2009 Jordan Gordeev.
+ * Copyright (c) 2011 Matthew Dillon
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
 #define MINPV 2048
 
 /*
+ * pmap debugging will report who owns a pv lock when blocking.
+ */
+#ifdef PMAP_DEBUG
+
+#define PMAP_DEBUG_DECL                ,const char *func, int lineno
+#define PMAP_DEBUG_ARGS                , __func__, __LINE__
+#define PMAP_DEBUG_COPY                , func, lineno
+
+#define pv_get(pmap, pindex)           _pv_get(pmap, pindex            \
+                                                       PMAP_DEBUG_ARGS)
+#define pv_lock(pv)                    _pv_lock(pv                     \
+                                                       PMAP_DEBUG_ARGS)
+#define pv_hold_try(pv)                        _pv_hold_try(pv                 \
+                                                       PMAP_DEBUG_ARGS)
+#define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp  \
+                                                       PMAP_DEBUG_ARGS)
+
+#else
+
+#define PMAP_DEBUG_DECL
+#define PMAP_DEBUG_ARGS
+#define PMAP_DEBUG_COPY
+
+#define pv_get(pmap, pindex)           _pv_get(pmap, pindex)
+#define pv_lock(pv)                    _pv_lock(pv)
+#define pv_hold_try(pv)                        _pv_hold_try(pv)
+#define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp)
+
+#endif
+
+/*
  * Get PDEs and PTEs for user/kernel address space
  */
-static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va);
 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 
 #define pmap_pde_v(pte)                ((*(pd_entry_t *)pte & PG_V) != 0)
@@ -139,7 +170,6 @@ static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va);
 #define pmap_pte_u(pte)                ((*(pt_entry_t *)pte & PG_A) != 0)
 #define pmap_pte_v(pte)                ((*(pt_entry_t *)pte & PG_V) != 0)
 
-
 /*
  * Given a map and a machine independent protection code,
  * convert to a vax protection code.
@@ -164,8 +194,6 @@ static boolean_t pmap_initialized = FALSE;  /* Has pmap_init completed? */
 static int pgeflag;            /* PG_G or-in */
 static int pseflag;            /* PG_PS or-in */
 
-static vm_object_t kptobj;
-
 static int ndmpdp;
 static vm_paddr_t dmaplimit;
 static int nkpt;
@@ -187,7 +215,7 @@ static uint64_t     DMPDPphys;      /* phys addr of direct mapped level 3 */
 static vm_zone_t pvzone;
 static struct vm_zone pvzone_store;
 static struct vm_object pvzone_obj;
-static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
+static int pv_entry_max=0, pv_entry_high_water=0;
 static int pmap_pagedaemon_waken = 0;
 static struct pv_entry *pvinit;
 
@@ -211,32 +239,58 @@ SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
 
 #define DISABLE_PSE
 
-static pv_entry_t get_pv_entry (void);
+static void pv_hold(pv_entry_t pv);
+static int _pv_hold_try(pv_entry_t pv
+                               PMAP_DEBUG_DECL);
+static void pv_drop(pv_entry_t pv);
+static void _pv_lock(pv_entry_t pv
+                               PMAP_DEBUG_DECL);
+static void pv_unlock(pv_entry_t pv);
+static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
+                               PMAP_DEBUG_DECL);
+static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex
+                               PMAP_DEBUG_DECL);
+static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp);
+static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex);
+static void pv_put(pv_entry_t pv);
+static void pv_free(pv_entry_t pv);
+static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
+static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
+                     pv_entry_t *pvpp);
+static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
+                     struct pmap_inval_info *info);
+static vm_page_t pmap_remove_pv_page(pv_entry_t pv, int holdpg);
+
+static void pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
+                     pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
+                     pt_entry_t *ptep, void *arg __unused);
+static void pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
+                     pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
+                     pt_entry_t *ptep, void *arg __unused);
+
 static void i386_protection_init (void);
 static void create_pagetables(vm_paddr_t *firstaddr);
 static void pmap_remove_all (vm_page_t m);
-static int  pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
-                               vm_offset_t sva, pmap_inval_info_t info);
-static void pmap_remove_page (struct pmap *pmap, 
-                               vm_offset_t va, pmap_inval_info_t info);
-static int  pmap_remove_entry (struct pmap *pmap, vm_page_t m,
-                               vm_offset_t va, pmap_inval_info_t info);
 static boolean_t pmap_testbit (vm_page_t m, int bit);
-static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
-                               vm_page_t mpte, vm_page_t m);
-
-static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);
 
-static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
-static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
 static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
-static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
-static int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
-                               pmap_inval_info_t info);
 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
 
 static unsigned pdir4mb;
 
+static int
+pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
+{
+       if (pv1->pv_pindex < pv2->pv_pindex)
+               return(-1);
+       if (pv1->pv_pindex > pv2->pv_pindex)
+               return(1);
+       return(0);
+}
+
+RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
+             pv_entry_compare, vm_pindex_t, pv_pindex);
+
 /*
  * Move the kernel virtual free pointer to the next
  * 2MB.  This is used to help improve performance
@@ -270,143 +324,230 @@ pmap_pte_quick(pmap_t pmap, vm_offset_t va)
        return pmap_pte(pmap, va);
 }
 
-/* Return a non-clipped PD index for a given VA */
+/*
+ * Returns the pindex of a page table entry (representing a terminal page).
+ * There are NUPTE_TOTAL page table entries possible (a huge number)
+ *
+ * x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
+ * We want to properly translate negative KVAs.
+ */
 static __inline
 vm_pindex_t
-pmap_pde_pindex(vm_offset_t va)
+pmap_pte_pindex(vm_offset_t va)
 {
-       return va >> PDRSHIFT;
+       return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
 }
 
-/* Return various clipped indexes for a given VA */
+/*
+ * Returns the pindex of a page table.
+ */
 static __inline
 vm_pindex_t
-pmap_pte_index(vm_offset_t va)
+pmap_pt_pindex(vm_offset_t va)
 {
+       return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
+}
 
-       return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
+/*
+ * Returns the pindex of a page directory.
+ */
+static __inline
+vm_pindex_t
+pmap_pd_pindex(vm_offset_t va)
+{
+       return (NUPTE_TOTAL + NUPT_TOTAL +
+               ((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
 }
 
 static __inline
 vm_pindex_t
-pmap_pde_index(vm_offset_t va)
+pmap_pdp_pindex(vm_offset_t va)
 {
+       return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
+               ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
+}
 
-       return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
+static __inline
+vm_pindex_t
+pmap_pml4_pindex(void)
+{
+       return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
 }
 
+/*
+ * Return various clipped indexes for a given VA
+ *
+ * Returns the index of a pte in a page table, representing a terminal
+ * page.
+ */
 static __inline
 vm_pindex_t
-pmap_pdpe_index(vm_offset_t va)
+pmap_pte_index(vm_offset_t va)
 {
+       return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
+}
 
-       return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
+/*
+ * Returns the index of a pt in a page directory, representing a page
+ * table.
+ */
+static __inline
+vm_pindex_t
+pmap_pt_index(vm_offset_t va)
+{
+       return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
 }
 
+/*
+ * Returns the index of a pd in a page directory page, representing a page
+ * directory.
+ */
 static __inline
 vm_pindex_t
-pmap_pml4e_index(vm_offset_t va)
+pmap_pd_index(vm_offset_t va)
 {
+       return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
+}
 
+/*
+ * Returns the index of a pdp in the pml4 table, representing a page
+ * directory page.
+ */
+static __inline
+vm_pindex_t
+pmap_pdp_index(vm_offset_t va)
+{
        return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
 }
 
-/* Return a pointer to the PML4 slot that corresponds to a VA */
+/*
+ * Generic procedure to index a pte from a pt, pd, or pdp.
+ */
+static
+void *
+pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
+{
+       pt_entry_t *pte;
+
+       pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
+       return(&pte[pindex]);
+}
+
+/*
+ * Return pointer to PDP slot in the PML4
+ */
 static __inline
 pml4_entry_t *
-pmap_pml4e(pmap_t pmap, vm_offset_t va)
+pmap_pdp(pmap_t pmap, vm_offset_t va)
 {
-
-       return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
+       return (&pmap->pm_pml4[pmap_pdp_index(va)]);
 }
 
-/* Return a pointer to the PDP slot that corresponds to a VA */
+/*
+ * Return pointer to PD slot in the PDP given a pointer to the PDP
+ */
 static __inline
 pdp_entry_t *
-pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
+pmap_pdp_to_pd(pml4_entry_t *pdp, vm_offset_t va)
 {
-       pdp_entry_t *pdpe;
+       pdp_entry_t *pd;
 
-       pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
-       return (&pdpe[pmap_pdpe_index(va)]);
+       pd = (pdp_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
+       return (&pd[pmap_pd_index(va)]);
 }
 
-/* Return a pointer to the PDP slot that corresponds to a VA */
+/*
+ * Return pointer to PD slot in the PDP
+ */
 static __inline
 pdp_entry_t *
-pmap_pdpe(pmap_t pmap, vm_offset_t va)
+pmap_pd(pmap_t pmap, vm_offset_t va)
 {
-       pml4_entry_t *pml4e;
+       pml4_entry_t *pdp;
 
-       pml4e = pmap_pml4e(pmap, va);
-       if ((*pml4e & PG_V) == 0)
+       pdp = pmap_pdp(pmap, va);
+       if ((*pdp & PG_V) == 0)
                return NULL;
-       return (pmap_pml4e_to_pdpe(pml4e, va));
+       return (pmap_pdp_to_pd(pdp, va));
 }
 
-/* Return a pointer to the PD slot that corresponds to a VA */
+/*
+ * Return pointer to PT slot in the PD given a pointer to the PD
+ */
 static __inline
 pd_entry_t *
-pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
+pmap_pd_to_pt(pdp_entry_t *pd, vm_offset_t va)
 {
-       pd_entry_t *pde;
+       pd_entry_t *pt;
 
-       pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
-       return (&pde[pmap_pde_index(va)]);
+       pt = (pd_entry_t *)PHYS_TO_DMAP(*pd & PG_FRAME);
+       return (&pt[pmap_pt_index(va)]);
 }
 
-/* Return a pointer to the PD slot that corresponds to a VA */
+/*
+ * Return pointer to PT slot in the PD
+ */
 static __inline
 pd_entry_t *
-pmap_pde(pmap_t pmap, vm_offset_t va)
+pmap_pt(pmap_t pmap, vm_offset_t va)
 {
-       pdp_entry_t *pdpe;
+       pdp_entry_t *pd;
 
-       pdpe = pmap_pdpe(pmap, va);
-       if (pdpe == NULL || (*pdpe & PG_V) == 0)
+       pd = pmap_pd(pmap, va);
+       if (pd == NULL || (*pd & PG_V) == 0)
                 return NULL;
-       return (pmap_pdpe_to_pde(pdpe, va));
+       return (pmap_pd_to_pt(pd, va));
 }
 
-/* Return a pointer to the PT slot that corresponds to a VA */
+/*
+ * Return pointer to PTE slot in the PT given a pointer to the PT
+ */
 static __inline
 pt_entry_t *
-pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
+pmap_pt_to_pte(pd_entry_t *pt, vm_offset_t va)
 {
        pt_entry_t *pte;
 
-       pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
+       pte = (pt_entry_t *)PHYS_TO_DMAP(*pt & PG_FRAME);
        return (&pte[pmap_pte_index(va)]);
 }
 
-/* Return a pointer to the PT slot that corresponds to a VA */
+/*
+ * Return pointer to PTE slot in the PT
+ */
 static __inline
 pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
-       pd_entry_t *pde;
+       pd_entry_t *pt;
 
-       pde = pmap_pde(pmap, va);
-       if (pde == NULL || (*pde & PG_V) == 0)
-               return NULL;
-       if ((*pde & PG_PS) != 0)        /* compat with i386 pmap_pte() */
-               return ((pt_entry_t *)pde);
-       return (pmap_pde_to_pte(pde, va));
+       pt = pmap_pt(pmap, va);
+       if (pt == NULL || (*pt & PG_V) == 0)
+                return NULL;
+       if ((*pt & PG_PS) != 0)
+               return ((pt_entry_t *)pt);
+       return (pmap_pt_to_pte(pt, va));
 }
 
+/*
+ * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
+ * the PT layer.  This will speed up core pmap operations considerably.
+ */
 static __inline
-pt_entry_t *
-vtopte(vm_offset_t va)
+void
+pv_cache(pv_entry_t pv, vm_pindex_t pindex)
 {
-       uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
-                                 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
-
-       return (PTmap + ((va >> PAGE_SHIFT) & mask));
+       if (pindex >= pmap_pt_pindex(0) && pindex <= pmap_pd_pindex(0))
+               pv->pv_pmap->pm_pvhint = pv;
 }
 
+
+/*
+ * KVM - return address of PT slot in PD
+ */
 static __inline
 pd_entry_t *
-vtopde(vm_offset_t va)
+vtopt(vm_offset_t va)
 {
        uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
                                  NPML4EPGSHIFT)) - 1);
@@ -414,6 +555,19 @@ vtopde(vm_offset_t va)
        return (PDmap + ((va >> PDRSHIFT) & mask));
 }
 
+/*
+ * KVM - return address of PTE slot in PT
+ */
+static __inline
+pt_entry_t *
+vtopte(vm_offset_t va)
+{
+       uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
+                                 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+
+       return (PTmap + ((va >> PAGE_SHIFT) & mask));
+}
+
 static uint64_t
 allocpages(vm_paddr_t *firstaddr, long n)
 {
@@ -616,17 +770,11 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
         * The kernel's pmap is statically allocated so we don't have to use
         * pmap_create, which is unlikely to work correctly at this part of
         * the boot sequence (XXX and which no longer exists).
-        *
-        * The kernel_pmap's pm_pteobj is used only for locking and not
-        * for mmu pages.
         */
        kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
        kernel_pmap.pm_count = 1;
        kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
-       kernel_pmap.pm_pteobj = &kernel_object;
-       TAILQ_INIT(&kernel_pmap.pm_pvlist);
-       TAILQ_INIT(&kernel_pmap.pm_pvlist_free);
-       kernel_pmap.pm_hold = 0;
+       RB_INIT(&kernel_pmap.pm_pvroot);
        spin_init(&kernel_pmap.pm_spin);
        lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
 
@@ -765,22 +913,15 @@ pmap_init(void)
        int initial_pvs;
 
        /*
-        * object for kernel page table pages
-        */
-       /* JG I think the number can be arbitrary */
-       kptobj = vm_object_allocate(OBJT_DEFAULT, 5);
-
-       /*
         * Allocate memory for random pmap data structures.  Includes the
         * pv_head_table.
         */
 
-       for(i = 0; i < vm_page_array_size; i++) {
+       for (i = 0; i < vm_page_array_size; i++) {
                vm_page_t m;
 
                m = &vm_page_array[i];
                TAILQ_INIT(&m->md.pv_list);
-               m->md.pv_list_count = 0;
        }
 
        /*
@@ -856,8 +997,9 @@ pmap_nw_modified(pt_entry_t pte)
  */
 static __inline
 int
-pmap_track_modified(vm_offset_t va)
+pmap_track_modified(vm_pindex_t pindex)
 {
+       vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT;
        if ((va < clean_sva) || (va >= clean_eva)) 
                return 1;
        else
@@ -866,31 +1008,55 @@ pmap_track_modified(vm_offset_t va)
 
 /*
  * Extract the physical page address associated with the map/VA pair.
+ * The page must be wired for this to work reliably.
  *
- * The caller must hold pmap->pm_token if non-blocking operation is desired.
+ * XXX for the moment we're using pv_find() instead of pv_get(), as
+ *     callers might be expecting non-blocking operation.
  */
 vm_paddr_t 
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
        vm_paddr_t rtval;
-       pt_entry_t *pte;
-       pd_entry_t pde, *pdep;
+       pv_entry_t pt_pv;
+       pt_entry_t *ptep;
 
-       lwkt_gettoken(&pmap->pm_token);
        rtval = 0;
-       pdep = pmap_pde(pmap, va);
-       if (pdep != NULL) {
-               pde = *pdep;
-               if (pde) {
-                       if ((pde & PG_PS) != 0) {
-                               rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
+       if (va >= VM_MAX_USER_ADDRESS) {
+               /*
+                * Kernel page directories might be direct-mapped and
+                * there is typically no PV tracking of pte's
+                */
+               pd_entry_t *pt;
+
+               pt = pmap_pt(pmap, va);
+               if (pt && (*pt & PG_V)) {
+                       if (*pt & PG_PS) {
+                               rtval = *pt & PG_PS_FRAME;
+                               rtval |= va & PDRMASK;
                        } else {
-                               pte = pmap_pde_to_pte(pdep, va);
-                               rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
+                               ptep = pmap_pt_to_pte(pt, va);
+                               if (*ptep & PG_V) {
+                                       rtval = *ptep & PG_FRAME;
+                                       rtval |= va & PAGE_MASK;
+                               }
+                       }
+               }
+       } else {
+               /*
+                * User pages currently do not direct-map the page directory
+                * and some pages might not use managed PVs.  But all PT's
+                * will have a PV.
+                */
+               pt_pv = pv_find(pmap, pmap_pt_pindex(va));
+               if (pt_pv) {
+                       ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
+                       if (*ptep & PG_V) {
+                               rtval = *ptep & PG_FRAME;
+                               rtval |= va & PAGE_MASK;
                        }
+                       pv_drop(pt_pv);
                }
        }
-       lwkt_reltoken(&pmap->pm_token);
        return rtval;
 }
 
@@ -900,15 +1066,15 @@ pmap_extract(pmap_t pmap, vm_offset_t va)
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
-       pd_entry_t pde;
+       pd_entry_t pt;          /* pt entry in pd */
        vm_paddr_t pa;
 
        if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
                pa = DMAP_TO_PHYS(va);
        } else {
-               pde = *vtopde(va);
-               if (pde & PG_PS) {
-                       pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
+               pt = *vtopt(va);
+               if (pt & PG_PS) {
+                       pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
                } else {
                        /*
                         * Beware of a concurrent promotion that changes the
@@ -918,7 +1084,7 @@ pmap_kextract(vm_offset_t va)
                         * because the page table page is preserved by the
                         * promotion.
                         */
-                       pa = *pmap_pde_to_pte(&pde, va);
+                       pa = *pmap_pt_to_pte(&pt, va);
                        pa = (pa & PG_FRAME) | (va & PAGE_MASK);
                }
        }
@@ -943,13 +1109,13 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa)
        pt_entry_t npte;
        pmap_inval_info info;
 
-       pmap_inval_init(&info);
+       pmap_inval_init(&info);                         /* XXX remove */
        npte = pa | PG_RW | PG_V | pgeflag;
        pte = vtopte(va);
-       pmap_inval_interlock(&info, &kernel_pmap, va);
+       pmap_inval_interlock(&info, &kernel_pmap, va);  /* XXX remove */
        *pte = npte;
-       pmap_inval_deinterlock(&info, &kernel_pmap);
-       pmap_inval_done(&info);
+       pmap_inval_deinterlock(&info, &kernel_pmap);    /* XXX remove */
+       pmap_inval_done(&info);                         /* XXX remove */
 }
 
 /*
@@ -1019,14 +1185,14 @@ pmap_kremove_quick(vm_offset_t va)
 void
 pmap_kmodify_rw(vm_offset_t va)
 {
-       *vtopte(va) |= PG_RW;
+       atomic_set_long(vtopte(va), PG_RW);
        cpu_invlpg((void *)va);
 }
 
 void
 pmap_kmodify_nc(vm_offset_t va)
 {
-       *vtopte(va) |= PG_N;
+       atomic_set_long(vtopte(va), PG_N);
        cpu_invlpg((void *)va);
 }
 
@@ -1115,27 +1281,6 @@ pmap_qremove(vm_offset_t va, int count)
 }
 
 /*
- * This routine works like vm_page_lookup() but also blocks as long as the
- * page is busy.  This routine does not busy the page it returns.
- *
- * The call should be made with the governing object held so the page's
- * object association remains valid on return.
- *
- * This function can block!
- */
-static
-vm_page_t
-pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
-{
-       vm_page_t m;
-
-       ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
-       m = vm_page_lookup_busy_wait(object, pindex, FALSE, "pplookp");
-
-       return(m);
-}
-
-/*
  * Create a new thread and optionally associate it with a (new) process.
  * NOTE! the new thread's cpu may not equal the current cpu.
  */
@@ -1167,110 +1312,6 @@ pmap_dispose_proc(struct proc *p)
        KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
 }
 
-/***************************************************
- * Page table page management routines.....
- ***************************************************/
-
-/*
- * After removing a page table entry, this routine is used to
- * conditionally free the page, and manage the hold/wire counts.
- *
- * This routine reduces the wire_count on a page.  If the wire_count
- * would drop to zero we remove the PT, PD, or PDP from its parent page
- * table.  Under normal operation this only occurs with PT pages.
- *
- * mpte is never NULL for a user va, even for unmanaged pages.  mpte should
- * always be NULL for a kernel va.
- */
-static __inline
-int
-pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
-                    pmap_inval_info_t info)
-{
-       if (mpte == NULL)
-               return 0;
-       if (!vm_page_unwire_quick(mpte))
-               return 0;
-
-       /* 
-        * Wait until we can busy the page ourselves.  We cannot have
-        * any active flushes if we block.  We own one hold count on the
-        * page so it cannot be freed out from under us.
-        */
-       vm_page_busy_wait(mpte, FALSE, "pmuwpt");
-       KASSERT(mpte->queue == PQ_NONE,
-               ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", mpte));
-
-       /*
-        * New references can bump the wire_count while we were blocked,
-        * try to unwire quickly again (e.g. 2->1).
-        */
-       if (vm_page_unwire_quick(mpte) == 0) {
-               vm_page_wakeup(mpte);
-               return 0;
-       }
-
-       /*
-        * Unmap the page table page
-        */
-       KKASSERT(mpte->wire_count == 1);
-       pmap_inval_interlock(info, pmap, -1);
-
-       if (mpte->pindex >= (NUPDE + NUPDPE)) {
-               /* PDP page */
-               pml4_entry_t *pml4;
-               pml4 = pmap_pml4e(pmap, va);
-               KKASSERT(*pml4);
-               *pml4 = 0;
-       } else if (mpte->pindex >= NUPDE) {
-               /* PD page */
-               pdp_entry_t *pdp;
-               pdp = pmap_pdpe(pmap, va);
-               KKASSERT(*pdp);
-               *pdp = 0;
-       } else {
-               /* PT page */
-               pd_entry_t *pd;
-               pd = pmap_pde(pmap, va);
-               KKASSERT(*pd);
-               *pd = 0;
-       }
-
-       KKASSERT(pmap->pm_stats.resident_count > 0);
-       --pmap->pm_stats.resident_count;
-
-       if (pmap->pm_ptphint == mpte)
-               pmap->pm_ptphint = NULL;
-       pmap_inval_deinterlock(info, pmap);
-
-       if (mpte->pindex < NUPDE) {
-               /* We just released a PT, unhold the matching PD */
-               vm_page_t pdpg;
-
-               pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
-               pmap_unwire_pte_hold(pmap, va, pdpg, info);
-       }
-       if (mpte->pindex >= NUPDE && mpte->pindex < (NUPDE + NUPDPE)) {
-               /* We just released a PD, unhold the matching PDP */
-               vm_page_t pdppg;
-
-               pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
-               pmap_unwire_pte_hold(pmap, va, pdppg, info);
-       }
-
-       /*
-        * This was our wiring.
-        */
-       KKASSERT(mpte->flags & PG_UNMANAGED);
-       vm_page_unwire(mpte, 0);
-       KKASSERT(mpte->wire_count == 0);
-       vm_page_flag_clear(mpte, PG_MAPPED | PG_WRITEABLE);
-       vm_page_flash(mpte);
-       vm_page_free_zero(mpte);
-
-       return 1;
-}
-
 /*
  * Initialize pmap0/vmspace0.  This pmap is not added to pmap_list because
  * it, and IdlePTD, represents the template used to update all other pmaps.
@@ -1285,10 +1326,8 @@ pmap_pinit0(struct pmap *pmap)
        pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
        pmap->pm_count = 1;
        pmap->pm_active = 0;
-       pmap->pm_ptphint = NULL;
-       TAILQ_INIT(&pmap->pm_pvlist);
-       TAILQ_INIT(&pmap->pm_pvlist_free);
-       pmap->pm_hold = 0;
+       pmap->pm_pvhint = NULL;
+       RB_INIT(&pmap->pm_pvroot);
        spin_init(&pmap->pm_spin);
        lwkt_token_init(&pmap->pm_token, "pmap_tok");
        bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
@@ -1301,7 +1340,20 @@ pmap_pinit0(struct pmap *pmap)
 void
 pmap_pinit(struct pmap *pmap)
 {
-       vm_page_t pml4pg;
+       pv_entry_t pv;
+
+       /*
+        * Misc initialization
+        */
+       pmap->pm_count = 1;
+       pmap->pm_active = 0;
+       pmap->pm_pvhint = NULL;
+       if (pmap->pm_pmlpv == NULL) {
+               RB_INIT(&pmap->pm_pvroot);
+               bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+               spin_init(&pmap->pm_spin);
+               lwkt_token_init(&pmap->pm_token, "pmap_tok");
+       }
 
        /*
         * No need to allocate page table space yet but we do need a valid
@@ -1313,56 +1365,27 @@ pmap_pinit(struct pmap *pmap)
        }
 
        /*
-        * Allocate an object for the ptes
-        */
-       if (pmap->pm_pteobj == NULL) {
-               pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT,
-                                               NUPDE + NUPDPE + PML4PML4I + 1);
-       }
-
-       /*
-        * Allocate the page directory page, unless we already have
-        * one cached.  If we used the cached page the wire_count will
-        * already be set appropriately.
+        * Allocate the page directory page, which wires it even though
+        * it isn't being entered into some higher level page table (it
+        * being the highest level).  If one is already cached we don't
+        * have to do anything.
         */
-       if ((pml4pg = pmap->pm_pdirm) == NULL) {
-               pml4pg = vm_page_grab(pmap->pm_pteobj,
-                                     NUPDE + NUPDPE + PML4PML4I,
-                                     VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
-               pmap->pm_pdirm = pml4pg;
-               vm_page_unmanage(pml4pg);
-               vm_page_flag_clear(pml4pg, PG_MAPPED);
-               pml4pg->valid = VM_PAGE_BITS_ALL;
-               vm_page_wire(pml4pg);
-               vm_page_wakeup(pml4pg);
+       if ((pv = pmap->pm_pmlpv) == NULL) {
+               pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
+               pmap->pm_pmlpv = pv;
                pmap_kenter((vm_offset_t)pmap->pm_pml4,
-                           VM_PAGE_TO_PHYS(pml4pg));
+                           VM_PAGE_TO_PHYS(pv->pv_m));
+               pv_put(pv);
+               pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
+               pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
+
+               /* install self-referential address mapping entry */
+               pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
+                                          PG_V | PG_RW | PG_A | PG_M;
+       } else {
+               KKASSERT(pv->pv_m->flags & PG_MAPPED);
+               KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
        }
-       if ((pml4pg->flags & PG_ZERO) == 0)
-               bzero(pmap->pm_pml4, PAGE_SIZE);
-#ifdef PMAP_DEBUG
-       else
-               pmap_page_assertzero(VM_PAGE_TO_PHYS(pml4pg));
-#endif
-       vm_page_flag_clear(pml4pg, PG_ZERO);
-
-       pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
-       pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
-
-       /* install self-referential address mapping entry */
-       pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
-                                  PG_V | PG_RW | PG_A | PG_M;
-
-       pmap->pm_count = 1;
-       pmap->pm_active = 0;
-       pmap->pm_ptphint = NULL;
-       TAILQ_INIT(&pmap->pm_pvlist);
-       TAILQ_INIT(&pmap->pm_pvlist_free);
-       pmap->pm_hold = 0;
-       spin_init(&pmap->pm_spin);
-       lwkt_token_init(&pmap->pm_token, "pmap_tok");
-       bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
-       pmap->pm_stats.resident_count = 1;
 }
 
 /*
@@ -1374,28 +1397,36 @@ pmap_pinit(struct pmap *pmap)
 void
 pmap_puninit(pmap_t pmap)
 {
+       pv_entry_t pv;
        vm_page_t p;
 
        KKASSERT(pmap->pm_active == 0);
-       if ((p = pmap->pm_pdirm) != NULL) {
-               KKASSERT(pmap->pm_pml4 != NULL);
-               KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
+       if ((pv = pmap->pm_pmlpv) != NULL) {
+               if (pv_hold_try(pv) == 0)
+                       pv_lock(pv);
+               p = pmap_remove_pv_page(pv, 1);
+               pv_free(pv);
                pmap_kremove((vm_offset_t)pmap->pm_pml4);
                vm_page_busy_wait(p, FALSE, "pgpun");
-               KKASSERT(p->flags & PG_UNMANAGED);
+               vm_page_unhold(p);
+               KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED));
                vm_page_unwire(p, 0);
-               vm_page_free_zero(p);
-               pmap->pm_pdirm = NULL;
+               vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
+
+               /*
+                * XXX eventually clean out PML4 static entries and
+                * use vm_page_free_zero()
+                */
+               vm_page_free(p);
+               pmap->pm_pmlpv = NULL;
        }
        if (pmap->pm_pml4) {
                KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
                kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
                pmap->pm_pml4 = NULL;
        }
-       if (pmap->pm_pteobj) {
-               vm_object_deallocate(pmap->pm_pteobj);
-               pmap->pm_pteobj = NULL;
-       }
+       KKASSERT(pmap->pm_stats.resident_count == 0);
+       KKASSERT(pmap->pm_stats.wired_count == 0);
 }
 
 /*
@@ -1416,482 +1447,507 @@ pmap_pinit2(struct pmap *pmap)
 }
 
 /*
- * Attempt to release and free a vm_page in a pmap.  Returns 1 on success,
- * 0 on failure (if the procedure had to sleep).
+ * This routine is called when various levels in the page table need to
+ * be populated.  This routine cannot fail.
  *
- * When asked to remove the page directory page itself, we actually just
- * leave it cached so we do not have to incur the SMP inval overhead of
- * removing the kernel mapping.  pmap_puninit() will take care of it.
+ * This function returns two locked pv_entry's, one representing the
+ * requested pv and one representing the requested pv's parent pv.  If
+ * the pv did not previously exist it will be mapped into its parent
+ * and wired, otherwise no additional wire count will be added.
  */
 static
-int
-pmap_release_free_page(struct pmap *pmap, vm_page_t p)
+pv_entry_t
+pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
 {
+       pt_entry_t *ptep;
+       pv_entry_t pv;
+       pv_entry_t pvp;
+       vm_pindex_t pt_pindex;
+       vm_page_t m;
+       int isnew;
+
        /*
-        * This code optimizes the case of freeing non-busy
-        * page-table pages.  Those pages are zero now, and
-        * might as well be placed directly into the zero queue.
+        * If the pv already exists and we aren't being asked for the
+        * parent page table page we can just return it.  A locked+held pv
+        * is returned.
         */
-       if (vm_page_busy_try(p, FALSE)) {
-               vm_page_sleep_busy(p, FALSE, "pmaprl");
-               return 0;
+       pv = pv_alloc(pmap, ptepindex, &isnew);
+       if (isnew == 0 && pvpp == NULL)
+               return(pv);
+
+       /*
+        * This is a new PV, we have to resolve its parent page table and
+        * add an additional wiring to the page if necessary.
+        */
+
+       /*
+        * Special case terminal PVs.  These are not page table pages so
+        * no vm_page is allocated (the caller supplied the vm_page).  If
+        * pvpp is non-NULL we are being asked to also removed the pt_pv
+        * for this pv.
+        *
+        * Note that pt_pv's are only returned for user VAs. We assert that
+        * a pt_pv is not being requested for kernel VAs.
+        */
+       if (ptepindex < pmap_pt_pindex(0)) {
+               if (ptepindex >= NUPTE_USER)
+                       KKASSERT(pvpp == NULL);
+               else
+                       KKASSERT(pvpp != NULL);
+               if (pvpp) {
+                       pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
+                       pvp = pmap_allocpte(pmap, pt_pindex, NULL);
+                       if (isnew)
+                               vm_page_wire_quick(pvp->pv_m);
+                       *pvpp = pvp;
+               } else {
+                       pvp = NULL;
+               }
+               return(pv);
        }
 
        /*
-        * Remove the page table page from the processes address space.
+        * Non-terminal PVs allocate a VM page to represent the page table,
+        * so we have to resolve pvp and calculate ptepindex for the pvp
+        * and then for the page table entry index in the pvp for
+        * fall-through.
         */
-       if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
+       if (ptepindex < pmap_pd_pindex(0)) {
                /*
-                * We are the pml4 table itself.
+                * pv is PT, pvp is PD
                 */
-               /* XXX anything to do here? */
-       } else if (p->pindex >= (NUPDE + NUPDPE)) {
+               ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
+               ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
+               pvp = pmap_allocpte(pmap, ptepindex, NULL);
+               if (!isnew)
+                       goto notnew;
+
                /*
-                * Remove a PDP page from the PML4.  We do not maintain
-                * wire counts on the PML4 page.
+                * PT index in PD
                 */
-               pml4_entry_t *pml4;
-               vm_page_t m4;
-               int idx;
-
-               m4 = vm_page_lookup(pmap->pm_pteobj,
-                                   NUPDE + NUPDPE + PML4PML4I);
-               KKASSERT(m4 != NULL);
-               pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4));
-               idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG;
-               KKASSERT(pml4[idx] != 0);
-               pml4[idx] = 0;
-       } else if (p->pindex >= NUPDE) {
+               ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
+               ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
+       } else if (ptepindex < pmap_pdp_pindex(0)) {
                /*
-                * Remove a PD page from the PDP and drop the wire count
-                * on the PDP.  The PDP has a wire_count just from being
-                * mapped so the wire_count should never drop to 0 here.
+                * pv is PD, pvp is PDP
                 */
-               vm_page_t m3;
-               pdp_entry_t *pdp;
-               int idx;
-
-               m3 = vm_page_lookup(pmap->pm_pteobj,
-                               NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG);
-               KKASSERT(m3 != NULL);
-               pdp = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3));
-               idx = (p->pindex - NUPDE) % NPDPEPG;
-               KKASSERT(pdp[idx] != 0);
-               pdp[idx] = 0;
-               if (vm_page_unwire_quick(m3))
-                       panic("pmap_release_free_page: m3 wire_count 1->0");
-       } else {
+               ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
+               ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
+               pvp = pmap_allocpte(pmap, ptepindex, NULL);
+               if (!isnew)
+                       goto notnew;
+
+               /*
+                * PD index in PDP
+                */
+               ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
+               ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
+       } else if (ptepindex < pmap_pml4_pindex()) {
                /*
-                * Remove a PT page from the PD and drop the wire count
-                * on the PD.  The PD has a wire_count just from being
-                * mapped so the wire_count should never drop to 0 here.
+                * pv is PDP, pvp is the root pml4 table
                 */
-               vm_page_t m2;
-               pd_entry_t *pd;
-               int idx;
+               pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
+               if (!isnew)
+                       goto notnew;
 
-               m2 = vm_page_lookup(pmap->pm_pteobj,
-                                   NUPDE + p->pindex / NPDEPG);
-               KKASSERT(m2 != NULL);
-               pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2));
-               idx = p->pindex % NPDEPG;
-               pd[idx] = 0;
-               if (vm_page_unwire_quick(m2))
-                       panic("pmap_release_free_page: m2 wire_count 1->0");
+               /*
+                * PDP index in PML4
+                */
+               ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
+               ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
+       } else {
+               /*
+                * pv represents the top-level PML4, there is no parent.
+                */
+               pvp = NULL;
+               if (!isnew)
+                       goto notnew;
        }
 
        /*
-        * p's wire_count should be transitioning from 1 to 0 here.
-        */
-       KKASSERT(p->wire_count == 1);
-       KKASSERT(p->flags & PG_UNMANAGED);
-       KKASSERT(pmap->pm_stats.resident_count > 0);
-       vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
-       --pmap->pm_stats.resident_count;
-       if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
-               pmap->pm_ptphint = NULL;
-
-       /*
-        * We leave the top-level page table page cached, wired, and mapped in
-        * the pmap until the dtor function (pmap_puninit()) gets called.
-        * However, still clean it up so we can set PG_ZERO.
+        * This code is only reached if isnew is TRUE and this is not a
+        * terminal PV.  We need to allocate a vm_page for the page table
+        * at this level and enter it into the parent page table.
+        *
+        * page table pages are marked PG_WRITEABLE and PG_MAPPED.
         */
-       if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
-               bzero(pmap->pm_pml4, PAGE_SIZE);
-               vm_page_flag_set(p, PG_ZERO);
-               vm_page_wakeup(p);
-       } else {
-               vm_page_unwire(p, 0);
-               KKASSERT(p->wire_count == 0);
-               /* JG eventually revert to using vm_page_free_zero() */
-               vm_page_free(p);
+       for (;;) {
+               m = vm_page_alloc(NULL, pv->pv_pindex,
+                                 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
+                                 VM_ALLOC_INTERRUPT);
+               if (m)
+                       break;
+               vm_wait(0);
        }
-       return 1;
-}
+       vm_page_spin_lock(m);
+       TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+       pv->pv_m = m;
+       vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
+       vm_page_spin_unlock(m);
+       vm_page_unmanage(m);    /* m must be spinunlocked */
+
+       if ((m->flags & PG_ZERO) == 0) {
+               pmap_zero_page(VM_PAGE_TO_PHYS(m));
+       }
+#ifdef PMAP_DEBUG
+       else {
+               pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
+       }
+#endif
+       m->valid = VM_PAGE_BITS_ALL;
+       vm_page_flag_clear(m, PG_ZERO);
+       vm_page_wire(m);        /* wire for mapping in parent */
+
+       /*
+        * Wire the page into pvp, bump the wire-count for pvp's page table
+        * page.  Bump the resident_count for the pmap.  There is no pvp
+        * for the top level, address the pm_pml4[] array directly.
+        *
+        * If the caller wants the parent we return it, otherwise
+        * we just put it away.
+        *
+        * No interlock is needed for pte 0 -> non-zero.
+        */
+       if (pvp) {
+               vm_page_wire_quick(pvp->pv_m);
+               ptep = pv_pte_lookup(pvp, ptepindex);
+               KKASSERT((*ptep & PG_V) == 0);
+               *ptep = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
+                                             PG_A | PG_M);
+       }
+       vm_page_wakeup(m);
+notnew:
+       if (pvpp)
+               *pvpp = pvp;
+       else if (pvp)
+               pv_put(pvp);
+       return (pv);
+}
 
 /*
- * This routine is called when various levels in the page table need to
- * be populated.  This routine cannot fail.
+ * Release any resources held by the given physical map.
+ *
+ * Called when a pmap initialized by pmap_pinit is being released.  Should
+ * only be called if the map contains no valid mappings.
  *
- * We returned a page wired for the caller.  If we had to map the page into
- * a parent page table it will receive an additional wire_count.  For example,
- * an empty page table directory which is still mapped into its pdp will
- * retain a wire_count of 1.
+ * Caller must hold pmap->pm_token
  */
-static
-vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
+struct pmap_release_info {
+       pmap_t  pmap;
+       int     retry;
+};
+
+static int pmap_release_callback(pv_entry_t pv, void *data);
+
+void
+pmap_release(struct pmap *pmap)
 {
-       vm_page_t m;
+       struct pmap_release_info info;
+
+       KASSERT(pmap->pm_active == 0,
+               ("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
+
+       spin_lock(&pmap_spin);
+       TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
+       spin_unlock(&pmap_spin);
 
        /*
-        * Find or fabricate a new pagetable page.  This will busy the page.
+        * Pull pv's off the RB tree in order from low to high and release
+        * each page.
         */
-       m = vm_page_grab(pmap->pm_pteobj, ptepindex,
-                        VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
+       info.pmap = pmap;
+       do {
+               info.retry = 0;
+               spin_lock(&pmap->pm_spin);
+               RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL,
+                       pmap_release_callback, &info);
+               spin_unlock(&pmap->pm_spin);
+       } while (info.retry);
+
 
        /*
-        * The grab may have blocked and raced another thread populating
-        * the same page table.  m->valid will be 0 on a newly allocated page
-        * so use this to determine if we have to zero it out or not.  We
-        * don't want to zero-out a raced page as this would desynchronize
-        * the pv_entry's for the related pte's and cause pmap_remove_all()
-        * to panic.
-        *
-        * Page table pages are unmanaged (do not use the normal PQ_s)
+        * One resident page (the pml4 page) should remain.
+        * No wired pages should remain.
         */
-       if (m->valid == 0) {
-               vm_page_unmanage(m);
-               if ((m->flags & PG_ZERO) == 0) {
-                       pmap_zero_page(VM_PAGE_TO_PHYS(m));
-               }
-#ifdef PMAP_DEBUG
-               else {
-                       pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
+       KKASSERT(pmap->pm_stats.resident_count == 1);
+       KKASSERT(pmap->pm_stats.wired_count == 0);
+}
+
+static int
+pmap_release_callback(pv_entry_t pv, void *data)
+{
+       struct pmap_release_info *info = data;
+       pmap_t pmap = info->pmap;
+       vm_page_t p;
+
+       if (pv_hold_try(pv)) {
+               spin_unlock(&pmap->pm_spin);
+       } else {
+               spin_unlock(&pmap->pm_spin);
+               pv_lock(pv);
+               if (pv->pv_pmap != pmap) {
+                       pv_put(pv);
+                       spin_lock(&pmap->pm_spin);
+                       info->retry = 1;
+                       return(-1);
                }
-#endif
-               m->valid = VM_PAGE_BITS_ALL;
-               vm_page_flag_clear(m, PG_ZERO);
        }
-#ifdef PMAP_DEBUG
-       else {
-               KKASSERT((m->flags & PG_ZERO) == 0);
-       }
-#endif
 
-       KASSERT(m->queue == PQ_NONE,
-               ("_pmap_allocpte: %p->queue != PQ_NONE", m));
+       /*
+        * The pmap is currently not spinlocked, pv is held+locked.
+        * Remove the pv's page from its parent's page table.  The
+        * parent's page table page's wire_count will be decremented.
+        */
+       pmap_remove_pv_pte(pv, NULL, NULL);
 
        /*
-        * Increment the wire_count for the page we will be returning to
-        * the caller.
+        * Terminal pvs are unhooked from their vm_pages.  Because
+        * terminal pages aren't page table pages they aren't wired
+        * by us, so we have to be sure not to unwire them either.
         */
-       vm_page_wire(m);
+       if (pv->pv_pindex < pmap_pt_pindex(0)) {
+               pmap_remove_pv_page(pv, 0);
+               goto skip;
+       }
 
        /*
-        * Map the pagetable page into the process address space, if
-        * it isn't already there.
+        * We leave the top-level page table page cached, wired, and
+        * mapped in the pmap until the dtor function (pmap_puninit())
+        * gets called.
         *
-        * It is possible that someone else got in and mapped the page
-        * directory page while we were blocked, if so just unbusy and
-        * return the held page.
+        * Since we are leaving the top-level pv intact we need
+        * to break out of what would otherwise be an infinite loop.
         */
-       if (ptepindex >= (NUPDE + NUPDPE)) {
-               /*
-                * Wire up a new PDP page in the PML4.
-                *
-                * (m) is busied so we cannot race another thread trying
-                * to map the PDP entry in the PML4.
-                */
-               vm_pindex_t pml4index;
-               pml4_entry_t *pml4;
-
-               pml4index = ptepindex - (NUPDE + NUPDPE);
-               pml4 = &pmap->pm_pml4[pml4index];
-               if ((*pml4 & PG_V) == 0) {
-                       *pml4 = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
-                                                     PG_A | PG_M);
-                       ++pmap->pm_stats.resident_count;
-                       vm_page_wire_quick(m);  /* wire for mapping */
-               }
-               /* return (m) wired for the caller */
-       } else if (ptepindex >= NUPDE) {
-               /*
-                * Wire up a new PD page in the PDP
-                */
-               vm_pindex_t pml4index;
-               vm_pindex_t pdpindex;
-               vm_page_t pdppg;
-               pml4_entry_t *pml4;
-               pdp_entry_t *pdp;
+       if (pv->pv_pindex == pmap_pml4_pindex()) {
+               pv_put(pv);
+               spin_lock(&pmap->pm_spin);
+               return(-1);
+       }
+
+       /*
+        * For page table pages (other than the top-level page),
+        * remove and free the vm_page.  The representitive mapping
+        * remove and free the vm_page.  The representative mapping
+        * last wire_count so we have to do that as well.
+        */
+       p = pmap_remove_pv_page(pv, 1);
+       vm_page_busy_wait(p, FALSE, "pmaprl");
+       vm_page_unhold(p);
+       if (p->wire_count != 1) {
+               kprintf("p->wire_count was %016lx %d\n",
+                       pv->pv_pindex, p->wire_count);
+       }
+       KKASSERT(p->wire_count == 1);
+       KKASSERT(p->flags & PG_UNMANAGED);
+
+       vm_page_unwire(p, 0);
+       KKASSERT(p->wire_count == 0);
+       /* JG eventually revert to using vm_page_free_zero() */
+       vm_page_free(p);
+skip:
+       pv_free(pv);
+       spin_lock(&pmap->pm_spin);
+       return(0);
+}
+
+/*
+ * This function will remove the pte associated with a pv from its parent.
+ * Terminal pv's are supported.  The removal will be interlocked if info
+ * is non-NULL.  The caller must dispose of pv instead of just unlocking
+ * it.
+ *
+ * The wire count will be dropped on the parent page table.  The wire
+ * count on the page being removed (pv->pv_m) from the parent page table
+ * is NOT touched.  Note that terminal pages will not have any additional
+ * wire counts while page table pages will have at least one representing
+ * the mapping, plus others representing sub-mappings.
+ *
+ * NOTE: Cannot be called on kernel page table pages, only KVM terminal
+ *      pages and user page table and terminal pages.
+ *
+ * The pv must be locked.
+ *
+ * XXX must lock parent pv's if they exist to remove pte XXX
+ */
+static
+void
+pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
+{
+       vm_pindex_t ptepindex = pv->pv_pindex;
+       pmap_t pmap = pv->pv_pmap;
+       vm_page_t p;
+       int gotpvp = 0;
 
-               pdpindex = ptepindex - NUPDE;
-               pml4index = pdpindex >> NPML4EPGSHIFT;
+       KKASSERT(pmap);
 
+       if (ptepindex == pmap_pml4_pindex()) {
                /*
-                * Once mapped the PDP is not unmapped during normal operation
-                * so we only need to handle races in the unmapped case.
-                *
-                * Mapping a PD into the PDP requires an additional wiring
-                * of the PDP.
+                * We are the top level pml4 table, there is no parent.
                 */
-               pml4 = &pmap->pm_pml4[pml4index];
-               if ((*pml4 & PG_V) == 0) {
-                       pdppg = _pmap_allocpte(pmap,
-                                              NUPDE + NUPDPE + pml4index);
-                       /* pdppg wired for the map and also wired for return */
-               } else {
-                       pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
-                       vm_page_wire_quick(pdppg);
-               }
-               /* we have an extra ref on pdppg now for our use */
-
+               p = pmap->pm_pmlpv->pv_m;
+       } else if (ptepindex >= pmap_pdp_pindex(0)) {
                /*
-                * Now find the PD entry in the PDP and map it.
-                *
-                * (m) is busied so we cannot race another thread trying
-                * to map the PD entry in the PDP.
-                *
-                * If the PD entry is already mapped we have to drop one
-                * wire count on the pdppg that we had bumped above.
+                * Remove a PDP page from the pml4e.  This can only occur
+                * with user page tables.  We do not have to lock the
+                * pml4 PV so just ignore pvp.
                 */
-               pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
-               pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
-
-               if ((*pdp & PG_V) == 0) {
-                       *pdp = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
-                                                    PG_A | PG_M);
-                       vm_page_wire_quick(m);  /* wire for mapping */
-                       ++pmap->pm_stats.resident_count;
-                       /* eat extra pdppg wiring for mapping */
-               } else {
-                       if (vm_page_unwire_quick(pdppg))
-                               panic("pmap_allocpte: unwire case 1");
+               vm_pindex_t pml4_pindex;
+               vm_pindex_t pdp_index;
+               pml4_entry_t *pdp;
+
+               pdp_index = ptepindex - pmap_pdp_pindex(0);
+               if (pvp == NULL) {
+                       pml4_pindex = pmap_pml4_pindex();
+                       pvp = pv_get(pv->pv_pmap, pml4_pindex);
+                       gotpvp = 1;
                }
-               /* return (m) wired for the caller */
-       } else {
+               pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
+               KKASSERT((*pdp & PG_V) != 0);
+               p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
+               *pdp = 0;
+               KKASSERT(info == NULL);
+       } else if (ptepindex >= pmap_pd_pindex(0)) {
                /*
-                * Wire up the new PT page in the PD
+                * Remove a PD page from the PDP
                 */
-               vm_pindex_t pml4index;
-               vm_pindex_t pdpindex;
-               pml4_entry_t *pml4;
-               pdp_entry_t *pdp;
-               pd_entry_t *pd;
-               vm_page_t pdppg;
-               vm_page_t pdpg;
+               vm_pindex_t pdp_pindex;
+               vm_pindex_t pd_index;
+               pdp_entry_t *pd;
 
-               pdpindex = ptepindex >> NPDPEPGSHIFT;
-               pml4index = pdpindex >> NPML4EPGSHIFT;
+               pd_index = ptepindex - pmap_pd_pindex(0);
 
+               if (pvp == NULL) {
+                       pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
+                                    (pd_index >> NPML4EPGSHIFT);
+                       pvp = pv_get(pv->pv_pmap, pdp_pindex);
+                       gotpvp = 1;
+               }
+               pd = pv_pte_lookup(pvp, pd_index & ((1ul << NPDPEPGSHIFT) - 1));
+               KKASSERT((*pd & PG_V) != 0);
+               p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
+               *pd = 0;
+               KKASSERT(info == NULL);
+       } else if (ptepindex >= pmap_pt_pindex(0)) {
                /*
-                * Locate the PDP page in the PML4
-                *
-                * Once mapped the PDP is not unmapped during normal operation
-                * so we only need to handle races in the unmapped case.
+                * Remove a PT page from the PD
                 */
-               pml4 = &pmap->pm_pml4[pml4index];
-               if ((*pml4 & PG_V) == 0) {
-                       pdppg = _pmap_allocpte(pmap, NUPDE + pdpindex);
-               } else {
-                       pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
-                       vm_page_wire_quick(pdppg);
-               }
-               /* we have an extra ref on pdppg now for our use */
+               vm_pindex_t pd_pindex;
+               vm_pindex_t pt_index;
+               pd_entry_t *pt;
 
+               pt_index = ptepindex - pmap_pt_pindex(0);
+
+               if (pvp == NULL) {
+                       pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
+                                   (pt_index >> NPDPEPGSHIFT);
+                       pvp = pv_get(pv->pv_pmap, pd_pindex);
+                       gotpvp = 1;
+               }
+               pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
+               KKASSERT((*pt & PG_V) != 0);
+               p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
+               *pt = 0;
+               KKASSERT(info == NULL);
+       } else {
                /*
-                * Locate the PD page in the PDP
-                *
-                * Once mapped the PDP is not unmapped during normal operation
-                * so we only need to handle races in the unmapped case.
+                * Remove a PTE from the PT page
                 *
-                * We can scrap the extra reference on pdppg not needed if
-                * *pdp is already mapped and also not needed if it wasn't
-                * because the _pmap_allocpte() picked up the case for us.
+                * NOTE: pv's must be locked bottom-up to avoid deadlocking.
+                *       pv is a pte_pv so we can safely lock pt_pv.
                 */
-               pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
-               pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+               vm_pindex_t pt_pindex;
+               pt_entry_t *ptep;
+               pt_entry_t pte;
+               vm_offset_t va;
 
-               if ((*pdp & PG_V) == 0) {
-                       pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex);
+               pt_pindex = ptepindex >> NPTEPGSHIFT;
+               va = (vm_offset_t)ptepindex << PAGE_SHIFT;
+
+               if (ptepindex >= NUPTE_USER) {
+                       ptep = vtopte(ptepindex << PAGE_SHIFT);
+                       KKASSERT(pvp == NULL);
                } else {
-                       pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
-                       vm_page_wire_quick(pdpg);
+                       if (pvp == NULL) {
+                               pt_pindex = NUPTE_TOTAL +
+                                           (ptepindex >> NPDPEPGSHIFT);
+                               pvp = pv_get(pv->pv_pmap, pt_pindex);
+                               gotpvp = 1;
+                       }
+                       ptep = pv_pte_lookup(pvp, ptepindex &
+                                                 ((1ul << NPDPEPGSHIFT) - 1));
                }
-               vm_page_unwire_quick(pdppg);
-               /* we have an extra ref on pdpg now for our use */
+
+               if (info)
+                       pmap_inval_interlock(info, pmap, va);
+               pte = pte_load_clear(ptep);
+               if (info)
+                       pmap_inval_deinterlock(info, pmap);
 
                /*
-                * Locate the PT page in the PD.
-                *
-                * (m) is busied so we cannot race another thread trying
-                * to map the PT page in the PD.
+                * Now update the vm_page_t
                 */
-               pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
-               pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
-               if ((*pd & PG_V) == 0) {
-                       *pd = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
-                                                   PG_A | PG_M);
-                       ++pmap->pm_stats.resident_count;
-                       vm_page_wire_quick(m);  /* wire for mapping */
-                       /* eat extra pdpg wiring for mapping */
-               } else {
-                       if (vm_page_unwire_quick(pdpg))
-                               panic("pmap_allocpte: unwire case 2");
+               if ((pte & (PG_MANAGED|PG_V)) != (PG_MANAGED|PG_V)) {
+                       kprintf("remove_pte badpte %016lx %016lx %d\n",
+                               pte, pv->pv_pindex,
+                               pv->pv_pindex < pmap_pt_pindex(0));
+               }
+               /*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/
+               p = PHYS_TO_VM_PAGE(pte & PG_FRAME);
+
+               if (pte & PG_M) {
+                       if (pmap_track_modified(ptepindex))
+                               vm_page_dirty(p);
+               }
+               if (pte & PG_A) {
+                       vm_page_flag_set(p, PG_REFERENCED);
                }
-               /* return (m) wired for the caller */
+               if (pte & PG_W)
+                       atomic_add_long(&pmap->pm_stats.wired_count, -1);
+               if (pte & PG_G)
+                       cpu_invlpg((void *)va);
        }
 
        /*
-        * We successfully loaded a PDP, PD, or PTE.  Set the page table hint,
-        * valid bits, mapped flag, unbusy, and we're done.
+        * Unwire the parent page table page.  The wire_count cannot go below
+        * 1 here because the parent page table page is itself still mapped.
+        *
+        * XXX remove the assertions later.
         */
-       pmap->pm_ptphint = m;
-
-#if 0
-       m->valid = VM_PAGE_BITS_ALL;
-       vm_page_flag_clear(m, PG_ZERO);
-#endif
-       vm_page_flag_set(m, PG_MAPPED);
-       vm_page_wakeup(m);
+       KKASSERT(pv->pv_m == p);
+       if (pvp && vm_page_unwire_quick(pvp->pv_m))
+               panic("pmap_remove_pv_pte: Insufficient wire_count");
 
-       return (m);
+       if (gotpvp)
+               pv_put(pvp);
 }
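A minimal sketch of the assumed pindex layout may help here; the
pmap_*_pindex() helpers are not defined in this hunk, so the exact form
below is an assumption inferred from the NUPTE_TOTAL/NUPT_TOTAL/NUPD_TOTAL
offsets used above:

	/*
	 * Assumed pv_pindex layout, one contiguous range per level:
	 *
	 *	[0, NUPTE_TOTAL)			terminal PTEs
	 *	[NUPTE_TOTAL, +NUPT_TOTAL)		page table (PT) pages
	 *	[NUPTE_TOTAL+NUPT_TOTAL, +NUPD_TOTAL)	page directory (PD) pages
	 *	[NUPTE_TOTAL+NUPT_TOTAL+NUPD_TOTAL, +NUPDP_TOTAL)  PDP pages
	 *	NUPTE_TOTAL+NUPT_TOTAL+NUPD_TOTAL+NUPDP_TOTAL      the PML4 page
	 */
	static __inline vm_pindex_t
	pmap_pt_pindex_sketch(vm_offset_t va)	/* hypothetical form of pmap_pt_pindex() */
	{
		return (NUPTE_TOTAL + (((va >> PAGE_SHIFT) >> NPTEPGSHIFT) &
				       (NUPT_TOTAL - 1)));
	}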
 
 static
 vm_page_t
-pmap_allocpte(pmap_t pmap, vm_offset_t va)
+pmap_remove_pv_page(pv_entry_t pv, int holdpg)
 {
-       vm_pindex_t ptepindex;
-       pd_entry_t *pd;
        vm_page_t m;
 
-       ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
-
-       /*
-        * Calculate pagetable page index
-        */
-       ptepindex = pmap_pde_pindex(va);
-
-       /*
-        * Get the page directory entry
-        */
-       pd = pmap_pde(pmap, va);
-
-       /*
-        * This supports switching from a 2MB page to a
-        * normal 4K page.
-        */
-       if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
-               panic("no promotion/demotion yet");
-               *pd = 0;
-               pd = NULL;
-               cpu_invltlb();
-               smp_invltlb();
-       }
-
-       /*
-        * If the page table page is mapped, we just increment the
-        * wire count, and activate it.
-        */
-       if (pd != NULL && (*pd & PG_V) != 0) {
-               m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
-               pmap->pm_ptphint = m;
-               vm_page_wire_quick(m);
-               vm_page_wakeup(m);
-               return m;
-       }
+       m = pv->pv_m;
+       if (holdpg)
+               vm_page_hold(m);
+       KKASSERT(m);
+       vm_page_spin_lock(m);
+       pv->pv_m = NULL;
+       TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
        /*
-        * Here if the pte page isn't mapped, or if it has been deallocated.
-        */
-       return _pmap_allocpte(pmap, ptepindex);
-}
-
-
-/***************************************************
- * Pmap allocation/deallocation routines.
- ***************************************************/
-
-/*
- * Release any resources held by the given physical map.
- * Called when a pmap initialized by pmap_pinit is being released.
- * Should only be called if the map contains no valid mappings.
- *
- * Caller must hold pmap->pm_token
- */
-static int pmap_release_callback(struct vm_page *p, void *data);
-
-static __inline
-void
-pmap_auto_yield(struct rb_vm_page_scan_info *info)
-{
-       if (++info->desired >= pmap_yield_count) {
-               info->desired = 0;
-               lwkt_yield();
-       }
-}
-
-void
-pmap_release(struct pmap *pmap)
-{
-       vm_object_t object = pmap->pm_pteobj;
-       struct rb_vm_page_scan_info info;
-
-       KASSERT(pmap->pm_active == 0,
-               ("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
-#if defined(DIAGNOSTIC)
-       if (object->ref_count != 1)
-               panic("pmap_release: pteobj reference count != 1");
-#endif
-       
-       info.pmap = pmap;
-       info.object = object;
-
-       spin_lock(&pmap_spin);
-       TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
-       spin_unlock(&pmap_spin);
-
-       info.desired = 0;
-       vm_object_hold(object);
-       do {
-               info.error = 0;
-               info.mpte = NULL;
-               info.limit = object->generation;
-
-               vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 
-                                       pmap_release_callback, &info);
-               if (info.error == 0 && info.mpte) {
-                       if (!pmap_release_free_page(pmap, info.mpte))
-                               info.error = 1;
-               }
-       } while (info.error);
-       vm_object_drop(object);
-
-       while (pmap->pm_hold)
-               tsleep(pmap, 0, "pmapx", 1);
-}
-
-static
-int
-pmap_release_callback(struct vm_page *p, void *data)
-{
-       struct rb_vm_page_scan_info *info = data;
-
-       if (p->pindex == NUPDE + NUPDPE + PML4PML4I) {
-               info->mpte = p;
-               return(0);
-       }
-       if (!pmap_release_free_page(info->pmap, p)) {
-               info->error = 1;
-               pmap_auto_yield(info);
-               return(-1);
-       }
-       if (info->object->generation != info->limit) {
-               info->error = 1;
-               pmap_auto_yield(info);
-               return(-1);
-       }
-       return(0);
+       if (m->object)
+               atomic_add_int(&m->object->agg_pv_list_count, -1);
+       */
+       if (TAILQ_EMPTY(&m->md.pv_list))
+               vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+       vm_page_spin_unlock(m);
+       if (holdpg)
+               return(m);
+       return(NULL);
 }
 
 /*
@@ -1907,19 +1963,17 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
        vm_paddr_t paddr;
        vm_offset_t ptppaddr;
        vm_page_t nkpg;
-       pd_entry_t *pde, newpdir;
-       pdp_entry_t newpdp;
+       pd_entry_t *pt, newpt;
+       pdp_entry_t newpd;
        int update_kernel_vm_end;
 
-       vm_object_hold(kptobj);
-
        /*
         * bootstrap kernel_vm_end on first real VM use
         */
        if (kernel_vm_end == 0) {
                kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
                nkpt = 0;
-               while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
+               while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
                        kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
                                        ~(PAGE_SIZE * NPTEPG - 1);
                        nkpt++;
@@ -1951,10 +2005,10 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                kend = kernel_map.max_offset;
 
        while (kstart < kend) {
-               pde = pmap_pde(&kernel_pmap, kstart);
-               if (pde == NULL) {
+               pt = pmap_pt(&kernel_pmap, kstart);
+               if (pt == NULL) {
                        /* We need a new PDP entry */
-                       nkpg = vm_page_alloc(kptobj, nkpt,
+                       nkpg = vm_page_alloc(NULL, nkpt,
                                             VM_ALLOC_NORMAL |
                                             VM_ALLOC_SYSTEM |
                                             VM_ALLOC_INTERRUPT);
@@ -1966,13 +2020,13 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                        if ((nkpg->flags & PG_ZERO) == 0)
                                pmap_zero_page(paddr);
                        vm_page_flag_clear(nkpg, PG_ZERO);
-                       newpdp = (pdp_entry_t)
+                       newpd = (pdp_entry_t)
                                (paddr | PG_V | PG_RW | PG_A | PG_M);
-                       *pmap_pdpe(&kernel_pmap, kstart) = newpdp;
+                       *pmap_pd(&kernel_pmap, kstart) = newpd;
                        nkpt++;
                        continue; /* try again */
                }
-               if ((*pde & PG_V) != 0) {
+               if ((*pt & PG_V) != 0) {
                        kstart = (kstart + PAGE_SIZE * NPTEPG) &
                                 ~(PAGE_SIZE * NPTEPG - 1);
                        if (kstart - 1 >= kernel_map.max_offset) {
@@ -1985,7 +2039,7 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                /*
                 * This index is bogus, but out of the way
                 */
-               nkpg = vm_page_alloc(kptobj, nkpt,
+               nkpg = vm_page_alloc(NULL, nkpt,
                                     VM_ALLOC_NORMAL |
                                     VM_ALLOC_SYSTEM |
                                     VM_ALLOC_INTERRUPT);
@@ -1996,8 +2050,8 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                ptppaddr = VM_PAGE_TO_PHYS(nkpg);
                pmap_zero_page(ptppaddr);
                vm_page_flag_clear(nkpg, PG_ZERO);
-               newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
-               *pmap_pde(&kernel_pmap, kstart) = newpdir;
+               newpt = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
+               *pmap_pt(&kernel_pmap, kstart) = newpt;
                nkpt++;
 
                kstart = (kstart + PAGE_SIZE * NPTEPG) &
@@ -2014,8 +2068,6 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
         */
        if (update_kernel_vm_end && kernel_vm_end < kstart)
                kernel_vm_end = kstart;
-
-       vm_object_drop(kptobj);
 }
 
 /*
@@ -2054,302 +2106,546 @@ pmap_reference(pmap_t pmap)
 }
 
 /***************************************************
-* page management routines.
+ * page management routines.
  ***************************************************/
 
 /*
- * free the pv_entry back to the free list.  This function may be
- * called from an interrupt.
+ * Hold a pv without locking it
  */
-static __inline
-void
-free_pv_entry(pv_entry_t pv)
+static void
+pv_hold(pv_entry_t pv)
 {
-       atomic_add_int(&pv_entry_count, -1);
-       KKASSERT(pv_entry_count >= 0);
-       zfree(pvzone, pv);
+       u_int count;
+
+       if (atomic_cmpset_int(&pv->pv_hold, 0, 1))
+               return;
+
+       for (;;) {
+               count = pv->pv_hold;
+               cpu_ccfence();
+               if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
+                       return;
+               /* retry */
+       }
 }
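The PV_HOLD_* constants manipulated by the hold/lock primitives are declared
in the header rather than in this hunk; a plausible encoding, shown only as
an assumption to make the atomic_cmpset_int() logic easier to follow:

	/*
	 * Assumed pv->pv_hold encoding (illustrative values):
	 */
	#define PV_HOLD_LOCKED	0x80000000U	/* pv is exclusively locked */
	#define PV_HOLD_WAITING	0x40000000U	/* a thread sleeps in pv_lock() */
	#define PV_HOLD_MASK	0x3FFFFFFFU	/* low bits hold the ref count */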
 
 /*
- * get a new pv_entry, allocating a block from the system
- * when needed.  This function may be called from an interrupt.
+ * Hold and attempt to lock a pv_entry, preventing its destruction.  TRUE is
+ * returned if the pv was successfully locked, FALSE if only the hold could
+ * be acquired.  In both cases the hold is retained and the caller must
+ * dispose of the pv properly.
+ *
+ * Either the pmap->pm_spin or the related vm_page_spin (if traversing a
+ * pv list via its page) must be held by the caller.
  */
-static
-pv_entry_t
-get_pv_entry(void)
+static int
+_pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL)
 {
-       atomic_add_int(&pv_entry_count, 1);
-       if (pv_entry_high_water &&
-               (pv_entry_count > pv_entry_high_water) &&
-               (pmap_pagedaemon_waken == 0)) {
-               pmap_pagedaemon_waken = 1;
-               wakeup(&vm_pages_needed);
+       u_int count;
+
+       if (atomic_cmpset_int(&pv->pv_hold, 0, PV_HOLD_LOCKED | 1)) {
+#ifdef PMAP_DEBUG
+               pv->pv_func = func;
+               pv->pv_line = lineno;
+#endif
+               return TRUE;
+       }
+
+       for (;;) {
+               count = pv->pv_hold;
+               cpu_ccfence();
+               if ((count & PV_HOLD_LOCKED) == 0) {
+                       if (atomic_cmpset_int(&pv->pv_hold, count,
+                                             (count + 1) | PV_HOLD_LOCKED)) {
+#ifdef PMAP_DEBUG
+                               pv->pv_func = func;
+                               pv->pv_line = lineno;
+#endif
+                               return TRUE;
+                       }
+               } else {
+                       if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
+                               return FALSE;
+               }
+               /* retry */
        }
-       return zalloc(pvzone);
 }
 
 /*
- * This routine is very drastic, but can save the system
- * in a pinch.
+ * Drop a previously held pv_entry which could not be locked, allowing its
+ * destruction.
+ *
+ * Must not be called with a spinlock held as we might zfree() the pv if it
+ * is no longer associated with a pmap and this was the last hold count.
  */
-void
-pmap_collect(void)
+static void
+pv_drop(pv_entry_t pv)
 {
-       int i;
-       vm_page_t m;
-       static int warningdone=0;
+       u_int count;
 
-       if (pmap_pagedaemon_waken == 0)
+       if (atomic_cmpset_int(&pv->pv_hold, 1, 0)) {
+               if (pv->pv_pmap == NULL)
+                       zfree(pvzone, pv);
                return;
-       pmap_pagedaemon_waken = 0;
-       if (warningdone < 5) {
-               kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
-               warningdone++;
        }
 
-       for (i = 0; i < vm_page_array_size; i++) {
-               m = &vm_page_array[i];
-               if (m->wire_count || m->hold_count)
-                       continue;
-               if (vm_page_busy_try(m, TRUE) == 0) {
-                       if (m->wire_count == 0 && m->hold_count == 0) {
-                               pmap_remove_all(m);
-                       }
-                       vm_page_wakeup(m);
+       for (;;) {
+               count = pv->pv_hold;
+               cpu_ccfence();
+               KKASSERT((count & PV_HOLD_MASK) > 0);
+               KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) !=
+                        (PV_HOLD_LOCKED | 1));
+               if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) {
+                       if (count == 1 && pv->pv_pmap == NULL)
+                               zfree(pvzone, pv);
+                       return;
                }
+               /* retry */
        }
 }
-       
 
 /*
- * If it is the first entry on the list, it is actually in the header and
- * we must copy the following entry up to the header.
- *
- * Otherwise we must search the list for the entry.  In either case we
- * free the now unused entry.
- *
- * Caller must hold pmap->pm_token
+ * Find or allocate the requested PV entry, returning a locked, held pv.
+ * *isnew is set to 1 if a new entry was allocated, 0 if it already existed.
  */
 static
-int
-pmap_remove_entry(struct pmap *pmap, vm_page_t m, 
-                 vm_offset_t va, pmap_inval_info_t info)
+pv_entry_t
+_pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
 {
        pv_entry_t pv;
-       int rtval;
+       pv_entry_t pnew = NULL;
 
-       spin_lock(&pmap_spin);
-       if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
-               TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-                       if (pmap == pv->pv_pmap && va == pv->pv_va) 
-                               break;
+       spin_lock(&pmap->pm_spin);
+       for (;;) {
+               if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) {
+                       pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
+                                                       pindex);
                }
-       } else {
-               TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
-                       if (va == pv->pv_va) 
-                               break;
-               }
-       }
+               if (pv == NULL) {
+                       if (pnew == NULL) {
+                               spin_unlock(&pmap->pm_spin);
+                               pnew = zalloc(pvzone);
+                               spin_lock(&pmap->pm_spin);
+                               continue;
+                       }
+                       pnew->pv_pmap = pmap;
+                       pnew->pv_pindex = pindex;
+                       pnew->pv_hold = PV_HOLD_LOCKED | 1;
+#ifdef PMAP_DEBUG
+                       pnew->pv_func = func;
+                       pnew->pv_line = lineno;
+#endif
+                       pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew);
+                       atomic_add_long(&pmap->pm_stats.resident_count, 1);
+                       spin_unlock(&pmap->pm_spin);
+                       *isnew = 1;
+                       return(pnew);
+               }
+               if (pnew) {
+                       spin_unlock(&pmap->pm_spin);
+                       zfree(pvzone, pnew);
+                       pnew = NULL;
+                       spin_lock(&pmap->pm_spin);
+                       continue;
+               }
+               if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
+                       spin_unlock(&pmap->pm_spin);
+                       *isnew = 0;
+                       return(pv);
+               }
+               spin_unlock(&pmap->pm_spin);
+               _pv_lock(pv PMAP_DEBUG_COPY);
+               if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) {
+                       *isnew = 0;
+                       return(pv);
+               }
+               pv_put(pv);
+               spin_lock(&pmap->pm_spin);
+       }
 
-       rtval = 0;
-       KKASSERT(pv);
 
-       TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-       m->md.pv_generation++;
-       m->md.pv_list_count--;
-       vm_page_spin_lock(m);
-       if (m->object)
-               atomic_add_int(&m->object->agg_pv_list_count, -1);
-       vm_page_spin_unlock(m);
-       KKASSERT(m->md.pv_list_count >= 0);
-       if (TAILQ_EMPTY(&m->md.pv_list))
-               vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
-       TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
-       ++pmap->pm_generation;
-       spin_unlock(&pmap_spin);
+}
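A short sketch of the intended find-or-allocate pattern; pv_alloc() is
assumed to be the usual wrapper macro that supplies the PMAP_DEBUG
file/line arguments to _pv_alloc():

	int isnew;
	pv_entry_t pt_pv;

	pt_pv = pv_alloc(pmap, pmap_pt_pindex(va), &isnew);
	if (isnew) {
		/* first use: allocate the page table page and set pv_m */
	} else {
		/* already instantiated: pv_m and the mapping are valid */
	}
	/* ... operate on the locked pv ... */
	pv_put(pt_pv);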
 
-       rtval = pmap_unwire_pte_hold(pmap, va, pv->pv_ptem, info);
-       free_pv_entry(pv);
+/*
+ * Find the requested PV entry, returning a locked+held pv or NULL
+ */
+static
+pv_entry_t
+_pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL)
+{
+       pv_entry_t pv;
 
-       return rtval;
+       spin_lock(&pmap->pm_spin);
+       for (;;) {
+               /*
+                * Shortcut cache
+                */
+               if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex) {
+                       pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
+                                                       pindex);
+               }
+               if (pv == NULL) {
+                       spin_unlock(&pmap->pm_spin);
+                       return NULL;
+               }
+               if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
+                       pv_cache(pv, pindex);
+                       spin_unlock(&pmap->pm_spin);
+                       return(pv);
+               }
+               spin_unlock(&pmap->pm_spin);
+               _pv_lock(pv PMAP_DEBUG_COPY);
+               if (pv->pv_pmap == pmap && pv->pv_pindex == pindex)
+                       return(pv);
+               pv_put(pv);
+               spin_lock(&pmap->pm_spin);
+       }
 }
 
 /*
- * Create a pv entry for page at pa for (pmap, va).
+ * Lookup, hold, and attempt to lock (pmap,pindex).
+ *
+ * If the entry does not exist, NULL is returned and *errorp is set to 0.
  *
- * Caller must hold pmap token
+ * If the entry exists and could be successfully locked, it is returned and
+ * *errorp is set to 0.
+ *
+ * If the entry exists but could NOT be successfully locked it is returned
+ * held and *errorp is set to 1.
  */
 static
-void
-pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
+pv_entry_t
+pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp)
 {
        pv_entry_t pv;
 
-       pv = get_pv_entry();
-       pv->pv_va = va;
-       pv->pv_pmap = pmap;
-       pv->pv_ptem = mpte;
-
-       spin_lock(&pmap_spin);
-       TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
-       TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-       m->md.pv_generation++;
-       m->md.pv_list_count++;
-       vm_page_spin_lock(m);
-       if (m->object)
-               atomic_add_int(&m->object->agg_pv_list_count, 1);
-       vm_page_spin_unlock(m);
-       pmap->pm_generation++;
-       spin_unlock(&pmap_spin);
+       spin_lock(&pmap->pm_spin);
+       if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
+               pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
+       if (pv == NULL) {
+               spin_unlock(&pmap->pm_spin);
+               *errorp = 0;
+               return NULL;
+       }
+       if (pv_hold_try(pv)) {
+               pv_cache(pv, pindex);
+               spin_unlock(&pmap->pm_spin);
+               *errorp = 0;
+               return(pv);     /* lock succeeded */
+       }
+       spin_unlock(&pmap->pm_spin);
+       *errorp = 1;
+       return (pv);            /* lock failed */
 }
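The held-but-not-locked error return lets a caller that already holds
another pv lock avoid a lock-order deadlock.  A sketch of the pattern, which
pmap_scan() below applies when it holds pt_pv and wants the pte_pv:

	pte_pv = pv_get_try(pmap, pindex, &error);
	if (error) {
		pv_put(pt_pv);		/* release the conflicting lock */
		pv_lock(pte_pv);	/* now safe to block */
		pv_put(pte_pv);		/* waited it out; re-lookup and retry */
	}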
 
 /*
- * pmap_remove_pte: do the things to unmap a page in a process
- *
- * Caller must hold pmap token
- * Caller must hold pmap object
+ * Find the requested PV entry, returning a held pv or NULL
  */
 static
-int
-pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
-               pmap_inval_info_t info)
+pv_entry_t
+pv_find(pmap_t pmap, vm_pindex_t pindex)
 {
-       pt_entry_t oldpte;
-       vm_page_t m;
-       vm_page_t mpte;
-       vm_pindex_t ptepindex;
+       pv_entry_t pv;
 
-       ASSERT_LWKT_TOKEN_HELD(&pmap->pm_token);
+       spin_lock(&pmap->pm_spin);
 
-       pmap_inval_interlock(info, pmap, va);
-       oldpte = pte_load_clear(ptq);
-       pmap_inval_deinterlock(info, pmap);
-       if (oldpte & PG_W)
-               pmap->pm_stats.wired_count -= 1;
-       /*
-        * Machines that don't support invlpg, also don't support
-        * PG_G.  XXX PG_G is disabled for SMP so don't worry about
-        * the SMP case.
-        */
-       if (oldpte & PG_G)
-               cpu_invlpg((void *)va);
-       KKASSERT(pmap->pm_stats.resident_count > 0);
-       --pmap->pm_stats.resident_count;
-       if (oldpte & PG_MANAGED) {
-               m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
-               if (oldpte & PG_M) {
-#if defined(PMAP_DIAGNOSTIC)
-                       if (pmap_nw_modified((pt_entry_t) oldpte)) {
-                               kprintf(
-       "pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
-                                   va, oldpte);
+       if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
+               pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
+       if (pv == NULL) {
+               spin_unlock(&pmap->pm_spin);
+               return NULL;
+       }
+       pv_hold(pv);
+       pv_cache(pv, pindex);
+       spin_unlock(&pmap->pm_spin);
+       return(pv);
+}
+
+/*
+ * Lock a held pv, keeping the hold count
+ */
+static
+void
+_pv_lock(pv_entry_t pv PMAP_DEBUG_DECL)
+{
+       u_int count;
+
+       for (;;) {
+               count = pv->pv_hold;
+               cpu_ccfence();
+               if ((count & PV_HOLD_LOCKED) == 0) {
+                       if (atomic_cmpset_int(&pv->pv_hold, count,
+                                             count | PV_HOLD_LOCKED)) {
+#ifdef PMAP_DEBUG
+                               pv->pv_func = func;
+                               pv->pv_line = lineno;
+#endif
+                               return;
                        }
+                       continue;
+               }
+               tsleep_interlock(pv, 0);
+               if (atomic_cmpset_int(&pv->pv_hold, count,
+                                     count | PV_HOLD_WAITING)) {
+#ifdef PMAP_DEBUG
+                       kprintf("pv waiting on %s:%d\n",
+                                       pv->pv_func, pv->pv_line);
 #endif
-                       if (pmap_track_modified(va))
-                               vm_page_dirty(m);
+                       tsleep(pv, PINTERLOCKED, "pvwait", hz);
                }
-               if (oldpte & PG_A)
-                       vm_page_flag_set(m, PG_REFERENCED);
-               return pmap_remove_entry(pmap, m, va, info);
+               /* retry */
        }
+}
 
-       /*
-        * Unmanaged pages in userspace still wire the PT page, we have
-        * to look up the mpte for the PDE page and pass it in.
-        */
-       if (va < VM_MAX_USER_ADDRESS) {
-               ptepindex = pmap_pde_pindex(va);
-               mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex);
-               KKASSERT(mpte);
-       } else {
-               mpte = NULL;
+/*
+ * Unlock a held and locked pv, keeping the hold count.
+ */
+static
+void
+pv_unlock(pv_entry_t pv)
+{
+       u_int count;
+
+       if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 1, 1))
+               return;
+
+       for (;;) {
+               count = pv->pv_hold;
+               cpu_ccfence();
+               KKASSERT((count & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >=
+                        (PV_HOLD_LOCKED | 1));
+               if (atomic_cmpset_int(&pv->pv_hold, count,
+                                     count &
+                                     ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) {
+                       if (count & PV_HOLD_WAITING)
+                               wakeup(pv);
+                       break;
+               }
        }
-       return pmap_unwire_pte_hold(pmap, va, mpte, info);
 }
 
 /*
- * Remove a single page from a process address space.
- *
- * This function may not be called from an interrupt if the pmap is
- * not kernel_pmap.
+ * Unlock and drop a pv.  If the pv is no longer associated with a pmap
+ * and the hold count drops to zero we will free it.
  *
- * Caller must hold pmap->pm_token
- * Caller must hold pmap object
+ * Caller should not hold any spin locks.  We are protected from hold races
+ * by virtue of holds occurring only with a pmap_spin or vm_page_spin
+ * lock held.  A pv cannot be located otherwise.
  */
 static
 void
-pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
+pv_put(pv_entry_t pv)
 {
-       pt_entry_t *pte;
+       if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 1, 0)) {
+               if (pv->pv_pmap == NULL)
+                       zfree(pvzone, pv);
+               return;
+       }
+       pv_unlock(pv);
+       pv_drop(pv);
+}
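For reference, the lookup primitives above pair up as follows (a summary
inferred from the code above):

	/*
	 *	pv_find()	->  pv_drop()	held, never locked
	 *	pv_get()	->  pv_put()	held + locked, may block
	 *	pv_get_try()	->  pv_put()	locked only when *errorp == 0,
	 *					otherwise held (see pv_lock())
	 *	_pv_alloc()	->  pv_put() or pv_free()
	 */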
 
-       ASSERT_LWKT_TOKEN_HELD(&pmap->pm_token);
+/*
+ * Unlock, drop, and free a pv, destroying it.  The pv is removed from its
+ * pmap.  Any pte operations must have already been completed.
+ */
+static
+void
+pv_free(pv_entry_t pv)
+{
+       pmap_t pmap;
 
-       pte = pmap_pte(pmap, va);
-       if (pte == NULL)
-               return;
-       if ((*pte & PG_V) == 0)
+       KKASSERT(pv->pv_m == NULL);
+       if ((pmap = pv->pv_pmap) != NULL) {
+               spin_lock(&pmap->pm_spin);
+               pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
+               if (pmap->pm_pvhint == pv)
+                       pmap->pm_pvhint = NULL;
+               atomic_add_long(&pmap->pm_stats.resident_count, -1);
+               pv->pv_pmap = NULL;
+               pv->pv_pindex = 0;
+               spin_unlock(&pmap->pm_spin);
+       }
+       pv_put(pv);
+}
+
+/*
+ * This routine is very drastic, but can save the system
+ * in a pinch.
+ */
+void
+pmap_collect(void)
+{
+       int i;
+       vm_page_t m;
+       static int warningdone=0;
+
+       if (pmap_pagedaemon_waken == 0)
                return;
-       pmap_remove_pte(pmap, pte, va, info);
+       pmap_pagedaemon_waken = 0;
+       if (warningdone < 5) {
+               kprintf("pmap_collect: collecting pv entries -- "
+                       "suggest increasing PMAP_SHPGPERPROC\n");
+               warningdone++;
+       }
+
+       for (i = 0; i < vm_page_array_size; i++) {
+               m = &vm_page_array[i];
+               if (m->wire_count || m->hold_count)
+                       continue;
+               if (vm_page_busy_try(m, TRUE) == 0) {
+                       if (m->wire_count == 0 && m->hold_count == 0) {
+                               pmap_remove_all(m);
+                       }
+                       vm_page_wakeup(m);
+               }
+       }
 }
 
 /*
- * Remove the given range of addresses from the specified map.
+ * Scan the pmap for active page table entries and issue a callback.
+ * The callback must dispose of pte_pv.
  *
- * It is assumed that the start and end are properly rounded to the page size.
+ * NOTE: Unmanaged page table entries will not have a pte_pv.
  *
- * This function may not be called from an interrupt if the pmap is not
- * kernel_pmap.
+ * NOTE: Kernel page table entries will not have a pt_pv.  That is, wiring
+ *      counts are not tracked in kernel page table pages.
+ *
+ * It is assumed that the start and end are properly rounded to the page size.
  */
-void
-pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
-{
+static void
+pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva,
+         void (*func)(pmap_t, struct pmap_inval_info *,
+                      pv_entry_t, pv_entry_t, vm_offset_t,
+                      pt_entry_t *, void *),
+         void *arg)
+{
+       pv_entry_t pdp_pv;      /* A page directory page PV */
+       pv_entry_t pd_pv;       /* A page directory PV */
+       pv_entry_t pt_pv;       /* A page table PV */
+       pv_entry_t pte_pv;      /* A page table entry PV */
+       pt_entry_t *ptep;
        vm_offset_t va_next;
-       pml4_entry_t *pml4e;
-       pdp_entry_t *pdpe;
-       pd_entry_t ptpaddr, *pde;
-       pt_entry_t *pte;
        struct pmap_inval_info info;
+       int error;
 
        if (pmap == NULL)
                return;
 
-       vm_object_hold(pmap->pm_pteobj);
+       /*
+        * Hold the token for stability; if the pmap is empty we have nothing
+        * to do.
+        */
        lwkt_gettoken(&pmap->pm_token);
+#if 0
        if (pmap->pm_stats.resident_count == 0) {
                lwkt_reltoken(&pmap->pm_token);
-               vm_object_drop(pmap->pm_pteobj);
                return;
        }
+#endif
 
        pmap_inval_init(&info);
 
        /*
-        * special handling of removing one page.  a very
-        * common operation and easy to short circuit some
-        * code.
+        * Special handling for removing a single page, which is a very
+        * common operation that is easy to short-cut.
+        *
+        * NOTE: Locks must be ordered bottom-up: pte, pt, pd, pdp, pml4.
         */
        if (sva + PAGE_SIZE == eva) {
-               pde = pmap_pde(pmap, sva);
-               if (pde && (*pde & PG_PS) == 0) {
-                       pmap_remove_page(pmap, sva, &info);
-                       pmap_inval_done(&info);
-                       lwkt_reltoken(&pmap->pm_token);
-                       vm_object_drop(pmap->pm_pteobj);
-                       return;
+               if (sva >= VM_MAX_USER_ADDRESS) {
+                       /*
+                        * Kernel mappings do not track wire counts on
+                        * page table pages.
+                        */
+                       pt_pv = NULL;
+                       pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
+                       ptep = vtopte(sva);
+               } else {
+                       /*
+                        * User mappings may or may not have a pte_pv but
+                        * will always have a pt_pv if the page is present.
+                        */
+                       pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
+                       pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
+                       if (pt_pv == NULL) {
+                               KKASSERT(pte_pv == NULL);
+                               goto fast_skip;
+                       }
+                       ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
                }
+               if (*ptep == 0) {
+                       KKASSERT(pte_pv == NULL);
+               } else if (pte_pv) {
+                       KKASSERT((*ptep & (PG_MANAGED|PG_V)) ==
+                                (PG_MANAGED|PG_V));
+                       func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
+               } else {
+                       KKASSERT((*ptep & (PG_MANAGED|PG_V)) ==
+                                PG_V);
+                       func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
+               }
+               if (pt_pv)
+                       pv_put(pt_pv);
+fast_skip:
+               pmap_inval_done(&info);
+               lwkt_reltoken(&pmap->pm_token);
+               return;
        }
 
+       /*
+        * NOTE: kernel mappings do not track page table pages, only
+        *       terminal pages.
+        *
+        * NOTE: Locks must be ordered bottom-up: pte, pt, pd, pdp, pml4.
+        *       However, for the scan to be efficient we try to
+        *       cache items top-down.
+        */
+       pdp_pv = NULL;
+       pd_pv = NULL;
+       pt_pv = NULL;
+
        for (; sva < eva; sva = va_next) {
-               pml4e = pmap_pml4e(pmap, sva);
-               if ((*pml4e & PG_V) == 0) {
+               lwkt_yield();
+               if (sva >= VM_MAX_USER_ADDRESS) {
+                       if (pt_pv) {
+                               pv_put(pt_pv);
+                               pt_pv = NULL;
+                       }
+                       goto kernel_skip;
+               }
+
+               /*
+                * PDP cache
+                */
+               if (pdp_pv == NULL) {
+                       pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva));
+               } else if (pdp_pv->pv_pindex != pmap_pdp_pindex(sva)) {
+                       pv_put(pdp_pv);
+                       pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva));
+               }
+               if (pdp_pv == NULL) {
                        va_next = (sva + NBPML4) & ~PML4MASK;
                        if (va_next < sva)
                                va_next = eva;
                        continue;
                }
 
-               pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
-               if ((*pdpe & PG_V) == 0) {
+               /*
+                * PD cache
+                */
+               if (pd_pv == NULL) {
+                       if (pdp_pv) {
+                               pv_put(pdp_pv);
+                               pdp_pv = NULL;
+                       }
+                       pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
+               } else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) {
+                       pv_put(pd_pv);
+                       if (pdp_pv) {
+                               pv_put(pdp_pv);
+                               pdp_pv = NULL;
+                       }
+                       pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
+               }
+               if (pd_pv == NULL) {
                        va_next = (sva + NBPDP) & ~PDPMASK;
                        if (va_next < sva)
                                va_next = eva;
@@ -2357,55 +2653,186 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
                }
 
                /*
-                * Calculate index for next page table.
+                * PT cache
                 */
-               va_next = (sva + NBPDR) & ~PDRMASK;
-               if (va_next < sva)
-                       va_next = eva;
-
-               pde = pmap_pdpe_to_pde(pdpe, sva);
-               ptpaddr = *pde;
+               if (pt_pv == NULL) {
+                       if (pdp_pv) {
+                               pv_put(pdp_pv);
+                               pdp_pv = NULL;
+                       }
+                       if (pd_pv) {
+                               pv_put(pd_pv);
+                               pd_pv = NULL;
+                       }
+                       pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
+               } else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) {
+                       if (pdp_pv) {
+                               pv_put(pdp_pv);
+                               pdp_pv = NULL;
+                       }
+                       if (pd_pv) {
+                               pv_put(pd_pv);
+                               pd_pv = NULL;
+                       }
+                       pv_put(pt_pv);
+                       pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
+               }
 
                /*
-                * Weed out invalid mappings.
+                * We will scan or skip a page table page so adjust va_next
+                * either way.
                 */
-               if (ptpaddr == 0)
+               if (pt_pv == NULL) {
+                       va_next = (sva + NBPDR) & ~PDRMASK;
+                       if (va_next < sva)
+                               va_next = eva;
                        continue;
+               }
 
                /*
-                * Check for large page.
+                * From this point in the loop, a non-NULL pt_pv means we
+                * are in UVM, while a NULL pt_pv means we are in KVM.
                 */
-               if ((ptpaddr & PG_PS) != 0) {
-                       /* JG FreeBSD has more complex treatment here */
-                       pmap_inval_interlock(&info, pmap, -1);
-                       *pde = 0;
-                       pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
-                       pmap_inval_deinterlock(&info, pmap);
-                       continue;
-               }
+kernel_skip:
+               va_next = (sva + NBPDR) & ~PDRMASK;
+               if (va_next < sva)
+                       va_next = eva;
 
                /*
                 * Limit our scan to either the end of the va represented
                 * by the current page table page, or to the end of the
                 * range being removed.
+                *
+                * Scan the page table for pages.  Some pages may not be
+                * managed (might not have a pv_entry).
+                *
+                * There is no page table management for kernel pages so
+                * pt_pv will be NULL in that case, but otherwise pt_pv
+                * is non-NULL, locked, and referenced.
                 */
                if (va_next > eva)
                        va_next = eva;
 
-               /*
-                * NOTE: pmap_remove_pte() can block.
-                */
-               for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
-                   sva += PAGE_SIZE) {
-                       if (*pte == 0)
+               if (pt_pv)
+                       ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
+               else
+                       ptep = vtopte(sva);
+
+               while (sva < va_next) {
+                       if (*ptep == 0) {
+                               /* XXX remove me */
+                               pte_pv = pv_find(pmap, pmap_pte_pindex(sva));
+                               KKASSERT(pte_pv == NULL);
+                               sva += PAGE_SIZE;
+                               ++ptep;
                                continue;
-                       if (pmap_remove_pte(pmap, pte, sva, &info))
-                               break;
+                       }
+
+                       /*
+                        * We need a locked pte_pv as well and may have to
+                        * loop to retry if we can't get it non-blocking
+                        * while pt_pv is held locked.
+                        *
+                        * This is a bit complicated because once we release
+                        * the pt_pv our ptep is no longer valid, so we have
+                        * to cycle the whole thing.
+                        */
+                       if (pt_pv) {
+                               pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva),
+                                                   &error);
+                               if (error) {
+                                       kprintf("x");
+                                       if (pdp_pv) {
+                                               pv_put(pdp_pv);
+                                               pdp_pv = NULL;
+                                       }
+                                       if (pd_pv) {
+                                               pv_put(pd_pv);
+                                               pd_pv = NULL;
+                                       }
+                                       pv_put(pt_pv);   /* must be non-NULL */
+                                       pt_pv = NULL;
+                                       pv_lock(pte_pv); /* safe to block now */
+                                       pv_put(pte_pv);
+                                       pte_pv = NULL;
+                                       pt_pv = pv_get(pmap,
+                                                      pmap_pt_pindex(sva));
+                                       continue;
+                               }
+                       } else {
+                               pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
+                       }
+
+                       /*
+                        * Ready for the callback
+                        */
+                       if (pte_pv) {
+                               KKASSERT((*ptep & (PG_MANAGED|PG_V)) ==
+                                        (PG_MANAGED|PG_V));
+                               func(pmap, &info, pte_pv, pt_pv, sva,
+                                    ptep, arg);
+                       } else {
+                               KKASSERT((*ptep & (PG_MANAGED|PG_V)) ==
+                                        PG_V);
+                               func(pmap, &info, pte_pv, pt_pv, sva,
+                                    ptep, arg);
+                       }
+                       pte_pv = NULL;  /* eaten by callback */
+                       sva += PAGE_SIZE;
+                       ++ptep;
                }
        }
+       if (pdp_pv) {
+               pv_put(pdp_pv);
+               pdp_pv = NULL;
+       }
+       if (pd_pv) {
+               pv_put(pd_pv);
+               pd_pv = NULL;
+       }
+       if (pt_pv) {
+               pv_put(pt_pv);
+               pt_pv = NULL;
+       }
        pmap_inval_done(&info);
        lwkt_reltoken(&pmap->pm_token);
-       vm_object_drop(pmap->pm_pteobj);
+}
+
+void
+pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
+{
+       pmap_scan(pmap, sva, eva, pmap_remove_callback, NULL);
+}
+
+static void
+pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
+                    pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
+                    pt_entry_t *ptep, void *arg __unused)
+{
+       pt_entry_t pte;
+
+       if (pte_pv) {
+               /*
+                * This will also drop pt_pv's wire_count. Note that
+                * terminal pages are not wired based on mmu presence.
+                */
+               pmap_remove_pv_pte(pte_pv, pt_pv, info);
+               pmap_remove_pv_page(pte_pv, 0);
+               pv_free(pte_pv);
+       } else {
+               /*
+                * pt_pv's wire_count is still bumped by unmanaged pages
+                * so we must decrement it manually.
+                */
+               pmap_inval_interlock(info, pmap, va);
+               pte = pte_load_clear(ptep);
+               pmap_inval_deinterlock(info, pmap);
+               if (pte & PG_W)
+                       atomic_add_long(&pmap->pm_stats.wired_count, -1);
+               atomic_add_long(&pmap->pm_stats.resident_count, -1);
+               if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m))
+                       panic("pmap_remove: insufficient wirecount");
+       }
 }
 
 /*
@@ -2419,91 +2846,35 @@ void
 pmap_remove_all(vm_page_t m)
 {
        struct pmap_inval_info info;
-       pt_entry_t *pte, tpte;
        pv_entry_t pv;
-       struct pmap *pmap;
 
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return;
 
        pmap_inval_init(&info);
-       spin_lock(&pmap_spin);
+       vm_page_spin_lock(m);
        while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
-               /*
-                * We have to be holding the pmap token to interlock
-                * the pte destruction and pv removal.  XXX need hold on
-                * pmap.
-                */
-               pmap = pv->pv_pmap;
-               spin_unlock(&pmap_spin);
-               lwkt_gettoken(&pmap->pm_token); /* XXX hold race */
-               spin_lock(&pmap_spin);
-               if (pv != TAILQ_FIRST(&m->md.pv_list)) {
-                       spin_unlock(&pmap_spin);
-                       lwkt_reltoken(&pmap->pm_token);
-                       spin_lock(&pmap_spin);
-                       continue;
+               KKASSERT(pv->pv_m == m);
+               if (pv_hold_try(pv)) {
+                       vm_page_spin_unlock(m);
+               } else {
+                       vm_page_spin_unlock(m);
+                       pv_lock(pv);
+                       if (pv->pv_m != m) {
+                               pv_put(pv);
+                               vm_page_spin_lock(m);
+                               continue;
+                       }
                }
-
                /*
-                * Remove the pv
+                * Holding no spinlocks, pv is locked.
                 */
-               TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-               TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
-               m->md.pv_generation++;
-               m->md.pv_list_count--;
+               pmap_remove_pv_pte(pv, NULL, &info);
+               pmap_remove_pv_page(pv, 0);
+               pv_free(pv);
                vm_page_spin_lock(m);
-               if (m->object)
-                       atomic_add_int(&m->object->agg_pv_list_count, -1);
-               vm_page_spin_unlock(m);
-               KKASSERT(m->md.pv_list_count >= 0);
-               ++pv->pv_pmap->pm_generation;
-               spin_unlock(&pmap_spin);
-
-               /*
-                * pv is now isolated
-                */
-               KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
-               --pv->pv_pmap->pm_stats.resident_count;
-
-               pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
-               pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
-               tpte = pte_load_clear(pte);
-               KKASSERT(tpte & PG_MANAGED);
-               if (tpte & PG_W)
-                       pv->pv_pmap->pm_stats.wired_count--;
-               pmap_inval_deinterlock(&info, pv->pv_pmap);
-               if (tpte & PG_A)
-                       vm_page_flag_set(m, PG_REFERENCED);
-
-               /*
-                * Update the vm_page_t clean and reference bits.
-                */
-               if (tpte & PG_M) {
-#if defined(PMAP_DIAGNOSTIC)
-                       if (pmap_nw_modified(tpte)) {
-                               kprintf("pmap_remove_all: modified page not "
-                                       "writable: va: 0x%lx, pte: 0x%lx\n",
-                                       pv->pv_va, tpte);
-                       }
-#endif
-                       if (pmap_track_modified(pv->pv_va))
-                               vm_page_dirty(m); /* XXX races(m) */
-               }
-
-               spin_lock(&pmap_spin);
-               if (TAILQ_EMPTY(&m->md.pv_list))
-                       vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
-               spin_unlock(&pmap_spin);
-
-               pmap_unwire_pte_hold(pv->pv_pmap, pv->pv_va,
-                                    pv->pv_ptem, &info);
-               lwkt_reltoken(&pv->pv_pmap->pm_token);
-
-               free_pv_entry(pv);
-               spin_lock(&pmap_spin);
        }
-       spin_unlock(&pmap_spin);
+       vm_page_spin_unlock(m);
        KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
        pmap_inval_done(&info);
 }
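
The loop above shows the new per-PV locking discipline: the page's pv list is
walked under the vm_page spin lock, each pv is taken with the non-blocking
pv_hold_try(), and only if that fails does the code drop the spin lock, block
in pv_lock(), and revalidate pv->pv_m before continuing.  The fragment below
is a minimal user-space analogue of that pattern, written against pthreads
purely for illustration; the struct and function names are stand-ins, not the
kernel pv API, and nothing here is ever freed so no hold count is modeled.

/*
 * Hold-try / drop-and-block / revalidate pattern, user-space sketch.
 */
#include <pthread.h>
#include <stddef.h>
#include <sys/queue.h>

struct xpv {                            /* stands in for pv_entry */
        TAILQ_ENTRY(xpv) link;
        pthread_mutex_t lock;           /* stands in for the per-PV lock */
        void *owner;                    /* stands in for pv->pv_m */
};

struct xpage {                          /* stands in for vm_page */
        pthread_spinlock_t spin;        /* stands in for the page spin lock */
        TAILQ_HEAD(, xpv) pv_list;
};

void
xpage_remove_all(struct xpage *m)
{
        struct xpv *pv;

        pthread_spin_lock(&m->spin);
        while ((pv = TAILQ_FIRST(&m->pv_list)) != NULL) {
                if (pthread_mutex_trylock(&pv->lock) == 0) {
                        /* fast path: got the pv without blocking */
                        pthread_spin_unlock(&m->spin);
                } else {
                        /* slow path: drop the spin lock, block, revalidate */
                        pthread_spin_unlock(&m->spin);
                        pthread_mutex_lock(&pv->lock);
                        if (pv->owner != m) {
                                /* pv was torn down while we slept; rescan */
                                pthread_mutex_unlock(&pv->lock);
                                pthread_spin_lock(&m->spin);
                                continue;
                        }
                }
                /*
                 * pv is locked and no spin locks are held; the kernel does
                 * the pte teardown here.  Re-take the page lock to unlink.
                 */
                pthread_spin_lock(&m->spin);
                TAILQ_REMOVE(&m->pv_list, pv, link);
                pv->owner = NULL;
                pthread_mutex_unlock(&pv->lock);
                /* spin lock stays held for the next iteration */
        }
        pthread_spin_unlock(&m->spin);
}

In the kernel the same shape is presumably backed by the hold count managed
by pv_hold_try()/pv_lock()/pv_put(), which lets a blocked locker wait safely
on a pv that another cpu may be tearing down.
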
@@ -2520,192 +2891,156 @@ pmap_remove_all(vm_page_t m)
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
-       vm_offset_t va_next;
-       pml4_entry_t *pml4e;
-       pdp_entry_t *pdpe;
-       pd_entry_t ptpaddr, *pde;
-       pt_entry_t *pte;
-       pmap_inval_info info;
-
        /* JG review for NX */
 
        if (pmap == NULL)
                return;
-
        if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
                pmap_remove(pmap, sva, eva);
                return;
        }
-
        if (prot & VM_PROT_WRITE)
                return;
+       pmap_scan(pmap, sva, eva, pmap_protect_callback, &prot);
+}
 
-       lwkt_gettoken(&pmap->pm_token);
-       pmap_inval_init(&info);
-
-       for (; sva < eva; sva = va_next) {
-               pml4e = pmap_pml4e(pmap, sva);
-               if ((*pml4e & PG_V) == 0) {
-                       va_next = (sva + NBPML4) & ~PML4MASK;
-                       if (va_next < sva)
-                               va_next = eva;
-                       continue;
-               }
-
-               pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
-               if ((*pdpe & PG_V) == 0) {
-                       va_next = (sva + NBPDP) & ~PDPMASK;
-                       if (va_next < sva)
-                               va_next = eva;
-                       continue;
-               }
-
-               va_next = (sva + NBPDR) & ~PDRMASK;
-               if (va_next < sva)
-                       va_next = eva;
-
-               pde = pmap_pdpe_to_pde(pdpe, sva);
-               ptpaddr = *pde;
-
-               /*
-                * Check for large page.
-                */
-               if ((ptpaddr & PG_PS) != 0) {
-                       pmap_inval_interlock(&info, pmap, -1);
-                       *pde &= ~(PG_M|PG_RW);
-                       pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
-                       pmap_inval_deinterlock(&info, pmap);
-                       continue;
-               }
-
-               /*
-                * Weed out invalid mappings. Note: we assume that the page
-                * directory table is always allocated, and in kernel virtual.
-                */
-               if (ptpaddr == 0)
-                       continue;
-
-               if (va_next > eva)
-                       va_next = eva;
-
-               for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
-                    sva += PAGE_SIZE) {
-                       pt_entry_t pbits;
-                       pt_entry_t cbits;
-                       vm_page_t m;
+static
+void
+pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
+                     pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
+                     pt_entry_t *ptep, void *arg __unused)
+{
+       pt_entry_t pbits;
+       pt_entry_t cbits;
+       vm_page_t m;
 
-                       /*
-                        * XXX non-optimal.
-                        */
-                       pmap_inval_interlock(&info, pmap, sva);
+       /*
+        * XXX non-optimal.
+        */
+       pmap_inval_interlock(info, pmap, va);
 again:
-                       pbits = *pte;
-                       cbits = pbits;
-                       if ((pbits & PG_V) == 0) {
-                               pmap_inval_deinterlock(&info, pmap);
-                               continue;
-                       }
-                       if (pbits & PG_MANAGED) {
-                               m = NULL;
-                               if (pbits & PG_A) {
+       pbits = *ptep;
+       cbits = pbits;
+       if (pte_pv) {
+               m = NULL;
+               if (pbits & PG_A) {
+                       m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
+                       KKASSERT(m == pte_pv->pv_m);
+                       vm_page_flag_set(m, PG_REFERENCED);
+                       cbits &= ~PG_A;
+               }
+               if (pbits & PG_M) {
+                       if (pmap_track_modified(pte_pv->pv_pindex)) {
+                               if (m == NULL)
                                        m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
-                                       vm_page_flag_set(m, PG_REFERENCED);
-                                       cbits &= ~PG_A;
-                               }
-                               if (pbits & PG_M) {
-                                       if (pmap_track_modified(sva)) {
-                                               if (m == NULL)
-                                                       m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
-                                               vm_page_dirty(m);
-                                               cbits &= ~PG_M;
-                                       }
-                               }
-                       }
-                       cbits &= ~PG_RW;
-                       if (pbits != cbits &&
-                           !atomic_cmpset_long(pte, pbits, cbits)) {
-                               goto again;
+                               vm_page_dirty(m);
+                               cbits &= ~PG_M;
                        }
-                       pmap_inval_deinterlock(&info, pmap);
                }
        }
-       pmap_inval_done(&info);
-       lwkt_reltoken(&pmap->pm_token);
+       cbits &= ~PG_RW;
+       if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
+               goto again;
+       }
+       pmap_inval_deinterlock(info, pmap);
+       if (pte_pv)
+               pv_put(pte_pv);
 }
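
pmap_protect_callback() downgrades the pte with a compare-and-swap loop (the
again: label) so that accessed/modified bits the MMU may set between the load
and the store are never silently dropped.  Below is a minimal, self-contained
C11 sketch of that retry pattern; the bit values are placeholders for
illustration, not the PG_* constants from pmap.h, and the selective PG_A/PG_M
handling of the real callback is collapsed into a single mask.

/* CAS retry loop for downgrading a pte-like word (C11 sketch). */
#include <stdatomic.h>
#include <stdint.h>

#define X_RW    0x002u          /* placeholder for PG_RW */
#define X_A     0x020u          /* placeholder for PG_A  */
#define X_M     0x040u          /* placeholder for PG_M  */

/*
 * Clear the writable bit and report whether A/M were set so the caller
 * can propagate them to the vm_page before they are lost.
 */
void
downgrade_pte(_Atomic uint64_t *ptep, int *was_accessed, int *was_modified)
{
        uint64_t pbits, cbits;

        do {
                pbits = atomic_load(ptep);
                cbits = pbits & ~(uint64_t)(X_RW | X_A | X_M);
                *was_accessed = (pbits & X_A) != 0;
                *was_modified = (pbits & X_M) != 0;
                /* retry if another agent changed the word under us */
        } while (!atomic_compare_exchange_weak(ptep, &pbits, cbits));
}
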
 
 /*
- *     Insert the given physical page (p) at
- *     the specified virtual address (v) in the
- *     target physical map with the protection requested.
- *
- *     If specified, the page will be wired down, meaning
- *     that the related pte can not be reclaimed.
+ * Insert the vm_page (m) at the virtual address (va), replacing any prior
+ * mapping at that address.  Set protection and wiring as requested.
  *
- *     NB:  This is the only routine which MAY NOT lazy-evaluate
- *     or lose information.  That is, this routine must actually
- *     insert this page into the given map NOW.
+ * NOTE: This routine MUST insert the page into the pmap now; it cannot
+ *      lazy-evaluate.
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
           boolean_t wired)
-{
-       vm_paddr_t pa;
-       pd_entry_t *pde;
-       pt_entry_t *pte;
+{
+       pmap_inval_info info;
+       pv_entry_t pt_pv;       /* page table */
+       pv_entry_t pte_pv;      /* page table entry */
+       pt_entry_t *ptep;
        vm_paddr_t opa;
        pt_entry_t origpte, newpte;
-       vm_page_t mpte;
-       pmap_inval_info info;
+       vm_paddr_t pa;
 
        if (pmap == NULL)
                return;
-
        va = trunc_page(va);
 #ifdef PMAP_DIAGNOSTIC
        if (va >= KvaEnd)
                panic("pmap_enter: toobig");
        if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
-               panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
+               panic("pmap_enter: invalid to pmap_enter page table "
+                     "pages (va: 0x%lx)", va);
 #endif
        if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
-               kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n");
+               kprintf("Warning: pmap_enter called on UVA with "
+                       "kernel_pmap\n");
 #ifdef DDB
                db_print_backtrace();
 #endif
        }
        if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
-               kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n");
+               kprintf("Warning: pmap_enter called on KVA without"
+                       "kernel_pmap\n");
 #ifdef DDB
                db_print_backtrace();
 #endif
        }
 
-       vm_object_hold(pmap->pm_pteobj);
-       lwkt_gettoken(&pmap->pm_token);
-
        /*
-        * In the case that a page table page is not
-        * resident, we are creating it here.
-        */
-       if (va < VM_MAX_USER_ADDRESS)
-               mpte = pmap_allocpte(pmap, va);
-       else
-               mpte = NULL;
+        * Get locked PV entries for our new page table entry (pte_pv)
+        * and for its parent page table (pt_pv).  We need the parent
+        * so we can resolve the location of the ptep.
+        *
+        * Only hardware MMU actions can modify the ptep out from
+        * under us.
+        *
+        * If (m) is fictitious or unmanaged, we do not create a managing
+        * pte_pv for it.  Any pre-existing page's management state must
+        * match (avoiding code complexity).
+        *
+        * If the pmap is still being initialized we assume existing
+        * page tables.
+        *
+        * Kernel mappings do not track page table pages (i.e. pt_pv).
+        * pmap_allocpte() checks the
+        */
+       if (pmap_initialized == FALSE) {
+               pte_pv = NULL;
+               pt_pv = NULL;
+               ptep = vtopte(va);
+       } else if (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) {
+               pte_pv = NULL;
+               if (va >= VM_MAX_USER_ADDRESS) {
+                       pt_pv = NULL;
+                       ptep = vtopte(va);
+               } else {
+                       pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
+                       ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
+               }
+               KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED) == 0);
+       } else {
+               if (va >= VM_MAX_USER_ADDRESS) {
+                       pt_pv = NULL;
+                       pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL);
+                       ptep = vtopte(va);
+               } else {
+                       pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va),
+                                              &pt_pv);
+                       ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
+               }
+               KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED));
+       }
 
        if ((prot & VM_PROT_NOSYNC) == 0)
                pmap_inval_init(&info);
-       pde = pmap_pde(pmap, va);
-       if (pde != NULL && (*pde & PG_V) != 0) {
-               if ((*pde & PG_PS) != 0)
-                       panic("pmap_enter: attempted pmap_enter on 2MB page");
-               pte = pmap_pde_to_pte(pde, va);
-       } else {
-               panic("pmap_enter: invalid page directory va=%#lx", va);
-       }
 
-       KKASSERT(pte != NULL);
        pa = VM_PAGE_TO_PHYS(m);
-       origpte = *pte;
+       origpte = *ptep;
        opa = origpte & PG_FRAME;
 
        /*
@@ -2718,87 +3053,94 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                 * are valid mappings in them. Hence, if a user page is wired,
                 * the PT page will be also.
                 */
+               KKASSERT(pte_pv == NULL || m == pte_pv->pv_m);
                if (wired && ((origpte & PG_W) == 0))
-                       pmap->pm_stats.wired_count++;
+                       atomic_add_long(&pmap->pm_stats.wired_count, 1);
                else if (!wired && (origpte & PG_W))
-                       pmap->pm_stats.wired_count--;
+                       atomic_add_long(&pmap->pm_stats.wired_count, -1);
 
 #if defined(PMAP_DIAGNOSTIC)
                if (pmap_nw_modified(origpte)) {
-                       kprintf(
-       "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
-                           va, origpte);
+                       kprintf("pmap_enter: modified page not writable: "
+                               "va: 0x%lx, pte: 0x%lx\n", va, origpte);
                }
 #endif
 
                /*
-                * Remove the extra pte reference.  Note that we cannot
-                * optimize the RO->RW case because we have adjusted the
-                * wiring count above and may need to adjust the wiring
-                * bits below.
-                */
-               if (mpte)
-                       vm_page_unwire_quick(mpte);
-
-               /*
                 * We might be turning off write access to the page,
                 * so we go ahead and sense modify status.
                 */
-               if (origpte & PG_MANAGED) {
-                       if ((origpte & PG_M) && pmap_track_modified(va)) {
+               if (pte_pv) {
+                       if ((origpte & PG_M) &&
+                           pmap_track_modified(pte_pv->pv_pindex)) {
                                vm_page_t om;
-                               om = PHYS_TO_VM_PAGE(opa);
+                               om = pte_pv->pv_m;
+                               KKASSERT(PHYS_TO_VM_PAGE(opa) == om);
                                vm_page_dirty(om);
                        }
                        pa |= PG_MANAGED;
-                       KKASSERT(m->flags & PG_MAPPED);
                }
                goto validate;
        } 
+
        /*
         * Mapping has changed, invalidate old range and fall through to
         * handle validating new mapping.
-        */
-       while (opa) {
-               int err;
-               err = pmap_remove_pte(pmap, pte, va, &info);
-               if (err)
-                       panic("pmap_enter: pte vanished, va: 0x%lx", va);
-               origpte = *pte;
-               opa = origpte & PG_FRAME;
-               if (opa) {
-                       kprintf("pmap_enter: Warning, raced pmap %p va %p\n",
-                               pmap, (void *)va);
+        *
+        * We always interlock pte removals.
+        */
+       if (opa) {
+               if (pte_pv) {
+                       /* XXX pmap_remove_pv_pte() unwires pt_pv */
+                       vm_page_wire_quick(pt_pv->pv_m);
+                       if (prot & VM_PROT_NOSYNC)
+                               pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
+                       else
+                               pmap_remove_pv_pte(pte_pv, pt_pv, &info);
+                       if (pte_pv->pv_m)
+                               pmap_remove_pv_page(pte_pv, 0);
+               } else if (prot & VM_PROT_NOSYNC) {
+                       *ptep = 0;
+                       cpu_invlpg((void *)va);
+                       atomic_add_long(&pmap->pm_stats.resident_count, -1);
+               } else {
+                       pmap_inval_interlock(&info, pmap, va);
+                       *ptep = 0;
+                       pmap_inval_deinterlock(&info, pmap);
+                       atomic_add_long(&pmap->pm_stats.resident_count, -1);
                }
+               KKASSERT(*ptep == 0);
        }
 
        /*
-        * Enter on the PV list if part of our managed memory. Note that we
-        * raise IPL while manipulating pv_table since pmap_enter can be
-        * called at interrupt time.
-        *
-        * The new mapping covers mpte's new wiring count so we don't
-        * unwire it.
+        * Enter on the PV list if part of our managed memory.  Wiring is
+        * handled automatically.
         */
-       if (pmap_initialized && 
-           (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
-               pmap_insert_entry(pmap, va, mpte, m);
-               pa |= PG_MANAGED;
+       if (pte_pv) {
+               KKASSERT(pte_pv->pv_m == NULL);
+               vm_page_spin_lock(m);
+               pte_pv->pv_m = m;
+               TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
+               /*
+               if (m->object)
+                       atomic_add_int(&m->object->agg_pv_list_count, 1);
+               */
                vm_page_flag_set(m, PG_MAPPED);
+               vm_page_spin_unlock(m);
+               pa |= PG_MANAGED;
        }
 
        /*
         * Increment counters
         */
-       ++pmap->pm_stats.resident_count;
        if (wired)
-               pmap->pm_stats.wired_count++;
+               atomic_add_long(&pmap->pm_stats.wired_count, 1);
 
 validate:
        /*
         * Now validate mapping with desired protection/wiring.
         */
-       newpte = (pt_entry_t) (pa | pte_prot(pmap, prot) | PG_V);
+       newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V);
 
        if (wired)
                newpte |= PG_W;
@@ -2808,25 +3150,41 @@ validate:
                newpte |= pgeflag;
 
        /*
-        * if the mapping or permission bits are different, we need
+        * If the mapping or permission bits are different, we need
         * to update the pte.
+        *
+        * We do not have to interlock pte insertions as no other
+        * cpu will have a TLB entry.
         */
        if ((origpte & ~(PG_M|PG_A)) != newpte) {
+#if 0
                if ((prot & VM_PROT_NOSYNC) == 0)
                        pmap_inval_interlock(&info, pmap, va);
-               *pte = newpte | PG_A;
+#endif
+               *ptep = newpte | PG_A;
+               cpu_invlpg((void *)va);
+#if 0
                if (prot & VM_PROT_NOSYNC)
                        cpu_invlpg((void *)va);
                else
                        pmap_inval_deinterlock(&info, pmap);
+#endif
                if (newpte & PG_RW)
                        vm_page_flag_set(m, PG_WRITEABLE);
+               if (pte_pv == NULL)
+                       atomic_add_long(&pmap->pm_stats.resident_count, 1);
        }
        KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
        if ((prot & VM_PROT_NOSYNC) == 0)
                pmap_inval_done(&info);
-       lwkt_reltoken(&pmap->pm_token);
-       vm_object_drop(pmap->pm_pteobj);
+
+       /*
+        * Cleanup the pv entry, allowing other accessors.
+        */
+       if (pte_pv)
+               pv_put(pte_pv);
+       if (pt_pv)
+               pv_put(pt_pv);
 }
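
At the validate: label pmap_enter() composes the final pte from the physical
address plus the protection, wiring, global and managed bits, and skips the
store entirely when nothing but PG_M/PG_A differs from the old entry.  The
stand-alone sketch below restates that compose-and-compare step; the bit
values are assumptions for illustration only and pte_prot() is reduced to a
single writable flag.

/* Compose-and-compare step of pmap_enter(), as a self-contained sketch. */
#include <stdint.h>

#define Y_V        0x001u       /* valid             (stand-in for PG_V) */
#define Y_RW       0x002u       /* writable          (stand-in for PG_RW) */
#define Y_A        0x020u       /* accessed          (stand-in for PG_A) */
#define Y_M        0x040u       /* modified          (stand-in for PG_M) */
#define Y_G        0x100u       /* global            (stand-in for pgeflag) */
#define Y_W        0x200u       /* wired, software   (stand-in for PG_W) */
#define Y_MANAGED  0x400u       /* managed, software (stand-in for PG_MANAGED) */

/* Returns the pte to store, or origpte unchanged if no rewrite is needed. */
uint64_t
compose_pte(uint64_t origpte, uint64_t pa, int writable, int wired,
            int global, int managed)
{
        uint64_t newpte;

        newpte = pa | Y_V;
        if (writable)
                newpte |= Y_RW;
        if (wired)
                newpte |= Y_W;
        if (managed)
                newpte |= Y_MANAGED;
        if (global)
                newpte |= Y_G;

        /* A/M are hardware feedback; don't let them alone force a store */
        if ((origpte & ~(uint64_t)(Y_M | Y_A)) == newpte)
                return origpte;
        return newpte | Y_A;    /* pre-set accessed, as pmap_enter does */
}
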
 
 /*
@@ -2839,91 +3197,7 @@ validate:
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
-       pt_entry_t *pte;
-       vm_paddr_t pa;
-       vm_page_t mpte;
-       pmap_inval_info info;
-
-       lwkt_gettoken(&pmap->pm_token);
-       vm_object_hold(pmap->pm_pteobj);
-       pmap_inval_init(&info);
-
-       if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
-               kprintf("Warning: pmap_enter_quick called on UVA with"
-                       "kernel_pmap\n");
-#ifdef DDB
-               db_print_backtrace();
-#endif
-       }
-       if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
-               kprintf("Warning: pmap_enter_quick called on KVA without"
-                       "kernel_pmap\n");
-#ifdef DDB
-               db_print_backtrace();
-#endif
-       }
-
-       KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */
-
-       /*
-        * Calculate the page table page (mpte), allocating it if necessary.
-        *
-        * A wired page table page (mpte), or NULL, is passed onto the
-        * section following.
-        */
-       if (va < VM_MAX_USER_ADDRESS) {
-               mpte = pmap_allocpte(pmap, va);
-       } else {
-               mpte = NULL;
-               /* this code path is not yet used */
-       }
-
-       /*
-        * With a valid (and held) page directory page, we can just use
-        * vtopte() to get to the pte.  If the pte is already present
-        * we do not disturb it.
-        */
-       pte = vtopte(va);
-       if (*pte & PG_V) {
-               pa = VM_PAGE_TO_PHYS(m);
-               KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
-               pmap_inval_done(&info);
-               if (mpte)
-                       pmap_unwire_pte_hold(pmap, va, mpte, &info);
-               vm_object_drop(pmap->pm_pteobj);
-               lwkt_reltoken(&pmap->pm_token);
-               return;
-       }
-
-       /*
-        * Enter on the PV list if part of our managed memory.
-        *
-        * The new mapping covers mpte's new wiring count so we don't
-        * unwire it.
-        */
-       if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
-               pmap_insert_entry(pmap, va, mpte, m);
-               vm_page_flag_set(m, PG_MAPPED);
-       }
-
-       /*
-        * Increment counters
-        */
-       ++pmap->pm_stats.resident_count;
-
-       pa = VM_PAGE_TO_PHYS(m);
-
-       /*
-        * Now validate mapping with RO protection
-        */
-       if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
-               *pte = pa | PG_V | PG_U;
-       else
-               *pte = pa | PG_V | PG_U | PG_MANAGED;
-/*     pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
-       pmap_inval_done(&info);
-       vm_object_drop(pmap->pm_pteobj);
-       lwkt_reltoken(&pmap->pm_token);
+       pmap_enter(pmap, va, m, VM_PROT_READ, FALSE);
 }
 
 /*
@@ -2979,7 +3253,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
                return;
        }
 
-       if (psize + pindex > object->size) {
+       if (pindex + psize > object->size) {
                if (object->size < pindex)
                        return;           
                psize = object->size - pindex;
@@ -3001,7 +3275,6 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
        info.mpte = NULL;
        info.addr = addr;
        info.pmap = pmap;
-       info.desired = 0;
 
        vm_object_hold(object);
        vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
@@ -3035,58 +3308,52 @@ pmap_object_init_pt_callback(vm_page_t p, void *data)
                                 info->addr + x86_64_ptob(rel_index), p);
        }
        vm_page_wakeup(p);
-       pmap_auto_yield(info);
        return(0);
 }
 
 /*
- * Return TRUE if the pmap is in shape to trivially
- * pre-fault the specified address.
+ * Return TRUE if the pmap is in shape to trivially pre-fault the specified
+ * address.
  *
- * Returns FALSE if it would be non-trivial or if a
- * pte is already loaded into the slot.
+ * Returns FALSE if it would be non-trivial or if a pte is already loaded
+ * into the slot.
  */
 int
 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
 {
        pt_entry_t *pte;
-       pd_entry_t *pde;
-       int ret;
 
-       lwkt_gettoken(&pmap->pm_token);
-       pde = pmap_pde(pmap, addr);
-       if (pde == NULL || *pde == 0) {
-               ret = 0;
-       } else {
-               pte = vtopte(addr);
-               ret = (*pte) ? 0 : 1;
+       spin_lock(&pmap->pm_spin);
+       if ((pte = pmap_pte(pmap, addr)) != NULL) {
+               if (*pte & PG_V) {
+                       spin_unlock(&pmap->pm_spin);
+                       return FALSE;
+               }
        }
-       lwkt_reltoken(&pmap->pm_token);
-       return(ret);
+       spin_unlock(&pmap->pm_spin);
+       return TRUE;
 }
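
pmap_prefault_ok() now answers FALSE under pmap->pm_spin as soon as the page
table walk fails or a valid pte already occupies the slot, and
pmap_enter_quick() has become a thin read-only, unwired wrapper around
pmap_enter().  A hypothetical caller could combine the two as in the sketch
below; this is only an illustration, lookup_resident_page() is a placeholder
for the caller's own object lookup, and the real prefault path in vm_fault is
more involved.

/*
 * Hypothetical prefault loop: map read-only any resident neighbour pages
 * whose pte slot is still empty (kernel-style fragment, not standalone).
 */
static void
prefault_neighbours(pmap_t pmap, vm_offset_t base, int npages)
{
        vm_offset_t va;
        vm_page_t m;
        int i;

        for (i = 0; i < npages; ++i) {
                va = base + (vm_offset_t)i * PAGE_SIZE;
                if (!pmap_prefault_ok(pmap, va))
                        continue;               /* non-trivial or occupied */
                m = lookup_resident_page(va);   /* placeholder */
                if (m == NULL)
                        continue;
                pmap_enter_quick(pmap, va, m);  /* read-only, unwired */
        }
}
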
 
 /*
- *     Routine:        pmap_change_wiring
- *     Function:       Change the wiring attribute for a map/virtual-address
- *                     pair.
- *     In/out conditions:
- *                     The mapping must already exist in the pmap.
+ * Change the wiring attribute for a pmap/va pair.  The mapping must already
+ * exist in the pmap.  The mapping may or may not be managed.
  */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
-       pt_entry_t *pte;
+       pt_entry_t *ptep;
+       pv_entry_t pv;
 
        if (pmap == NULL)
                return;
-
        lwkt_gettoken(&pmap->pm_token);
-       pte = pmap_pte(pmap, va);
+       pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
+       ptep = pv_pte_lookup(pv, pmap_pte_index(va));
 
-       if (wired && !pmap_pte_w(pte))
-               pmap->pm_stats.wired_count++;
-       else if (!wired && pmap_pte_w(pte))
-               pmap->pm_stats.wired_count--;
+       if (wired && !pmap_pte_w(ptep))
+               atomic_add_long(&pmap->pm_stats.wired_count, 1);
+       else if (!wired && pmap_pte_w(ptep))
+               atomic_add_long(&pmap->pm_stats.wired_count, -1);
 
        /*
         * Wiring is not a hardware characteristic so there is no need to
@@ -3097,15 +3364,16 @@ pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
         */
 #ifdef SMP
        if (wired)
-               atomic_set_long(pte, PG_W);
+               atomic_set_long(ptep, PG_W);
        else
-               atomic_clear_long(pte, PG_W);
+               atomic_clear_long(ptep, PG_W);
 #else
        if (wired)
-               atomic_set_long_nonlocked(pte, PG_W);
+               atomic_set_long_nonlocked(ptep, PG_W);
        else
-               atomic_clear_long_nonlocked(pte, PG_W);
+               atomic_clear_long_nonlocked(ptep, PG_W);
 #endif
+       pv_put(pv);
        lwkt_reltoken(&pmap->pm_token);
 }
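
Because PG_W lives in a software-defined bit that the MMU never consults,
pmap_change_wiring() can flip it with a plain atomic read-modify-write and
skip the TLB invalidation entirely, as the comment above notes.  A
self-contained C11 sketch of the same idea, with an assumed bit position:

/* Atomically flip a software-only "wired" bit in a pte-like word (C11). */
#include <stdatomic.h>
#include <stdint.h>

#define Z_WIRED 0x200u          /* assumed software bit, like PG_W */

void
set_wired(_Atomic uint64_t *ptep, int wired)
{
        if (wired)
                atomic_fetch_or(ptep, (uint64_t)Z_WIRED);
        else
                atomic_fetch_and(ptep, ~(uint64_t)Z_WIRED);
        /* no TLB shootdown: the hardware never reads or writes this bit */
}
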
 
@@ -3121,147 +3389,6 @@ void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 
          vm_size_t len, vm_offset_t src_addr)
 {
-       return;
-#if 0
-       pmap_inval_info info;
-       vm_offset_t addr;
-       vm_offset_t end_addr = src_addr + len;
-       vm_offset_t pdnxt;
-       pd_entry_t src_frame, dst_frame;
-       vm_page_t m;
-
-       if (dst_addr != src_addr)
-               return;
-#if JGPMAP32
-       src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
-       if (src_frame != (PTDpde & PG_FRAME)) {
-               return;
-       }
-
-       dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
-       if (dst_frame != (APTDpde & PG_FRAME)) {
-               APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V);
-               /* The page directory is not shared between CPUs */
-               cpu_invltlb();
-       }
-#endif
-       pmap_inval_init(&info);
-       pmap_inval_add(&info, dst_pmap, -1);
-       pmap_inval_add(&info, src_pmap, -1);
-
-       lwkt_gettoken(&src_pmap->pm_token);
-       lwkt_gettoken(&dst_pmap->pm_token);
-       for (addr = src_addr; addr < end_addr; addr = pdnxt) {
-               pt_entry_t *src_pte, *dst_pte;
-               vm_page_t dstmpte, srcmpte;
-               vm_offset_t srcptepaddr;
-               vm_pindex_t ptepindex;
-
-               if (addr >= UPT_MIN_ADDRESS)
-                       panic("pmap_copy: invalid to pmap_copy page tables\n");
-
-               /*
-                * Don't let optional prefaulting of pages make us go
-                * way below the low water mark of free pages or way
-                * above high water mark of used pv entries.
-                */
-               if (vmstats.v_free_count < vmstats.v_free_reserved ||
-                   pv_entry_count > pv_entry_high_water)
-                       break;
-               
-               pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
-               ptepindex = addr >> PDRSHIFT;
-
-#if JGPMAP32
-               srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
-#endif
-               if (srcptepaddr == 0)
-                       continue;
-                       
-               if (srcptepaddr & PG_PS) {
-#if JGPMAP32
-                       if (dst_pmap->pm_pdir[ptepindex] == 0) {
-                               dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
-                               dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
-                       }
-#endif
-                       continue;
-               }
-
-               /*
-                *
-                */
-               srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
-               if (srcmpte == NULL || srcmpte->wire_count == 1 ||
-                   (srcmpte->flags & PG_BUSY)) {
-                       continue;
-               }
-
-               if (pdnxt > end_addr)
-                       pdnxt = end_addr;
-
-               src_pte = vtopte(addr);
-#if JGPMAP32
-               dst_pte = avtopte(addr);
-#endif
-               while (addr < pdnxt) {
-                       pt_entry_t ptetemp;
-
-                       ptetemp = *src_pte;
-                       /*
-                        * we only virtual copy managed pages
-                        */
-                       if ((ptetemp & PG_MANAGED) != 0) {
-                               /*
-                                * We have to check after allocpte for the
-                                * pte still being around...  allocpte can
-                                * block.
-                                *
-                                * pmap_allocpte() can block.  If we lose
-                                * our page directory mappings we stop.
-                                */
-                               dstmpte = pmap_allocpte(dst_pmap, addr);
-
-#if JGPMAP32
-                               if (src_frame != (PTDpde & PG_FRAME) ||
-                                   dst_frame != (APTDpde & PG_FRAME)
-                               ) {
-                                       kprintf("WARNING: pmap_copy: detected and corrected race\n");
-                                       pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
-                                       goto failed;
-                               } else if ((*dst_pte == 0) &&
-                                          (ptetemp = *src_pte) != 0 &&
-                                          (ptetemp & PG_MANAGED)) {
-                                       /*
-                                        * Clear the modified and
-                                        * accessed (referenced) bits
-                                        * during the copy.
-                                        */
-                                       m = PHYS_TO_VM_PAGE(ptetemp);
-                                       *dst_pte = ptetemp & ~(PG_M | PG_A);
-                                       ++dst_pmap->pm_stats.resident_count;
-                                       pmap_insert_entry(dst_pmap, addr,
-                                                         dstmpte, m);
-                                       KKASSERT(m->flags & PG_MAPPED);
-                               } else {
-                                       kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n");
-                                       pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
-                                       goto failed;
-                               }
-#endif
-                               if (dstmpte->hold_count >= srcmpte->hold_count)
-                                       break;
-                       }
-                       addr += PAGE_SIZE;
-                       src_pte++;
-                       dst_pte++;
-               }
-       }
-failed:
-       lwkt_reltoken(&dst_pmap->pm_token);
-       lwkt_reltoken(&src_pmap->pm_token);
-       pmap_inval_done(&info);
-#endif
 }      
 
 /*
@@ -3353,11 +3480,10 @@ pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
 }
 
 /*
- * Returns true if the pmap's pv is one of the first
- * 16 pvs linked to from this page.  This count may
- * be changed upwards or downwards in the future; it
- * is only necessary that true be returned for a small
- * subset of pmaps for proper page aging.
+ * Returns true if the pmap's pv is one of the first 16 pvs linked to from
+ * this page.  This count may be changed upwards or downwards in the future;
+ * it is only necessary that true be returned for a small subset of pmaps
+ * for proper page aging.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
@@ -3368,167 +3494,34 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return FALSE;
 
-       spin_lock(&pmap_spin);
+       vm_page_spin_lock(m);
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                if (pv->pv_pmap == pmap) {
-                       spin_unlock(&pmap_spin);
+                       vm_page_spin_unlock(m);
                        return TRUE;
                }
                loops++;
                if (loops >= 16)
                        break;
        }
-       spin_unlock(&pmap_spin);
+       vm_page_spin_unlock(m);
        return (FALSE);
 }
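
pmap_page_exists_quick() deliberately inspects at most the first 16 pv
entries; a mapping further down the list is reported as absent, which is good
enough for page aging and keeps the vm_page spin lock hold time bounded.  A
stand-alone sketch of that bounded scan (types are stand-ins and the locking
is omitted):

/* Bounded membership test over a page's pv list (stand-in types). */
#include <stdbool.h>
#include <stddef.h>
#include <sys/queue.h>

struct qpv {
        TAILQ_ENTRY(qpv) link;
        void *pmap;                     /* stands in for pv->pv_pmap */
};
TAILQ_HEAD(qpv_list, qpv);

bool
page_exists_quick(struct qpv_list *pv_list, void *pmap)
{
        struct qpv *pv;
        int loops = 0;

        TAILQ_FOREACH(pv, pv_list, link) {
                if (pv->pmap == pmap)
                        return true;
                if (++loops >= 16)      /* cap the scan; a miss here is ok */
                        break;
        }
        return false;
}
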
 
 /*
  * Remove all pages from the specified address space.  This aids process exit
- * speeds.  Also, this code is special cased for current process only, but
- * can have the more generic (and slightly slower) mode enabled.  This
- * is much faster than pmap_remove in the case of running down an entire
- * address space.
+ * speeds.  Also, this code may be special cased for the current process
+ * only.
  */
 void
 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
-       struct lwp *lp;
-       pt_entry_t *pte, tpte;
-       pv_entry_t pv, npv;
-       vm_page_t m;
-       vm_offset_t va;
-       pmap_inval_info info;
-       int iscurrentpmap;
-       int save_generation;
-
-       lp = curthread->td_lwp;
-       if (lp && pmap == vmspace_pmap(lp->lwp_vmspace))
-               iscurrentpmap = 1;
-       else
-               iscurrentpmap = 0;
-
-       if (pmap->pm_pteobj)
-               vm_object_hold(pmap->pm_pteobj);
-       lwkt_gettoken(&pmap->pm_token);
-       pmap_inval_init(&info);
-
-       spin_lock(&pmap_spin);
-       for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
-               /*
-                * Validate the pv.  We have to interlock the address with
-                * pmap_spin unlocked.
-                */
-               if (pv->pv_va >= eva || pv->pv_va < sva) {
-                       npv = TAILQ_NEXT(pv, pv_plist);
-                       continue;
-               }
-
-               KKASSERT(pmap == pv->pv_pmap);
-               if (iscurrentpmap)
-                       pte = vtopte(pv->pv_va);
-               else
-                       pte = pmap_pte_quick(pmap, pv->pv_va);
-
-               /*
-                * We cannot remove wired pages from a process' mapping
-                * at this time.  This does not require an invaldiation
-                * interlock as PG_W cannot be set by the MMU.
-                */
-               if (*pte & PG_W) {
-                       npv = TAILQ_NEXT(pv, pv_plist);
-                       continue;
-               }
-
-               /*
-                * Interlock the pte so we can safely remove it
-                */
-               save_generation = pmap->pm_generation;
-               va = pv->pv_va;
-               spin_unlock(&pmap_spin);
-
-               pmap_inval_interlock(&info, pmap, va);
-
-               /*
-                * Restart the scan if the pv list changed out from under us.
-                */
-               spin_lock(&pmap_spin);
-               if (save_generation != pmap->pm_generation) {
-                       spin_unlock(&pmap_spin);
-                       pmap_inval_deinterlock(&info, pmap);
-                       kprintf("Warning: pmap_remove_pages race-A avoided\n");
-                       spin_lock(&pmap_spin);
-                       npv = TAILQ_FIRST(&pmap->pm_pvlist);
-                       continue;
-               }
-               KKASSERT(pmap == pv->pv_pmap && va == pv->pv_va);
-
-               /*
-                * Extract the pte and clear its memory
-                */
-               tpte = pte_load_clear(pte);
-               KKASSERT(tpte & PG_MANAGED);
-
-               m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
-               KASSERT(m < &vm_page_array[vm_page_array_size],
-                       ("pmap_remove_pages: bad tpte %lx", tpte));
-
-               /*
-                * Remove the entry, set npv
-                */
-               npv = TAILQ_NEXT(pv, pv_plist);
-               TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
-               m->md.pv_generation++;
-               m->md.pv_list_count--;
-               vm_page_spin_lock(m);
-               if (m->object)
-                       atomic_add_int(&m->object->agg_pv_list_count, -1);
-               vm_page_spin_unlock(m);
-               TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-               if (TAILQ_EMPTY(&m->md.pv_list))
-                       vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
-               save_generation = ++pmap->pm_generation;
-
-               spin_unlock(&pmap_spin);
-
-               /*
-                * Adjust the pmap and cleanup the tpte and related vm_page
-                */
-               KKASSERT(pmap->pm_stats.resident_count > 0);
-               --pmap->pm_stats.resident_count;
-               pmap_inval_deinterlock(&info, pmap);
-
-               /*
-                * Update the vm_page_t clean and reference bits.
-                */
-               if (tpte & PG_M) {
-                       vm_page_dirty(m);
-               }
-
-               pmap_unwire_pte_hold(pmap, pv->pv_va, pv->pv_ptem, &info);
-               free_pv_entry(pv);
-
-               /*
-                * Restart the scan if we blocked during the unuse or free
-                * calls and other removals were made.
-                */
-               spin_lock(&pmap_spin);
-               if (save_generation != pmap->pm_generation) {
-                       kprintf("Warning: pmap_remove_pages race-A avoided\n");
-                       npv = TAILQ_FIRST(&pmap->pm_pvlist);
-               }
-       }
-       spin_unlock(&pmap_spin);
-       pmap_inval_done(&info);
-       lwkt_reltoken(&pmap->pm_token);
-       if (pmap->pm_pteobj)
-               vm_object_drop(pmap->pm_pteobj);
+       pmap_remove(pmap, sva, eva);
 }
 
 /*
  * pmap_testbit tests bits in ptes.  Note that the testbit/clearbit
  * routines are inline, and a lot of things compile-time evaluate.
- *
- * Caller must hold pmap_spin
  */
 static
 boolean_t
@@ -3542,6 +3535,11 @@ pmap_testbit(vm_page_t m, int bit)
 
        if (TAILQ_FIRST(&m->md.pv_list) == NULL)
                return FALSE;
+       vm_page_spin_lock(m);
+       if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
+               vm_page_spin_unlock(m);
+               return FALSE;
+       }
 
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                /*
@@ -3550,7 +3548,7 @@ pmap_testbit(vm_page_t m, int bit)
                 * modified.
                 */
                if (bit & (PG_A|PG_M)) {
-                       if (!pmap_track_modified(pv->pv_va))
+                       if (!pmap_track_modified(pv->pv_pindex))
                                continue;
                }
 
@@ -3560,29 +3558,31 @@ pmap_testbit(vm_page_t m, int bit)
                        continue;
                }
 #endif
-               pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
-               if (*pte & bit)
+               pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
+               if (*pte & bit) {
+                       vm_page_spin_unlock(m);
                        return TRUE;
+               }
        }
+       vm_page_spin_unlock(m);
        return (FALSE);
 }
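
pmap_testbit() keeps the cheap unlocked emptiness probe but now repeats it
after taking the vm_page spin lock, since the unlocked check can race a pv
insertion or removal.  The compilable sketch below shows the same
double-checked test with stand-in lock and list types:

/* Double-checked "any mappings?" test before walking a locked list. */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <sys/queue.h>

struct tpv {
        TAILQ_ENTRY(tpv) link;
        unsigned long bits;             /* stands in for the pte contents */
};

struct tpage {
        pthread_spinlock_t spin;
        TAILQ_HEAD(, tpv) pv_list;
};

bool
page_test_bit(struct tpage *m, unsigned long bit)
{
        struct tpv *pv;

        /* cheap unlocked probe: most pages have no mappings at all */
        if (TAILQ_FIRST(&m->pv_list) == NULL)
                return false;

        /* the probe can race an insert/remove, so repeat it under the lock */
        pthread_spin_lock(&m->spin);
        if (TAILQ_FIRST(&m->pv_list) == NULL) {
                pthread_spin_unlock(&m->spin);
                return false;
        }
        TAILQ_FOREACH(pv, &m->pv_list, link) {
                if (pv->bits & bit) {
                        pthread_spin_unlock(&m->spin);
                        return true;
                }
        }
        pthread_spin_unlock(&m->spin);
        return false;
}
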
 
 /*
  * This routine is used to modify bits in ptes
  *
- * Caller must NOT hold pmap_spin
+ * Caller must NOT hold any spin locks
  */
 static __inline
 void
 pmap_clearbit(vm_page_t m, int bit)
 {
        struct pmap_inval_info info;
-       int save_generation;
-       vm_offset_t save_va;
-       struct pmap *save_pmap;
        pv_entry_t pv;
        pt_entry_t *pte;
        pt_entry_t pbits;
+       vm_pindex_t save_pindex;
+       pmap_t save_pmap;
 
        if (bit == PG_RW)
                vm_page_flag_clear(m, PG_WRITEABLE);
@@ -3596,14 +3596,14 @@ pmap_clearbit(vm_page_t m, int bit)
         * Loop over all current mappings setting/clearing as appropriate.  If
         * setting RO do we need to clear the VAC?
         */
-       spin_lock(&pmap_spin);
+       vm_page_spin_lock(m);
 restart:
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                /*
                 * don't write protect pager mappings
                 */
                if (bit == PG_RW) {
-                       if (!pmap_track_modified(pv->pv_va))
+                       if (!pmap_track_modified(pv->pv_pindex))
                                continue;
                }
 
@@ -3623,21 +3623,22 @@ restart:
                 * PG_M even for PTEs generated via virtual memory maps,
                 * because the virtual kernel will invalidate the pmap
                 * entry when/if it needs to resynchronize the Modify bit.
-                *
-                * We have to restart our scan if m->md.pv_generation changes
-                * on us.
                 */
                if (bit & PG_RW) {
-                       save_generation = m->md.pv_generation;
                        save_pmap = pv->pv_pmap;
-                       save_va = pv->pv_va;
-                       spin_unlock(&pmap_spin);
-                       pmap_inval_interlock(&info, save_pmap, save_va);
-                       spin_lock(&pmap_spin);
-                       if (save_generation != m->md.pv_generation)
+                       save_pindex = pv->pv_pindex;
+                       pv_hold(pv);
+                       vm_page_spin_unlock(m);
+                       pmap_inval_interlock(&info, save_pmap,
+                                    (vm_offset_t)save_pindex << PAGE_SHIFT);
+                       vm_page_spin_lock(m);
+                       if (pv->pv_pmap == NULL) {
+                               pv_drop(pv);
                                goto restart;
+                       }
+                       pv_drop(pv);
                }
-               pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
+               pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
 again:
                pbits = *pte;
                if (pbits & bit) {
@@ -3670,16 +3671,19 @@ again:
                        }
                }
                if (bit & PG_RW) {
-                       save_generation = m->md.pv_generation;
                        save_pmap = pv->pv_pmap;
-                       spin_unlock(&pmap_spin);
+                       pv_hold(pv);
+                       vm_page_spin_unlock(m);
                        pmap_inval_deinterlock(&info, save_pmap);
-                       spin_lock(&pmap_spin);
-                       if (save_generation != m->md.pv_generation)
+                       vm_page_spin_lock(m);
+                       if (pv->pv_pmap == NULL) {
+                               pv_drop(pv);
                                goto restart;
+                       }
+                       pv_drop(pv);
                }
        }
-       spin_unlock(&pmap_spin);
+       vm_page_spin_unlock(m);
        pmap_inval_done(&info);
 }
 
@@ -3726,43 +3730,30 @@ pmap_phys_address(vm_pindex_t ppn)
 int
 pmap_ts_referenced(vm_page_t m)
 {
-       pv_entry_t pv, pvf, pvn;
+       pv_entry_t pv;
        pt_entry_t *pte;
        int rtval = 0;
 
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return (rtval);
 
-       spin_lock(&pmap_spin);
-       if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
-               pvf = pv;
-               do {
-                       pvn = TAILQ_NEXT(pv, pv_list);
-
-                       TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-                       TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-                       /*++pv->pv_pmap->pm_generation; not needed */
-
-                       if (!pmap_track_modified(pv->pv_va))
-                               continue;
-
-                       pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
-
-                       if (pte && (*pte & PG_A)) {
+       vm_page_spin_lock(m);
+       TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+               if (!pmap_track_modified(pv->pv_pindex))
+                       continue;
+               pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
+               if (pte && (*pte & PG_A)) {
 #ifdef SMP
-                               atomic_clear_long(pte, PG_A);
+                       atomic_clear_long(pte, PG_A);
 #else
-                               atomic_clear_long_nonlocked(pte, PG_A);
+                       atomic_clear_long_nonlocked(pte, PG_A);
 #endif
-                               rtval++;
-                               if (rtval > 4) {
-                                       break;
-                               }
-                       }
-               } while ((pv = pvn) != NULL && pv != pvf);
+                       rtval++;
+                       if (rtval > 4)
+                               break;
+               }
        }
-       spin_unlock(&pmap_spin);
-
+       vm_page_spin_unlock(m);
        return (rtval);
 }
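
pmap_ts_referenced() clears PG_A in each mapping it visits and stops counting
after five referenced mappings, which is all the page-aging code needs.  A
self-contained C11 sketch of a capped test-and-clear over an array of
pte-like words follows; the accessed-bit value is an assumption for
illustration.

/* Capped test-and-clear of an "accessed" bit across an array of ptes. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

#define W_ACCESSED 0x020u       /* assumed accessed bit, like PG_A */

int
ts_referenced(_Atomic uint64_t *ptes, size_t nptes)
{
        int rtval = 0;
        size_t i;

        for (i = 0; i < nptes; ++i) {
                /* atomically clear the bit and inspect the old value */
                uint64_t old = atomic_fetch_and(&ptes[i],
                                                ~(uint64_t)W_ACCESSED);
                if (old & W_ACCESSED) {
                        if (++rtval > 4)        /* five is plenty for aging */
                                break;
                }
        }
        return rtval;
}
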
 
@@ -3777,9 +3768,7 @@ pmap_is_modified(vm_page_t m)
 {
        boolean_t res;
 
-       spin_lock(&pmap_spin);
        res = pmap_testbit(m, PG_M);
-       spin_unlock(&pmap_spin);
        return (res);
 }