kernel - Implement segment pmap optimizations for x86-64
author     Matthew Dillon <dillon@apollo.backplane.com>
           Thu, 13 Sep 2012 01:25:19 +0000 (18:25 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Thu, 13 Sep 2012 01:25:19 +0000 (18:25 -0700)
* Implement 2MB segment optimizations for x86-64.  Any shared read-only
  or read-write VM object mapped into memory, including physical objects
  (so both sysv_shm and mmap), can be optimized as long as the mapping is
  segment-aligned and a multiple of the segment size.

* Enable with sysctl machdep.pmap_mmu_optimize=1

  Default is off for now.  This is an experimental feature.
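
  A minimal user-space sketch of flipping the knob programmatically via the
  standard sysctlbyname(3) interface (equivalent to running the sysctl
  command above or setting it in /etc/sysctl.conf); purely illustrative:

      /* Sketch: read and then enable machdep.pmap_mmu_optimize. */
      #include <sys/types.h>
      #include <sys/sysctl.h>
      #include <err.h>
      #include <stdio.h>

      int
      main(void)
      {
              int enable = 1;
              int cur = 0;
              size_t curlen = sizeof(cur);

              /* Needs root to actually set the new value. */
              if (sysctlbyname("machdep.pmap_mmu_optimize", &cur, &curlen,
                               &enable, sizeof(enable)) < 0)
                      err(1, "sysctlbyname");
              printf("machdep.pmap_mmu_optimize: %d -> %d\n", cur, enable);
              return 0;
      }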

* It works as follows:  A sufficiently large VM object will, as VM faults
  are taken against it, accumulate a truncated pmap (PD, PT, and PTEs)
  stored in the VM object itself.

  VM faults whose vm_map_entry can be optimized will cause the PTE, PT,
  and (for now) the PD to be stored in a pmap embedded in the VM_OBJECT
  instead of in the process pmap.

  The process pmap then installs an entry in its PD page table that points
  to the PT page table page stored in the VM_OBJECT's pmap (a rough
  user-space sketch of this indirection follows below).
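
  A rough user-space model of that indirection (names and layout here are
  purely illustrative and do not match the kernel structures; only the
  pointer topology, a process PD entry pointing at a PT page owned by the
  VM object's pmap, follows the description above):

      #include <stdio.h>
      #include <stdlib.h>

      #define NPTE 512                        /* PTEs per 2MB segment */

      struct pt_page { unsigned long pte[NPTE]; };

      struct obj_pmap {                       /* embedded in the VM object */
              struct pt_page *pt;             /* one segment, for simplicity */
      };

      struct proc_pmap {                      /* one per process */
              struct pt_page *pd_entry;       /* PD slot covering the segment */
      };

      /*
       * Fault path: fill the PTE in the object's pmap, then point the
       * process PD slot at the object's PT page (share, don't copy).
       */
      static void
      fault(struct proc_pmap *p, struct obj_pmap *o, int pg, unsigned long pa)
      {
              if (o->pt == NULL && (o->pt = calloc(1, sizeof(*o->pt))) == NULL)
                      abort();
              o->pt->pte[pg] = pa | 1;        /* PG_V-like valid bit */
              p->pd_entry = o->pt;
      }

      int
      main(void)
      {
              struct obj_pmap obj = { NULL };
              struct proc_pmap a = { NULL }, b = { NULL };

              fault(&a, &obj, 7, 0x123000);   /* process A faults page 7 */
              b.pd_entry = obj.pt;            /* process B just maps the segment */

              /* B sees the PTE that A faulted in, without faulting itself. */
              printf("B's view of page 7: %#lx\n", b.pd_entry->pte[7]);
              return 0;
      }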

* This removes nearly all page table overhead from fork()'d processes or
  even unrelated processes which massively share data via mmap() or
  sysv_shm; see the example below.  We still recommend using sysctl
  kern.ipc.shm_use_phys=1 (which is now the default), which also removes
  the PV entries associated with the shared pmap.  However, with this
  optimization PV entries are no longer a big issue since they are no
  longer replicated in each process, only in the common pmap stored in
  the VM_OBJECT.
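
  For example, a sketch of the kind of mapping that can qualify: MAP_SHARED
  (so the vm_map_entry inherits shared across fork()) with a length that is
  a multiple of the 2MB segment.  Whether the kernel places it on a
  segment-aligned address is up to the VM; if not, pmap_enter() simply
  falls back to the normal path:

      #include <sys/types.h>
      #include <sys/mman.h>
      #include <sys/wait.h>
      #include <err.h>
      #include <string.h>
      #include <unistd.h>

      #define SEG_2M   (2UL * 1024 * 1024)
      #define MAP_LEN  (64 * SEG_2M)          /* 128MB, a segment multiple */

      int
      main(void)
      {
              char *p;

              p = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANON, -1, 0);
              if (p == MAP_FAILED)
                      err(1, "mmap");

              /*
               * fork()'d children can share one set of page table pages
               * for this region instead of each building their own.
               */
              if (fork() == 0) {
                      memset(p, 0x5a, MAP_LEN);
                      _exit(0);
              }
              waitpid(-1, NULL, 0);
              return 0;
      }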

* Features of this optimization:

  * Number of PV entries is reduced to approximately the number of live
    pages and is no longer multiplied by the number of processes separately
    mapping the shared memory (see the back-of-envelope arithmetic after
    this list).

  * One process faulting in a page naturally makes the PTE available to
    all other processes mapping the same shared memory.  The other processes
    do not have to fault that same page in.

  * Page tables survive process exit and restart.

  * Once page tables are populated and cached, any new process that maps
    the shared memory will take far fewer faults because each fault will
    bring in an ENTIRE page table.  With Postgres and 64 clients, the VM
    fault rate was observed to drop from 1M faults/sec to less than 500
    at startup, and during the run the fault rate, instead of steadily
    declining into the hundreds of thousands, dropped almost instantly to
    virtually zero.

  * We no longer have to depend on sysv_shm to optimize the MMU.

  * CPU caches will do a better job caching page tables since most of
    them are now themselves shared.  Even when we invltlb, more of the
    page tables will be in the L1, L2, and L3 caches.
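
  Back-of-envelope arithmetic for the PV-entry and page-table-page claims
  above.  The formulas and the 64-process / 8GB figures are illustrative
  assumptions, not the kernel's exact accounting:

      #include <stdio.h>

      int
      main(void)
      {
              unsigned long procs   = 64;                 /* e.g. Postgres clients */
              unsigned long shm     = 8UL << 30;          /* 8GB shared segment */
              unsigned long pages   = shm / 4096;         /* 4K terminal pages */
              unsigned long ptpages = shm / (2UL << 20);  /* 2MB page table pages */

              printf("without sharing: ~%lu PV entries, ~%lu PT pages\n",
                     procs * pages, procs * ptpages);
              printf("with sharing:    ~%lu PV entries, ~%lu PT pages\n",
                     pages, ptpages);
              return 0;
      }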

* EXPERIMENTAL!!!!!

16 files changed:
sys/kern/kern_slaballoc.c
sys/platform/pc32/acpica5/acpi_wakeup.c
sys/platform/pc32/i386/pmap.c
sys/platform/pc32/include/pmap.h
sys/platform/pc64/include/pmap.h
sys/platform/pc64/include/pmap_inval.h
sys/platform/pc64/x86_64/pmap.c
sys/platform/pc64/x86_64/pmap_inval.c
sys/platform/vkernel/include/pmap.h
sys/platform/vkernel/platform/pmap.c
sys/platform/vkernel64/include/pmap.h
sys/platform/vkernel64/platform/pmap.c
sys/vm/pmap.h
sys/vm/vm_fault.c
sys/vm/vm_object.c
sys/vm/vm_object.h

sys/kern/kern_slaballoc.c
index a923f2b..9fc8a59 100644
@@ -1536,7 +1536,8 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags)
         */
        m->valid = VM_PAGE_BITS_ALL;
        vm_page_wire(m);
-       pmap_enter(&kernel_pmap, addr + i, m, VM_PROT_ALL | VM_PROT_NOSYNC, 1);
+       pmap_enter(&kernel_pmap, addr + i, m, VM_PROT_ALL | VM_PROT_NOSYNC,
+                  1, NULL);
        if ((m->flags & PG_ZERO) == 0 && (flags & M_ZERO))
            bzero((char *)addr + i, PAGE_SIZE);
        vm_page_flag_clear(m, PG_ZERO);
sys/platform/pc32/acpica5/acpi_wakeup.c
index 53806a6..2ff2d50 100644
@@ -223,7 +223,8 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state)
                opage = PHYS_TO_VM_PAGE(oldphys);
        page = PHYS_TO_VM_PAGE(sc->acpi_wakephys);
        pmap_enter(pm, sc->acpi_wakephys, page,
-                  VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE, 1);
+                  VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE,
+                  1, NULL);
 
        ret_addr = 0;
        if (acpi_savecpu()) {
@@ -294,7 +295,8 @@ out:
        vm_page_unlock_queues();
        if (opage) {
                pmap_enter(pm, sc->acpi_wakephys, page,
-                          VM_PROT_READ | VM_PROT_WRITE, 0);
+                          VM_PROT_READ | VM_PROT_WRITE,
+                          0, NULL);
        }
 
        if (pteobj_allocated) {
sys/platform/pc32/i386/pmap.c
index 3d362ea..d2e14c5 100644
@@ -1611,27 +1611,6 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
        vm_object_drop(kptobj);
 }
 
-/*
- * Retire the given physical map from service.
- *
- * Should only be called if the map contains no valid mappings.
- *
- * No requirements.
- */
-void
-pmap_destroy(pmap_t pmap)
-{
-       if (pmap == NULL)
-               return;
-
-       lwkt_gettoken(&vm_token);
-       if (--pmap->pm_count == 0) {
-               pmap_release(pmap);
-               panic("destroying a pmap is not yet implemented");
-       }
-       lwkt_reltoken(&vm_token);
-}
-
 /*
  * Add a reference to the specified pmap.
  *
@@ -2190,7 +2169,7 @@ again:
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
-          boolean_t wired)
+          boolean_t wired, vm_map_entry_t entry __unused)
 {
        vm_paddr_t pa;
        unsigned *pte;
@@ -2714,7 +2693,8 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
  * No requirements.
  */
 void
-pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
+pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
+                  vm_map_entry_t entry __unused)
 {
        unsigned *pte;
 
@@ -3621,3 +3601,15 @@ pmap_kvtom(vm_offset_t va)
 {
        return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME));
 }
+
+void
+pmap_object_init(vm_object_t object)
+{
+       /* empty */
+}
+
+void
+pmap_object_free(vm_object_t object)
+{
+       /* empty */
+}
sys/platform/pc32/include/pmap.h
index a82e923..f52a54c 100644
@@ -212,6 +212,9 @@ struct md_page {
        TAILQ_HEAD(,pv_entry)   pv_list;
 };
 
+struct md_object {
+};
+
 /*
  * Each machine dependent implementation is expected to
  * keep certain statistics.  They may do this anyway they
sys/platform/pc64/include/pmap.h
index 870e49a..6c67d6c 100644
@@ -209,15 +209,29 @@ pte_store(pt_entry_t *ptep, pt_entry_t pte)
 /*
  * Pmap stuff
  */
+struct pmap;
 struct pv_entry;
 struct vm_page;
 struct vm_object;
 struct vmspace;
 
+/*
+ * vm_page structures embed a list of related pv_entry's
+ */
 struct md_page {
        TAILQ_HEAD(,pv_entry)   pv_list;
 };
 
+/*
+ * vm_object's representing large mappings can contain embedded pmaps
+ * to organize sharing at higher page table levels for PROT_READ and
+ * PROT_READ|PROT_WRITE maps.
+ */
+struct md_object {
+       struct pmap *pmap_rw;
+       struct pmap *pmap_ro;
+};
+
 /*
  * Each machine dependent implementation is expected to
  * keep certain statistics.  They may do this anyway they
@@ -244,7 +258,7 @@ struct pmap {
        RB_HEAD(pv_entry_rb_tree, pv_entry) pm_pvroot;
        int                     pm_count;       /* reference count */
        cpumask_t               pm_active;      /* active on cpus */
-       int                     pm_filler02;    /* (filler sync w/vkernel) */
+       int                     pm_flags;
        struct pmap_statistics  pm_stats;       /* pmap statistics */
        struct pv_entry         *pm_pvhint;     /* pv_entry lookup hint */
        int                     pm_generation;  /* detect pvlist deletions */
@@ -255,6 +269,8 @@ struct pmap {
 #define CPUMASK_LOCK           CPUMASK(SMP_MAXCPU)
 #define CPUMASK_BIT            SMP_MAXCPU      /* for 1LLU << SMP_MAXCPU */
 
+#define PMAP_FLAG_SIMPLE       0x00000001
+
 #define pmap_resident_count(pmap) (pmap)->pm_stats.resident_count
 
 typedef struct pmap    *pmap_t;
@@ -274,17 +290,19 @@ typedef struct pv_entry {
        RB_ENTRY(pv_entry)      pv_entry;
        struct vm_page  *pv_m;          /* page being mapped */
        u_int           pv_hold;        /* interlock action */
-       u_int           pv_unused01;
+       u_int           pv_flags;
 #ifdef PMAP_DEBUG
        const char      *pv_func;
        int             pv_line;
 #endif
 } *pv_entry_t;
 
-#define PV_HOLD_LOCKED 0x80000000U
-#define PV_HOLD_WAITING        0x40000000U
-#define PV_HOLD_DELETED        0x20000000U
-#define PV_HOLD_MASK   0x1FFFFFFFU
+#define PV_HOLD_LOCKED         0x80000000U
+#define PV_HOLD_WAITING                0x40000000U
+#define PV_HOLD_DELETED                0x20000000U
+#define PV_HOLD_MASK           0x1FFFFFFFU
+
+#define PV_FLAG_VMOBJECT       0x00000001U     /* shared pt in VM obj */
 
 #ifdef _KERNEL
 
sys/platform/pc64/include/pmap_inval.h
index a1685a9..5dcbc8b 100644
@@ -61,6 +61,7 @@ typedef pmap_inval_info *pmap_inval_info_t;
 
 void pmap_inval_init(pmap_inval_info_t);
 void pmap_inval_interlock(pmap_inval_info_t, pmap_t, vm_offset_t);
+void pmap_inval_invltlb(pmap_inval_info_t);
 void pmap_inval_deinterlock(pmap_inval_info_t, pmap_t);
 void pmap_inval_done(pmap_inval_info_t);
 
sys/platform/pc64/x86_64/pmap.c
index e23f564..928ef7c 100644
@@ -6,7 +6,7 @@
  * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
  * Copyright (c) 2008, 2009 The DragonFly Project.
  * Copyright (c) 2008, 2009 Jordan Gordeev.
- * Copyright (c) 2011 Matthew Dillon
+ * Copyright (c) 2011-2012 Matthew Dillon
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
@@ -154,6 +154,8 @@ static int protection_codes[8];
 struct pmap kernel_pmap;
 static TAILQ_HEAD(,pmap)       pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
 
+MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects");
+
 vm_paddr_t avail_start;                /* PA of first available physical page */
 vm_paddr_t avail_end;          /* PA of last available physical page */
 vm_offset_t virtual2_start;    /* cutout free area prior to kernel start */
@@ -209,6 +211,9 @@ static caddr_t crashdumpmap;
 static int pmap_yield_count = 64;
 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
     &pmap_yield_count, 0, "Yield during init_pt/release");
+static int pmap_mmu_optimize = 0;
+SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
+    &pmap_mmu_optimize, 0, "Share page table pages when possible");
 
 #define DISABLE_PSE
 
@@ -230,16 +235,19 @@ static void pv_free(pv_entry_t pv);
 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
                      pv_entry_t *pvpp);
+static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex,
+                     pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va);
 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
                      struct pmap_inval_info *info);
 static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
+static int pmap_release_pv(pv_entry_t pv);
 
 static void pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
-                     pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
-                     pt_entry_t *ptep, void *arg __unused);
+                     pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
+                     vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
 static void pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
-                     pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
-                     pt_entry_t *ptep, void *arg __unused);
+                     pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
+                     vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
 
 static void i386_protection_init (void);
 static void create_pagetables(vm_paddr_t *firstaddr);
@@ -395,6 +403,9 @@ pmap_pdp_index(vm_offset_t va)
 
 /*
  * Generic procedure to index a pte from a pt, pd, or pdp.
+ *
+ * NOTE: Normally passed pindex as pmap_xx_index().  pmap_xx_pindex() is NOT
+ *      a page table page index but is instead a PV lookup index.
  */
 static
 void *
@@ -1301,24 +1312,37 @@ pmap_pinit0(struct pmap *pmap)
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  */
-void
-pmap_pinit(struct pmap *pmap)
+static void
+pmap_pinit_simple(struct pmap *pmap)
 {
-       pv_entry_t pv;
-       int j;
-
        /*
         * Misc initialization
         */
        pmap->pm_count = 1;
        pmap->pm_active = 0;
        pmap->pm_pvhint = NULL;
+       pmap->pm_flags = PMAP_FLAG_SIMPLE;
+
+       /*
+        * Don't blow up locks/tokens on re-use (XXX fix/use drop code
+        * for this).
+        */
        if (pmap->pm_pmlpv == NULL) {
                RB_INIT(&pmap->pm_pvroot);
                bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
                spin_init(&pmap->pm_spin);
                lwkt_token_init(&pmap->pm_token, "pmap_tok");
        }
+}
+
+void
+pmap_pinit(struct pmap *pmap)
+{
+       pv_entry_t pv;
+       int j;
+
+       pmap_pinit_simple(pmap);
+       pmap->pm_flags &= ~PMAP_FLAG_SIMPLE;
 
        /*
         * No need to allocate page table space yet but we do need a valid
@@ -1416,9 +1440,6 @@ pmap_puninit(pmap_t pmap)
 void
 pmap_pinit2(struct pmap *pmap)
 {
-       /*
-        * XXX copies current process, does not fill in MPPTDI
-        */
        spin_lock(&pmap_spin);
        TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
        spin_unlock(&pmap_spin);
@@ -1443,12 +1464,14 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
        vm_pindex_t pt_pindex;
        vm_page_t m;
        int isnew;
+       int ispt;
 
        /*
         * If the pv already exists and we aren't being asked for the
         * parent page table page we can just return it.  A locked+held pv
         * is returned.
         */
+       ispt = 0;
        pv = pv_alloc(pmap, ptepindex, &isnew);
        if (isnew == 0 && pvpp == NULL)
                return(pv);
@@ -1505,13 +1528,23 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
                 */
                ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
                ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
+               ispt = 1;
        } else if (ptepindex < pmap_pdp_pindex(0)) {
                /*
                 * pv is PD, pvp is PDP
+                *
+                * SIMPLE PMAP NOTE: Simple pmaps do not allocate above
+                *                   the PD.
                 */
                ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
                ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
-               pvp = pmap_allocpte(pmap, ptepindex, NULL);
+
+               if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
+                       KKASSERT(pvpp == NULL);
+                       pvp = NULL;
+               } else {
+                       pvp = pmap_allocpte(pmap, ptepindex, NULL);
+               }
                if (!isnew)
                        goto notnew;
 
@@ -1585,11 +1618,33 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
         * we just put it away.
         *
         * No interlock is needed for pte 0 -> non-zero.
+        *
+        * In the situation where *ptep is valid we might have an unmanaged
+        * page table page shared from another page table which we need to
+        * unshare before installing our private page table page.
         */
        if (pvp) {
-               vm_page_wire_quick(pvp->pv_m);
                ptep = pv_pte_lookup(pvp, ptepindex);
-               KKASSERT((*ptep & PG_V) == 0);
+               if (*ptep & PG_V) {
+                       pt_entry_t pte;
+                       pmap_inval_info info;
+
+                       kprintf("pmap_allocpte: restate shared pg table pg\n");
+
+                       if (ispt == 0) {
+                               panic("pmap_allocpte: unexpected pte %p/%d",
+                                     pvp, (int)ptepindex);
+                       }
+                       pmap_inval_init(&info);
+                       pmap_inval_interlock(&info, pmap, (vm_offset_t)-1);
+                       pte = pte_load_clear(ptep);
+                       pmap_inval_deinterlock(&info, pmap);
+                       pmap_inval_done(&info);
+                       if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
+                               panic("pmap_allocpte: shared pgtable pg bad wirecount");
+               } else {
+                       vm_page_wire_quick(pvp->pv_m);
+               }
                *ptep = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
                                              PG_A | PG_M);
        }
@@ -1602,6 +1657,206 @@ notnew:
        return (pv);
 }
 
+/*
+ * This version of pmap_allocpte() checks for possible segment optimizations
+ * that would allow page-table sharing.  It can be called for terminal
+ * page or page table page ptepindex's.
+ *
+ * The function is called with page table page ptepindex's for fictitious
+ * and unmanaged terminal pages.  That is, we don't want to allocate a
+ * terminal pv, we just want the pt_pv.  pvpp is usually passed as NULL
+ * for this case.
+ *
+ * This function can return a pv and *pvpp associated with the passed in pmap
+ * OR a pv and *pvpp associated with the shared pmap.  In the latter case
+ * an unmanaged page table page will be entered into the passed-in pmap.
+ */
+static
+pv_entry_t
+pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp,
+                 vm_map_entry_t entry, vm_offset_t va)
+{
+       struct pmap_inval_info info;
+       vm_object_t object;
+       pmap_t obpmap;
+       pmap_t *obpmapp;
+       vm_offset_t b;
+       pv_entry_t pte_pv;      /* in original or shared pmap */
+       pv_entry_t pt_pv;       /* in original or shared pmap */
+       pv_entry_t proc_pd_pv;  /* in original pmap */
+       pv_entry_t proc_pt_pv;  /* in original pmap */
+       pv_entry_t xpv;         /* PT in shared pmap */
+       pd_entry_t *pt;         /* PT entry in PD of original pmap */
+       pd_entry_t opte;        /* contents of *pt */
+       pd_entry_t npte;        /* contents of *pt */
+       vm_page_t m;
+
+       /*
+        * Basic tests, require a non-NULL vm_map_entry, require proper
+        * alignment and type for the vm_map_entry, require that the
+        * underlying object already be allocated.
+        *
+        * We currently allow any type of object to use this optimization.
+        * The object itself does NOT have to be sized to a multiple of the
+        * segment size, but the memory mapping does.
+        */
+       if (entry == NULL ||
+           pmap_mmu_optimize == 0 ||                   /* not enabled */
+           ptepindex >= pmap_pd_pindex(0) ||           /* not terminal */
+           entry->inheritance != VM_INHERIT_SHARE ||   /* not shared */
+           entry->maptype != VM_MAPTYPE_NORMAL ||      /* weird map type */
+           entry->object.vm_object == NULL ||          /* needs VM object */
+           (entry->offset & SEG_MASK) ||               /* must be aligned */
+           (entry->start & SEG_MASK)) {
+               return(pmap_allocpte(pmap, ptepindex, pvpp));
+       }
+
+       /*
+        * Make sure the full segment can be represented.
+        */
+       b = va & ~(vm_offset_t)SEG_MASK;
+       if (b < entry->start && b + SEG_SIZE > entry->end)
+               return(pmap_allocpte(pmap, ptepindex, pvpp));
+
+       /*
+        * If the full segment can be represented dive the VM object's
+        * shared pmap, allocating as required.
+        */
+       object = entry->object.vm_object;
+
+       if (entry->protection & VM_PROT_WRITE)
+               obpmapp = &object->md.pmap_rw;
+       else
+               obpmapp = &object->md.pmap_ro;
+
+       /*
+        * We allocate what appears to be a normal pmap but because portions
+        * of this pmap are shared with other unrelated pmaps we have to
+        * set pm_active to point to all cpus.
+        *
+        * XXX Currently using pmap_spin to interlock the update, can't use
+        *     vm_object_hold/drop because the token might already be held
+        *     shared OR exclusive and we don't know.
+        */
+       while ((obpmap = *obpmapp) == NULL) {
+               obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO);
+               pmap_pinit_simple(obpmap);
+               pmap_pinit2(obpmap);
+               spin_lock(&pmap_spin);
+               if (*obpmapp != NULL) {
+                       /*
+                        * Handle race
+                        */
+                       spin_unlock(&pmap_spin);
+                       pmap_release(obpmap);
+                       pmap_puninit(obpmap);
+                       kfree(obpmap, M_OBJPMAP);
+               } else {
+                       obpmap->pm_active = smp_active_mask;
+                       *obpmapp = obpmap;
+                       spin_unlock(&pmap_spin);
+               }
+       }
+
+       /*
+        * Layering is: PTE, PT, PD, PDP, PML4.  We have to return the
+        * pte/pt using the shared pmap from the object but also adjust
+        * the process pmap's page table page as a side effect.
+        */
+
+       /*
+        * Resolve the terminal PTE and PT in the shared pmap.  This is what
+        * we will return.  This is true if ptepindex represents a terminal
+        * page, otherwise pte_pv is actually the PT and pt_pv is actually
+        * the PD.
+        */
+       pt_pv = NULL;
+       pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv);
+       if (ptepindex >= pmap_pt_pindex(0))
+               xpv = pte_pv;
+       else
+               xpv = pt_pv;
+
+       /*
+        * Resolve the PD in the process pmap so we can properly share the
+        * page table page.  Lock order is bottom-up (leaf first)!
+        *
+        * NOTE: proc_pt_pv can be NULL.
+        */
+       proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b));
+       proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL);
+
+       /*
+        * xpv is the page table page pv from the shared object
+        * (for convenience).
+        *
+        * Calculate the pte value for the PT to load into the process PD.
+        * If we have to change it we must properly dispose of the previous
+        * entry.
+        */
+       pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b));
+       npte = VM_PAGE_TO_PHYS(xpv->pv_m) |
+              (PG_U | PG_RW | PG_V | PG_A | PG_M);
+       if (*pt == 0) {
+               *pt = npte;
+               vm_page_wire_quick(xpv->pv_m);
+               vm_page_wire_quick(proc_pd_pv->pv_m);
+               atomic_add_long(&pmap->pm_stats.resident_count, 1);
+       } else if (*pt != npte) {
+               pmap_inval_init(&info);
+               pmap_inval_interlock(&info, pmap, (vm_offset_t)-1);
+               if (*pt != npte) {
+                       opte = pte_load_clear(pt);
+                       *pt = npte;
+                       vm_page_wire_quick(xpv->pv_m);
+
+                       /*
+                        * Clean up opte, bump the wire_count for the process
+                        * PD page representing the new entry if it was
+                        * previously empty.
+                        *
+                        * If the entry was not previously empty and we have
+                        * a PT in the proc pmap then opte must match that
+                        * pt.  The proc pt must be retired (this is done
+                        * later on in this procedure).
+                        */
+                       if (opte & PG_V) {
+                               m = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+                               if (proc_pt_pv) {
+                                       KKASSERT(proc_pt_pv->pv_m == m);
+                               } else {
+                                       if (vm_page_unwire_quick(m)) {
+                                               panic("pmap_allocpte_seg: "
+                                                     "bad wire count %p",
+                                                     m);
+                                       }
+                               }
+                       } else {
+                               vm_page_wire_quick(proc_pd_pv->pv_m);
+                       }
+               }
+               pmap_inval_deinterlock(&info, pmap);
+               pmap_inval_done(&info);
+       } else {
+               KKASSERT(proc_pt_pv == NULL);
+       }
+
+       /*
+        * The existing process page table was replaced and must be destroyed
+        * here.
+        */
+       if (proc_pd_pv)
+               pv_put(proc_pd_pv);
+       if (proc_pt_pv)
+               pmap_release_pv(proc_pt_pv);
+       if (pvpp)
+               *pvpp = pt_pv;
+       else
+               pv_put(pt_pv);
+
+       return (pte_pv);
+}
+
 /*
  * Release any resources held by the given physical map.
  *
@@ -1647,7 +1902,9 @@ pmap_release(struct pmap *pmap)
         * One resident page (the pml4 page) should remain.
         * No wired pages should remain.
         */
-       KKASSERT(pmap->pm_stats.resident_count == 1);
+       KKASSERT(pmap->pm_stats.resident_count ==
+                ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1));
+
        KKASSERT(pmap->pm_stats.wired_count == 0);
 }
 
@@ -1656,7 +1913,7 @@ pmap_release_callback(pv_entry_t pv, void *data)
 {
        struct pmap_release_info *info = data;
        pmap_t pmap = info->pmap;
-       vm_page_t p;
+       int r;
 
        if (pv_hold_try(pv)) {
                spin_unlock(&pmap->pm_spin);
@@ -1670,6 +1927,19 @@ pmap_release_callback(pv_entry_t pv, void *data)
                        return(-1);
                }
        }
+       r = pmap_release_pv(pv);
+       spin_lock(&pmap->pm_spin);
+       return(r);
+}
+
+/*
+ * Called with held (i.e. also locked) pv.  This function will dispose of
+ * the lock along with the pv.
+ */
+static int
+pmap_release_pv(pv_entry_t pv)
+{
+       vm_page_t p;
 
        /*
         * The pmap is currently not spinlocked, pv is held+locked.
@@ -1698,7 +1968,6 @@ pmap_release_callback(pv_entry_t pv, void *data)
         */
        if (pv->pv_pindex == pmap_pml4_pindex()) {
                pv_put(pv);
-               spin_lock(&pmap->pm_spin);
                return(-1);
        }
 
@@ -1719,12 +1988,16 @@ pmap_release_callback(pv_entry_t pv, void *data)
 
        vm_page_unwire(p, 0);
        KKASSERT(p->wire_count == 0);
-       /* JG eventually revert to using vm_page_free_zero() */
+
+       /*
+        * Theoretically this page, if not the pml4 page, should contain
+        * all-zeros.  But it's just too dangerous to mark it PG_ZERO.  Free
+        * normally.
+        */
        vm_page_free(p);
 skip:
        pv_free(pv);
-       spin_lock(&pmap->pm_spin);
-       return(0);
+       return 0;
 }
 
 /*
@@ -1776,6 +2049,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                if (pvp == NULL) {
                        pml4_pindex = pmap_pml4_pindex();
                        pvp = pv_get(pv->pv_pmap, pml4_pindex);
+                       KKASSERT(pvp);
                        gotpvp = 1;
                }
                pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
@@ -1785,7 +2059,11 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                KKASSERT(info == NULL);
        } else if (ptepindex >= pmap_pd_pindex(0)) {
                /*
-                *  Remove a PD page from the pdp
+                * Remove a PD page from the pdp
+                *
+                * SIMPLE PMAP NOTE: Non-existant pvp's are ok in the case
+                * SIMPLE PMAP NOTE: Non-existent pvp's are ok in the case
+                *                   the PD page.
                 */
                vm_pindex_t pdp_pindex;
                vm_pindex_t pd_index;
@@ -1797,12 +2075,19 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                        pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
                                     (pd_index >> NPML4EPGSHIFT);
                        pvp = pv_get(pv->pv_pmap, pdp_pindex);
-                       gotpvp = 1;
+                       if (pvp)
+                               gotpvp = 1;
+               }
+               if (pvp) {
+                       pd = pv_pte_lookup(pvp, pd_index &
+                                               ((1ul << NPDPEPGSHIFT) - 1));
+                       KKASSERT((*pd & PG_V) != 0);
+                       p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
+                       *pd = 0;
+               } else {
+                       KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE);
+                       p = pv->pv_m;           /* degenerate test later */
                }
-               pd = pv_pte_lookup(pvp, pd_index & ((1ul << NPDPEPGSHIFT) - 1));
-               KKASSERT((*pd & PG_V) != 0);
-               p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
-               *pd = 0;
                KKASSERT(info == NULL);
        } else if (ptepindex >= pmap_pt_pindex(0)) {
                /*
@@ -1818,6 +2103,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                        pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
                                    (pt_index >> NPDPEPGSHIFT);
                        pvp = pv_get(pv->pv_pmap, pd_pindex);
+                       KKASSERT(pvp);
                        gotpvp = 1;
                }
                pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
@@ -1848,6 +2134,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                                pt_pindex = NUPTE_TOTAL +
                                            (ptepindex >> NPDPEPGSHIFT);
                                pvp = pv_get(pv->pv_pmap, pt_pindex);
+                               KKASSERT(pvp);
                                gotpvp = 1;
                        }
                        ptep = pv_pte_lookup(pvp, ptepindex &
@@ -2042,36 +2329,24 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
 }
 
 /*
- *     Retire the given physical map from service.
- *     Should only be called if the map contains
- *     no valid mappings.
+ *     Add a reference to the specified pmap.
  */
 void
-pmap_destroy(pmap_t pmap)
+pmap_reference(pmap_t pmap)
 {
-       int count;
-
-       if (pmap == NULL)
-               return;
-
-       lwkt_gettoken(&pmap->pm_token);
-       count = --pmap->pm_count;
-       if (count == 0) {
-               pmap_release(pmap);     /* eats pm_token */
-               panic("destroying a pmap is not yet implemented");
+       if (pmap != NULL) {
+               lwkt_gettoken(&pmap->pm_token);
+               ++pmap->pm_count;
+               lwkt_reltoken(&pmap->pm_token);
        }
-       lwkt_reltoken(&pmap->pm_token);
 }
 
-/*
- *     Add a reference to the specified pmap.
- */
 void
-pmap_reference(pmap_t pmap)
+pmap_drop(pmap_t pmap)
 {
        if (pmap != NULL) {
                lwkt_gettoken(&pmap->pm_token);
-               pmap->pm_count++;
+               --pmap->pm_count;
                lwkt_reltoken(&pmap->pm_token);
        }
 }
@@ -2473,19 +2748,24 @@ pmap_collect(void)
 
 /*
  * Scan the pmap for active page table entries and issue a callback.
- * The callback must dispose of pte_pv.
+ * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
+ * its parent page table.
  *
- * NOTE: Unmanaged page table entries will not have a pte_pv
+ * pte_pv will be NULL if the page is unmanaged.
+ * pt_pv will point to the page table page containing the pte for the page.
  *
- * NOTE: Kernel page table entries will not have a pt_pv.  That is, wiring
- *      counts are not tracked in kernel page table pages.
+ * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
+ *      we pass a NULL pte_pv and we pass a pt_pv pointing to the passed
+ *      process pmap's PD and page to the callback function.  This can be
+ *      confusing because the pt_pv is really a pd_pv, and the target page
+ *      table page is simply aliased by the pmap and not owned by it.
  *
  * It is assumed that the start and end are properly rounded to the page size.
  */
 static void
 pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva,
          void (*func)(pmap_t, struct pmap_inval_info *,
-                      pv_entry_t, pv_entry_t, vm_offset_t,
+                      pv_entry_t, pv_entry_t, int, vm_offset_t,
                       pt_entry_t *, void *),
          void *arg)
 {
@@ -2531,13 +2811,27 @@ pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva,
                        ptep = vtopte(sva);
                } else {
                        /*
-                        * User mappings may or may not have a pte_pv but
-                        * will always have a pt_pv if the page is present.
+                        * User pages which are unmanaged will not have a
+                        * pte_pv.  User page table pages which are unmanaged
+                        * (shared from elsewhere) will also not have a pt_pv.
+                        * The func() callback will pass both pte_pv and pt_pv
+                        * as NULL in that case.
                         */
                        pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
                        pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
                        if (pt_pv == NULL) {
                                KKASSERT(pte_pv == NULL);
+                               pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
+                               if (pd_pv) {
+                                       ptep = pv_pte_lookup(pd_pv,
+                                                       pmap_pt_index(sva));
+                                       if (*ptep) {
+                                               func(pmap, &info,
+                                                    NULL, pd_pv, 1,
+                                                    sva, ptep, arg);
+                                       }
+                                       pv_put(pd_pv);
+                               }
                                goto fast_skip;
                        }
                        ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
@@ -2555,12 +2849,12 @@ pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva,
                                                                PG_V),
                                ("bad *ptep %016lx sva %016lx pte_pv %p",
                                *ptep, sva, pte_pv));
-                       func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
+                       func(pmap, &info, pte_pv, pt_pv, 0, sva, ptep, arg);
                } else {
                        KASSERT((*ptep & (PG_MANAGED|PG_V)) == PG_V,
                                ("bad *ptep %016lx sva %016lx pte_pv NULL",
                                *ptep, sva));
-                       func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
+                       func(pmap, &info, NULL, pt_pv, 0, sva, ptep, arg);
                }
                if (pt_pv)
                        pv_put(pt_pv);
@@ -2659,10 +2953,29 @@ fast_skip:
                }
 
                /*
-                * We will scan or skip a page table page so adjust va_next
-                * either way.
+                * If pt_pv is NULL we either have a shared page table
+                * page and must issue a callback specific to that case,
+                * or there is no page table page.
+                *
+                * Either way we can skip the page table page.
                 */
                if (pt_pv == NULL) {
+                       /*
+                        * Possible unmanaged (shared from another pmap)
+                        * page table page.
+                        */
+                       if (pd_pv == NULL)
+                               pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
+                       KKASSERT(pd_pv != NULL);
+                       ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva));
+                       if (*ptep & PG_V) {
+                               func(pmap, &info, NULL, pd_pv, 1,
+                                    sva, ptep, arg);
+                       }
+
+                       /*
+                        * Done, move to next page table page.
+                        */
                        va_next = (sva + NBPDR) & ~PDRMASK;
                        if (va_next < sva)
                                va_next = eva;
@@ -2735,6 +3048,12 @@ kernel_skip:
                                        pte_pv = NULL;
                                        pt_pv = pv_get(pmap,
                                                       pmap_pt_pindex(sva));
+                                       /*
+                                        * pt_pv reloaded, need new ptep
+                                        */
+                                       KKASSERT(pt_pv != NULL);
+                                       ptep = pv_pte_lookup(pt_pv,
+                                                       pmap_pte_index(sva));
                                        continue;
                                }
                        } else {
@@ -2768,16 +3087,16 @@ kernel_skip:
                                        ("bad *ptep %016lx sva %016lx "
                                         "pte_pv %p",
                                         *ptep, sva, pte_pv));
-                               func(pmap, &info, pte_pv, pt_pv, sva,
-                                    ptep, arg);
+                               func(pmap, &info, pte_pv, pt_pv, 0,
+                                    sva, ptep, arg);
                        } else {
                                KASSERT((*ptep & (PG_MANAGED|PG_V)) ==
                                         PG_V,
                                        ("bad *ptep %016lx sva %016lx "
                                         "pte_pv NULL",
                                         *ptep, sva));
-                               func(pmap, &info, pte_pv, pt_pv, sva,
-                                    ptep, arg);
+                               func(pmap, &info, NULL, pt_pv, 0,
+                                    sva, ptep, arg);
                        }
                        pte_pv = NULL;
                        sva += PAGE_SIZE;
@@ -2808,8 +3127,8 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
 
 static void
 pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
-                    pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
-                    pt_entry_t *ptep, void *arg __unused)
+                    pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
+                    vm_offset_t va, pt_entry_t *ptep, void *arg __unused)
 {
        pt_entry_t pte;
 
@@ -2821,8 +3140,10 @@ pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
                pmap_remove_pv_pte(pte_pv, pt_pv, info);
                pmap_remove_pv_page(pte_pv);
                pv_free(pte_pv);
-       } else {
+       } else if (sharept == 0) {
                /*
+                * Unmanaged page
+                *
                 * pt_pv's wire_count is still bumped by unmanaged pages
                 * so we must decrement it manually.
                 */
@@ -2832,8 +3153,24 @@ pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
                if (pte & PG_W)
                        atomic_add_long(&pmap->pm_stats.wired_count, -1);
                atomic_add_long(&pmap->pm_stats.resident_count, -1);
-               if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m))
+               if (vm_page_unwire_quick(pt_pv->pv_m))
                        panic("pmap_remove: insufficient wirecount");
+       } else {
+               /*
+                * Unmanaged page table, pt_pv is actually the pd_pv
+                * for our pmap (not the share object pmap).
+                *
+                * We have to unwire the target page table page and we
+                * have to unwire our page directory page.
+                */
+               pmap_inval_interlock(info, pmap, va);
+               pte = pte_load_clear(ptep);
+               pmap_inval_deinterlock(info, pmap);
+               atomic_add_long(&pmap->pm_stats.resident_count, -1);
+               if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
+                       panic("pmap_remove: shared pgtable1 bad wirecount");
+               if (vm_page_unwire_quick(pt_pv->pv_m))
+                       panic("pmap_remove: shared pgtable2 bad wirecount");
        }
 }
 
@@ -2882,13 +3219,14 @@ pmap_remove_all(vm_page_t m)
 }
 
 /*
- * pmap_protect:
+ * Set the physical protection on the specified range of this map
+ * as requested.  This function is typically only used for debug watchpoints
+ * and COW pages.
  *
- *     Set the physical protection on the specified range of this map
- *     as requested.
+ * This function may not be called from an interrupt if the map is
+ * not the kernel_pmap.
  *
- *     This function may not be called from an interrupt if the map is
- *     not the kernel_pmap.
+ * NOTE!  For shared page table pages we just unmap the page.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
@@ -2909,11 +3247,12 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 static
 void
 pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
-                     pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
-                     pt_entry_t *ptep, void *arg __unused)
+                     pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept,
+                     vm_offset_t va, pt_entry_t *ptep, void *arg __unused)
 {
        pt_entry_t pbits;
        pt_entry_t cbits;
+       pt_entry_t pte;
        vm_page_t m;
 
        /*
@@ -2939,10 +3278,30 @@ again:
                                cbits &= ~PG_M;
                        }
                }
+       } else if (sharept) {
+               /*
+                * Unmanaged page table, pt_pv is actually the pd_pv
+                * for our pmap (not the share object pmap).
+                *
+                * When asked to protect something in a shared page table
+                * page we just unmap the page table page.  We have to
+                * invalidate the tlb in this situation.
+                */
+               pte = pte_load_clear(ptep);
+               pmap_inval_invltlb(info);
+               if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
+                       panic("pmap_protect: pgtable1 pg bad wirecount");
+               if (vm_page_unwire_quick(pt_pv->pv_m))
+                       panic("pmap_protect: pgtable2 pg bad wirecount");
+               ptep = NULL;
        }
-       cbits &= ~PG_RW;
-       if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
-               goto again;
+       /* else unmanaged page, adjust bits, no wire changes */
+
+       if (ptep) {
+               cbits &= ~PG_RW;
+               if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
+                       goto again;
+               }
        }
        pmap_inval_deinterlock(info, pmap);
        if (pte_pv)
@@ -2953,12 +3312,17 @@ again:
  * Insert the vm_page (m) at the virtual address (va), replacing any prior
  * mapping at that address.  Set protection and wiring as requested.
  *
+ * If entry is non-NULL we check to see if the SEG_SIZE optimization is
+ * possible.  If it is we enter the page into the appropriate shared pmap
+ * hanging off the related VM object instead of the passed pmap, then we
+ * share the page table page from the VM object's pmap into the current pmap.
+ *
  * NOTE: This routine MUST insert the page into the pmap now, it cannot
  *      lazy-evaluate.
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
-          boolean_t wired)
+          boolean_t wired, vm_map_entry_t entry __unused)
 {
        pmap_inval_info info;
        pv_entry_t pt_pv;       /* page table */
@@ -3015,24 +3379,31 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                pte_pv = NULL;
                pt_pv = NULL;
                ptep = vtopte(va);
-       } else if (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) {
+       } else if (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) { /* XXX */
                pte_pv = NULL;
                if (va >= VM_MAX_USER_ADDRESS) {
                        pt_pv = NULL;
                        ptep = vtopte(va);
                } else {
-                       pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
+                       pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va),
+                                                 NULL, entry, va);
                        ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
                }
                KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED) == 0);
        } else {
                if (va >= VM_MAX_USER_ADDRESS) {
+                       /*
+                        * Kernel map, pv_entry-tracked.
+                        */
                        pt_pv = NULL;
                        pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL);
                        ptep = vtopte(va);
                } else {
-                       pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va),
-                                              &pt_pv);
+                       /*
+                        * User map
+                        */
+                       pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va),
+                                                  &pt_pv, entry, va);
                        ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
                }
                KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED));
@@ -3090,12 +3461,20 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                        if (pte_pv->pv_m)
                                pmap_remove_pv_page(pte_pv);
                } else if (prot & VM_PROT_NOSYNC) {
-                       /* leave wire count on PT page intact */
+                       /*
+                        * Unmanaged page, NOSYNC (no mmu sync) requested.
+                        *
+                        * Leave wire count on PT page intact.
+                        */
                        (void)pte_load_clear(ptep);
                        cpu_invlpg((void *)va);
                        atomic_add_long(&pmap->pm_stats.resident_count, -1);
                } else {
-                       /* leave wire count on PT page intact */
+                       /*
+                        * Unmanaged page, normal enter.
+                        *
+                        * Leave wire count on PT page intact.
+                        */
                        pmap_inval_interlock(&info, pmap, va);
                        (void)pte_load_clear(ptep);
                        pmap_inval_deinterlock(&info, pmap);
@@ -3129,9 +3508,10 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
        }
 
        /*
-        * Ok, for UVM (pt_pv != NULL) we don't need to interlock or
-        * invalidate anything, the TLB won't have any stale entries to
-        * remove.
+        * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks.
+        *
+        * User VMAs do not because those will be zero->non-zero, so no
+        * stale entries to worry about at this point.
         *
         * For KVM there appear to still be issues.  Theoretically we
         * should be able to scrap the interlocks entirely but we
@@ -3139,6 +3519,10 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
         */
        if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
                pmap_inval_interlock(&info, pmap, va);
+
+       /*
+        * Set the pte
+        */
        *(volatile pt_entry_t *)ptep = newpte;
 
        if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
@@ -3146,12 +3530,22 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
        else if (pt_pv == NULL)
                cpu_invlpg((void *)va);
 
-       if (wired)
-               atomic_add_long(&pmap->pm_stats.wired_count, 1);
+       if (wired) {
+               if (pte_pv) {
+                       atomic_add_long(&pte_pv->pv_pmap->pm_stats.wired_count,
+                                       1);
+               } else {
+                       atomic_add_long(&pmap->pm_stats.wired_count, 1);
+               }
+       }
        if (newpte & PG_RW)
                vm_page_flag_set(m, PG_WRITEABLE);
-       if (pte_pv == NULL)
-               atomic_add_long(&pmap->pm_stats.resident_count, 1);
+
+       /*
+        * Unmanaged pages need manual resident_count tracking.
+        */
+       if (pte_pv == NULL && pt_pv)
+               atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1);
 
        /*
         * Cleanup
@@ -3180,7 +3574,7 @@ done:
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
-       pmap_enter(pmap, va, m, VM_PROT_READ, FALSE);
+       pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL);
 }
 
 /*
@@ -3228,6 +3622,9 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
        if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
                return;
 
+       /*
+        * Misc additional checks
+        */
        psize = x86_64_btop(size);
 
        if ((object->type != OBJT_VNODE) ||
@@ -3245,6 +3642,18 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
        if (psize == 0)
                return;
 
+       /*
+        * If everything is segment-aligned do not pre-init here.  Instead
+        * allow the normal vm_fault path to pass a segment hint to
+        * pmap_enter() which will then use an object-referenced shared
+        * page table page.
+        */
+       if ((addr & SEG_MASK) == 0 &&
+           (ctob(psize) & SEG_MASK) == 0 &&
+           (ctob(pindex) & SEG_MASK) == 0) {
+               return;
+       }
+
        /*
         * Use a red-black scan to traverse the requested range and load
         * any valid pages found into the pmap.
@@ -3332,7 +3741,8 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
  * exist in the pmap.  The mapping may or may not be managed.
  */
 void
-pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
+pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
+                  vm_map_entry_t entry)
 {
        pt_entry_t *ptep;
        pv_entry_t pv;
@@ -3340,13 +3750,13 @@ pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
        if (pmap == NULL)
                return;
        lwkt_gettoken(&pmap->pm_token);
-       pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
+       pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), NULL, entry, va);
        ptep = pv_pte_lookup(pv, pmap_pte_index(va));
 
        if (wired && !pmap_pte_w(ptep))
-               atomic_add_long(&pmap->pm_stats.wired_count, 1);
+               atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, 1);
        else if (!wired && pmap_pte_w(ptep))
-               atomic_add_long(&pmap->pm_stats.wired_count, -1);
+               atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, -1);
 
        /*
         * Wiring is not a hardware characteristic so there is no need to
@@ -4082,3 +4492,47 @@ pmap_kvtom(vm_offset_t va)
 {
        return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME));
 }
+
+/*
+ * Initialize machine-specific shared page directory support.  This
+ * is executed when a VM object is created.
+ */
+void
+pmap_object_init(vm_object_t object)
+{
+       object->md.pmap_rw = NULL;
+       object->md.pmap_ro = NULL;
+}
+
+/*
+ * Clean up machine-specific shared page directory support.  This
+ * is executed when a VM object is destroyed.
+ */
+void
+pmap_object_free(vm_object_t object)
+{
+       pmap_t pmap;
+
+       if ((pmap = object->md.pmap_rw) != NULL) {
+               object->md.pmap_rw = NULL;
+               kprintf("pmap_object_free: destroying pmap %p in obj %p\n",
+                       pmap, object);
+               pmap_remove_pages(pmap,
+                                 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
+               pmap->pm_active = 0;
+               pmap_release(pmap);
+               pmap_puninit(pmap);
+               kfree(pmap, M_OBJPMAP);
+       }
+       if ((pmap = object->md.pmap_ro) != NULL) {
+               object->md.pmap_ro = NULL;
+               kprintf("pmap_object_free: destroying pmap %p in obj %p\n",
+                       pmap, object);
+               pmap_remove_pages(pmap,
+                                 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
+               pmap->pm_active = 0;
+               pmap_release(pmap);
+               pmap_puninit(pmap);
+               kfree(pmap, M_OBJPMAP);
+       }
+}
sys/platform/pc64/x86_64/pmap_inval.c
index f93a713..874acbd 100644
@@ -116,6 +116,12 @@ pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
     lwkt_cpusync_interlock(&info->pir_cpusync);
 }
 
+void
+pmap_inval_invltlb(pmap_inval_info_t info)
+{
+       info->pir_va = (vm_offset_t)-1;
+}
+
 void
 pmap_inval_deinterlock(pmap_inval_info_t info, pmap_t pmap)
 {
sys/platform/vkernel/include/pmap.h
index 20974f0..56ad735 100644
@@ -113,6 +113,9 @@ struct md_page {
        TAILQ_HEAD(,pv_entry)   pv_list;
 };
 
+struct md_object {
+};
+
 /*
  * Each machine dependent implementation is expected to
  * keep certain statistics.  They may do this anyway they
sys/platform/vkernel/platform/pmap.c
index cb85225..e455580 100644
@@ -375,26 +375,6 @@ pmap_release_callback(struct vm_page *p, void *data)
        return(0);
 }
 
-/*
- * Retire the given physical map from service.  Should only be called if
- * the map contains no valid mappings.
- *
- * No requirements.
- */
-void
-pmap_destroy(pmap_t pmap)
-{
-       if (pmap == NULL)
-               return;
-
-       lwkt_gettoken(&vm_token);
-       if (--pmap->pm_count == 0) {
-               pmap_release(pmap);
-               panic("destroying a pmap is not yet implemented");
-       }
-       lwkt_reltoken(&vm_token);
-}
-
 /*
  * Add a reference to the specified pmap.
  *
@@ -1752,7 +1732,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
-          boolean_t wired)
+          boolean_t wired, vm_map_entry_t entry __unused)
 {
        vm_paddr_t pa;
        vpte_t *pte;
@@ -2180,7 +2160,8 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
  * No other requirements.
  */
 void
-pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
+pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
+                  vm_map_entry_t entry __unused)
 {
        vpte_t *pte;
 
@@ -3091,3 +3072,15 @@ pmap_kvtom(vm_offset_t va)
        ptep = KernelPTA + (va >> PAGE_SHIFT);
        return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
 }
+
+void
+pmap_object_init(vm_object_t object)
+{
+       /* empty */
+}
+
+void
+pmap_object_free(vm_object_t object)
+{
+       /* empty */
+}
sys/platform/vkernel64/include/pmap.h
index 2a9ffc2..0c8c064 100644
@@ -138,6 +138,9 @@ struct md_page {
        TAILQ_HEAD(,pv_entry)   pv_list;
 };
 
+struct md_object {
+};
+
 /*
  * Each machine dependent implementation is expected to
  * keep certain statistics.  They may do this anyway they
sys/platform/vkernel64/platform/pmap.c
index e871124..2248790 100644
@@ -1595,26 +1595,6 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
        vm_object_drop(kptobj);
 }
 
-/*
- * Retire the given physical map from service.  Should only be called
- * if the map contains no valid mappings.
- *
- * No requirements.
- */
-void
-pmap_destroy(pmap_t pmap)
-{
-       if (pmap == NULL)
-               return;
-
-       lwkt_gettoken(&vm_token);
-       if (--pmap->pm_count == 0) {
-               pmap_release(pmap);
-               panic("destroying a pmap is not yet implemented");
-       }
-       lwkt_reltoken(&vm_token);
-}
-
 /*
  * Add a reference to the specified pmap.
  *
@@ -2200,7 +2180,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
-          boolean_t wired)
+          boolean_t wired, vm_map_entry_t entry __unused)
 {
        vm_paddr_t pa;
        pd_entry_t *pde;
@@ -2602,7 +2582,8 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
  * No other requirements.
  */
 void
-pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
+pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
+                  vm_map_entry_t entry __unused)
 {
        pt_entry_t *pte;
 
@@ -3295,3 +3276,15 @@ pmap_kvtom(vm_offset_t va)
        ptep = vtopte(va);
        return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
 }
+
+void
+pmap_object_init(vm_object_t object)
+{
+       /* empty */
+}
+
+void
+pmap_object_free(vm_object_t object)
+{
+       /* empty */
+}
index 97c41c5..315f807 100644 (file)
@@ -97,6 +97,7 @@ struct thread;
 struct vm_page;
 struct vmspace;
 struct vmspace_entry;
+struct vm_map_entry;
 
 /*
  * Most of these variables represent parameters set up by low level MD kernel
@@ -143,7 +144,8 @@ kva_p(const void *addr)
 #endif
 }
 
-void            pmap_change_wiring (pmap_t, vm_offset_t, boolean_t);
+void            pmap_change_wiring (pmap_t, vm_offset_t, boolean_t,
+                       vm_map_entry_t);
 void            pmap_clear_modify (struct vm_page *m);
 void            pmap_clear_reference (struct vm_page *m);
 void            pmap_collect (void);
@@ -151,9 +153,8 @@ void                 pmap_copy (pmap_t, pmap_t, vm_offset_t, vm_size_t,
                        vm_offset_t);
 void            pmap_copy_page (vm_paddr_t, vm_paddr_t);
 void            pmap_copy_page_frag (vm_paddr_t, vm_paddr_t, size_t bytes);
-void            pmap_destroy (pmap_t);
 void            pmap_enter (pmap_t, vm_offset_t, struct vm_page *,
-                       vm_prot_t, boolean_t);
+                       vm_prot_t, boolean_t, struct vm_map_entry *);
 void            pmap_enter_quick (pmap_t, vm_offset_t, struct vm_page *);
 vm_paddr_t      pmap_extract (pmap_t pmap, vm_offset_t va);
 void            pmap_growkernel (vm_offset_t, vm_offset_t);
@@ -183,6 +184,7 @@ void                 pmap_kmodify_nc(vm_offset_t va);
 void            pmap_kremove (vm_offset_t);
 void            pmap_kremove_quick (vm_offset_t);
 void            pmap_reference (pmap_t);
+void            pmap_drop (pmap_t);
 void            pmap_remove (pmap_t, vm_offset_t, vm_offset_t);
 void            pmap_remove_pages (pmap_t, vm_offset_t, vm_offset_t);
 void            pmap_zero_page (vm_paddr_t);
@@ -199,6 +201,9 @@ vm_offset_t  pmap_addr_hint (vm_object_t obj, vm_offset_t addr, vm_size_t size);
 void           *pmap_kenter_temporary (vm_paddr_t pa, long i);
 void            pmap_init2 (void);
 struct vm_page *pmap_kvtom(vm_offset_t va);
+void            pmap_object_init(vm_object_t object);
+void            pmap_object_free(vm_object_t object);
+
 
 #endif /* _KERNEL */
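
Summarizing the MI interface changes in this header: pmap_enter() and pmap_change_wiring() gain a vm_map_entry_t argument, pmap_destroy() is gone while pmap_drop() is added alongside pmap_reference(), and the new per-object hooks pmap_object_init()/pmap_object_free() are exported. A hedged caller-side sketch; the reference/drop pairing is an assumption from the prototypes alone, and entry may be NULL, as the vm_fault.c hunks below show:

    static void
    example_map_one_page(vm_map_t map, vm_offset_t va, vm_page_t m,
                         vm_prot_t prot, vm_map_entry_t entry)
    {
            pmap_t pmap = map->pmap;

            pmap_reference(pmap);                        /* hold the pmap            */
            pmap_enter(pmap, va, m, prot, FALSE, entry); /* entry may be NULL        */
            pmap_change_wiring(pmap, va, TRUE, entry);   /* same entry passed along  */
            pmap_drop(pmap);                             /* rather than pmap_destroy */
    }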
 
index 86d0807..6fcc5de 100644 (file)
@@ -511,7 +511,7 @@ quick:
         * Enter the page into the pmap and do pmap-related adjustments.
         */
        vm_page_flag_set(fs.m, PG_REFERENCED);
-       pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);
+       pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired, fs.entry);
        mycpu->gd_cnt.v_vm_faults++;
        if (curthread->td_lwp)
                ++curthread->td_lwp->lwp_ru.ru_minflt;
@@ -770,7 +770,7 @@ RetryFault:
         */
        vm_page_flag_set(fs.m, PG_REFERENCED);
 #if 0
-       pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);
+       pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired, NULL);
        mycpu->gd_cnt.v_vm_faults++;
        if (curthread->td_lwp)
                ++curthread->td_lwp->lwp_ru.ru_minflt;
@@ -1830,7 +1830,7 @@ vm_fault_wire(vm_map_t map, vm_map_entry_t entry, boolean_t user_wire)
                                va -= PAGE_SIZE;
                                if ((pa = pmap_extract(pmap, va)) == 0)
                                        continue;
-                               pmap_change_wiring(pmap, va, FALSE);
+                               pmap_change_wiring(pmap, va, FALSE, entry);
                                if (!fictitious) {
                                        m = PHYS_TO_VM_PAGE(pa);
                                        vm_page_busy_wait(m, FALSE, "vmwrpg");
@@ -1880,7 +1880,7 @@ vm_fault_unwire(vm_map_t map, vm_map_entry_t entry)
        for (va = start; va < end; va += PAGE_SIZE) {
                pa = pmap_extract(pmap, va);
                if (pa != 0) {
-                       pmap_change_wiring(pmap, va, FALSE);
+                       pmap_change_wiring(pmap, va, FALSE, entry);
                        if (!fictitious) {
                                m = PHYS_TO_VM_PAGE(pa);
                                vm_page_busy_wait(m, FALSE, "vmwupg");
@@ -1901,6 +1901,8 @@ vm_fault_unwire(vm_map_t map, vm_map_entry_t entry)
  * entry corresponding to a main map entry that is wired down).
  *
  * No other requirements.
+ *
+ * XXX do segment optimization
  */
 void
 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
@@ -1968,7 +1970,7 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
                 */
 
                vm_page_flag_clear(dst_m, PG_ZERO);
-               pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE);
+               pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE, dst_entry);
 
                /*
                 * Mark it no longer busy, and put it on the active list.
@@ -2427,7 +2429,7 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
                         */
                        if (pprot & VM_PROT_WRITE)
                                vm_set_nosync(m, entry);
-                       pmap_enter(pmap, addr, m, pprot, 0);
+                       pmap_enter(pmap, addr, m, pprot, 0, entry);
                        mycpu->gd_cnt.v_vm_faults++;
                        if (curthread->td_lwp)
                                ++curthread->td_lwp->lwp_ru.ru_minflt;
@@ -2464,7 +2466,7 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
                        }
                        if (pprot & VM_PROT_WRITE)
                                vm_set_nosync(m, entry);
-                       pmap_enter(pmap, addr, m, pprot, 0);
+                       pmap_enter(pmap, addr, m, pprot, 0, entry);
                        mycpu->gd_cnt.v_vm_faults++;
                        if (curthread->td_lwp)
                                ++curthread->td_lwp->lwp_ru.ru_minflt;
@@ -2599,7 +2601,7 @@ vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
                                        swap_pager_unswapped(m);
                                }
                        }
-                       pmap_enter(pmap, addr, m, prot, 0);
+                       pmap_enter(pmap, addr, m, prot, 0, entry);
                        mycpu->gd_cnt.v_vm_faults++;
                        if (curthread->td_lwp)
                                ++curthread->td_lwp->lwp_ru.ru_minflt;
index e0cbf5e..c822fb8 100644 (file)
@@ -430,6 +430,7 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
        object->swblock_count = 0;
        RB_INIT(&object->swblock_root);
        vm_object_lock_init(object);
+       pmap_object_init(object);
 
        vm_object_hold(object);
        lwkt_gettoken(&vmobj_token);
@@ -929,6 +930,11 @@ vm_object_terminate(vm_object_t object)
                      "ref_count=%d", object->ref_count);
        }
 
+       /*
+        * Cleanup any shared pmaps associated with this object.
+        */
+       pmap_object_free(object);
+
        /*
         * Now free any remaining pages. For internal objects, this also
         * removes them from paging queues. Don't free wired pages, just
index 108180e..bf3b3dc 100644 (file)
 #ifndef _SYS_THREAD_H_
 #include <sys/thread.h>
 #endif
+#ifndef _MACHINE_PMAP_H_
+#include <machine/pmap.h>
+#endif
+#ifndef _MACHINE_ATOMIC_H_
 #include <machine/atomic.h>
+#endif
 #ifndef _VM_VM_H_
 #include <vm/vm.h>
 #endif
@@ -186,6 +191,7 @@ struct vm_object {
        RB_HEAD(swblock_rb_tree, swblock) swblock_root;
        int     swblock_count;
        struct  lwkt_token      token;
+       struct md_object        md;     /* machine specific (typ pmap) */
 };
 
 /*
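
With <machine/pmap.h> pulled in ahead of the structure definition, every platform now supplies a struct md_object and struct vm_object carries one as its new md member. The pc32 and vkernel definitions earlier in this diff are empty; a platform that actually implements the segment optimization is expected to hang its shared pmap off this field. The sketch below is illustration only; the member name is an assumption not shown anywhere in this diff:

    /* Hypothetical MD definition on a platform using the optimization. */
    struct md_object {
            struct pmap *pmap;      /* shared page tables backing this object */
    };

    /* MD code reaches that pmap through the md member embedded in the object. */
    static __inline struct pmap *
    vm_object_md_pmap(vm_object_t object)
    {
            return (object->md.pmap);
    }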