kernel - VM rework part 12 - Core pmap work, stabilize & optimize
author Matthew Dillon <dillon@apollo.backplane.com>
Sun, 19 May 2019 16:53:12 +0000 (09:53 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Mon, 20 May 2019 19:39:25 +0000 (12:39 -0700)
* Add tracking for the number of PTEs mapped writeable in md_page.
  Change how PG_WRITEABLE and PG_MAPPED are cleared in the vm_page
  to avoid clear/set races.  This problem occurs because we would
  otherwise have tried to clear the bits without hard-busying the
  page.  This allows the bits to be set with only an atomic op.

  Procedures which test these bits universally do so while holding
  the page hard-busied, and now call pmap_mapped_sync() beforehand
  to properly synchronize the bits.
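
  As a minimal sketch (condensed from the vm_page.c hunks below), a
  caller that needs the flags synchronized does so while holding the
  page hard-busied:

        if (m->flags & (PG_MAPPED | PG_WRITEABLE))
                pmap_mapped_sync(m);    /* clears flags if pmap_count is 0 */
        KKASSERT((m->flags & PG_MAPPED) == 0);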

* Fix bugs related to various counters: pm_stats.resident_count,
  wiring counts, vm_page->md.writeable_count, and
  vm_page->md.pmap_count.

* Fix bugs related to synchronizing removed PTEs with the vm_page.
  Fix one case where we were improperly updating (m)'s state based
  on a lost race against a pte swap-to-0 (pulling the pte).

* Fix a bug related to the page soft-busying code when the
  m->object/m->pindex race is lost.

* Implement a heuristic version of vm_page_activate(),
  vm_page_soft_activate(), which just updates act_count unlocked if
  the page is already in the PQ_ACTIVE queue or is fictitious.
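
  As a sketch, the new helper (vm_page_soft_activate() in the
  vm_page.c hunk below) reduces to:

        if (m->queue - m->pc == PQ_ACTIVE || (m->flags & PG_FICTITIOUS)) {
                if (m->act_count < ACT_INIT)
                        m->act_count = ACT_INIT;
        } else {
                vm_page_activate(m);
        }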

* Allow races against the backing scan for pmap_remove_all() and
  pmap_page_protect(VM_PROT_READ).  Callers of these routines for
  these cases expect full synchronization of the page dirty state.
  We can identify when a page has not been fully cleaned out by
  checking vm_page->md.pmap_count and vm_page->md.writeable_count.
  In the rare situation where this happens, simply retry.
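
  Condensed from the pmap_remove_all() hunk below, the retry pattern
  is roughly:

        retry = ticks + hz * 60;
again:
        /* backing scan removes PTEs, dropping the per-page counts */
        if (m->md.pmap_count || m->md.writeable_count) {
                tsleep(&m->md.pmap_count, 0, "pgunm", 1);
                if (retry - ticks > 0)
                        goto again;
                panic("pmap_remove_all: cannot return pmap_count to 0");
        }
        vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);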

* Assert that the PTE pindex is properly interlocked in pmap_enter().
  We still allow PTEs to be pulled by other routines without the
  interlock, but multiple pmap_enter()s of the same page will be
  interlocked.

* Assert additional wiring count failure cases.

* (UNTESTED) Flag DEVICE pages (dev_pager_getfake()) as being
  PG_UNMANAGED.  This essentially prevents all the various
  reference counters (e.g. vm_page->md.pmap_count and
  vm_page->md.writeable_count), PG_M, PG_A, etc. from being
  updated.

  These vm_pages aren't tracked in the pmap at all because there
  is no way to find them: they are 'fake', so without a pv_entry
  we can't track them.  Instead we simply rely on the vm_map_backing
  scan to manipulate the PTEs.

* Optimize the new vm_map_entry_shadow() to use a shared object
  token instead of an exclusive one.  OBJ_ONEMAPPING will be cleared
  with the shared token.

* Optimize single-threaded access to pmaps to avoid pmap_inval_*()
  complexities.

* Use __read_mostly for more globals.

* Optimize pmap_testbit(), pmap_clearbit(), pmap_page_protect().
  Pre-check vm_page->md.writeable_count and vm_page->md.pmap_count
  for an easy degenerate return before doing any real work.

* Optimize pmap_inval_smp() and pmap_inval_smp_cmpset() for the
  single-threaded pmap case, when called on the same CPU the pmap
  is associated with.  This allows us to use simple atomics and
  cpu_*() instructions and avoid the complexities of the
  pmap_inval_*() infrastructure.
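
  A condensed sketch of the single-cpu shortcut (see the pmap_inval.c
  hunks below, with the multi-page and force-nonopt handling elided);
  PMAP_MULTI is set by pmap_maybethreaded() once the process becomes
  threaded:

        if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
                if (pmap->pm_flags & PMAP_MULTI)
                        pmap_inval_init(pmap);
                opte = atomic_swap_long(ptep, npte);
                if (va == (vm_offset_t)-1)
                        cpu_invltlb();
                else
                        cpu_invlpg((void *)va);
                if (pmap->pm_flags & PMAP_MULTI)
                        pmap_inval_done(pmap);
                return opte;
        }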

* Randomize the page queue used in bio_page_alloc().  This does not
  appear to hurt performance (e.g. heavy tmpfs use) on large many-core
  NUMA machines, and it makes vm_page_alloc()'s job easier.

  This change might have a downside for temporary files, but for more
  long-lasting files there's no point allocating pages localized to a
  particular cpu.

* Optimize vm_page_alloc().

  (1) Refactor the _vm_page_list_find*() routines to avoid re-scanning
      the same array indices over and over again when trying to find
      a page.

  (2) Add a heuristic, vpq.lastq, for each queue, which we set if a
      _vm_page_list_find*() operation had to go far afield to find its
      page.  Subsequent finds will skip to the far-afield position until
      the current CPU's queues have pages again.

  (3) Reduce PQ_L2_SIZE from an extravagant 2048 entries per queue down
      to 1024.  The original 2048 was meant to provide 8-way
      set-associativity for 256 cores but wound up reducing performance
      due to longer index iterations (see the sketch after this list).
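
  The per-queue set associativity is now derived at boot from the
  fitted cpu count (condensed from the vm_page.c startup hunk below),
  with the mask floored at 7 so associativity never drops below 8-way:

        mask = PQ_L2_SIZE / ncpus_fit - 1;
        if (mask < 7)           /* minimum 8-way with 256 cpu threads */
                mask = 7;
        set_assoc_mask = mask;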

* Refactor the vm_page_hash[] array.  This array is used to shortcut
  vm_object locks and locate VM pages more quickly, without locks.
  The new code limits the size of the array to something more reasonable,
  implements a 4-way set-associative replacement policy using 'ticks',
  and rewrites the hashing math.
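
  The rewritten hash math (from the vm_page.c hunk below) indexes a
  power-of-2 sized table and masks with ~3 so each bucket forms a
  4-entry set; on replacement the entry with the oldest 'ticks' stamp
  in the set is evicted:

        hi = ((object->pg_color << 8) ^ (uintptr_t)object) + (pindex << 2);
        hi &= vm_page_hash_size - 1;
        hi &= ~3;
        return (&vm_page_hash[hi]);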

* Effectively remove pmap_object_init_pt() for now.  In current tests
  it does not actually improve performance, probably because it may
  map pages that are not actually used by the program.

* Remove vm_map_backing->refs.  This field is no longer used.

* Remove more of the old now-stale code related to use of pv_entry's
  for terminal PTEs.

* Remove more of the old shared page-table-page code.  This worked but
  could never be fully validated and was prone to bugs.  So remove it.
  In the future we will likely use larger 2MB and 1GB pages anyway.

* Remove pmap_softwait()/pmap_softhold()/pmap_softdone().

* Remove more #if 0'd code.

18 files changed:
sys/kern/kern_fork.c
sys/kern/kern_synch.c
sys/kern/vfs_bio.c
sys/platform/pc64/include/pmap.h
sys/platform/pc64/x86_64/pmap.c
sys/platform/pc64/x86_64/pmap_inval.c
sys/platform/vkernel64/platform/pmap.c
sys/vm/device_pager.c
sys/vm/pmap.h
sys/vm/swap_pager.c
sys/vm/vm_fault.c
sys/vm/vm_map.c
sys/vm/vm_map.h
sys/vm/vm_object.c
sys/vm/vm_page.c
sys/vm/vm_page.h
sys/vm/vm_page2.h
sys/vm/vm_pageout.c

index 01b2777..70bc465 100644 (file)
@@ -729,7 +729,7 @@ done:
 
 static struct lwp *
 lwp_fork(struct lwp *origlp, struct proc *destproc, int flags,
-    const cpumask_t *mask)
+        const cpumask_t *mask)
 {
        globaldata_t gd = mycpu;
        struct lwp *lp;
@@ -827,6 +827,7 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags,
         * This flag is set and never cleared.  It means that the process
         * was threaded at some point.  Used to improve exit performance.
         */
+       pmap_maybethreaded(&destproc->p_vmspace->vm_pmap);
        destproc->p_flags |= P_MAYBETHREADED;
 
        return (lp);
index f8c9be8..cd510b2 100644 (file)
@@ -80,11 +80,11 @@ SYSINIT(sched_dyninit, SI_BOOT1_DYNALLOC, SI_ORDER_FIRST, sched_dyninit, NULL);
 
 int    lbolt;
 void   *lbolt_syncer;
-int    ncpus;
-int    ncpus_fit, ncpus_fit_mask;              /* note: mask not cpumask_t */
-int    safepri;
-int    tsleep_now_works;
 int    tsleep_crypto_dump = 0;
+__read_mostly int ncpus;
+__read_mostly int ncpus_fit, ncpus_fit_mask;   /* note: mask not cpumask_t */
+__read_mostly int safepri;
+__read_mostly int tsleep_now_works;
 
 MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");
 
index e5de1e8..bf01c67 100644 (file)
@@ -4279,6 +4279,11 @@ bio_page_alloc(struct buf *bp, vm_object_t obj, vm_pindex_t pg, int deficit)
 
        ASSERT_LWKT_TOKEN_HELD(vm_object_token(obj));
 
+       /*
+        * Randomize
+        */
+       vmflags |= VM_ALLOC_CPU(obj->pg_color % ncpus);
+
        /*
         * Try a normal allocation first.
         */
index 58c75d0..1153586 100644 (file)
@@ -227,6 +227,7 @@ struct vmspace;
  */
 struct md_page {
        long pmap_count;
+       long writeable_count;
 };
 
 /*
@@ -325,6 +326,7 @@ struct pmap {
 #define PMAP_EMULATE_AD_BITS   0x00000002
 #define PMAP_HVM               0x00000004
 #define PMAP_SEGSHARED         0x00000008      /* segment shared opt */
+#define PMAP_MULTI             0x00000010      /* multi-threaded use */
 
 #define pmap_resident_count(pmap) ((pmap)->pm_stats.resident_count)
 #define pmap_resident_tlnw_count(pmap) ((pmap)->pm_stats.resident_count - \
index 26789e7..4caf4e6 100644 (file)
@@ -297,9 +297,6 @@ SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
 static int pmap_yield_count = 64;
 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
     &pmap_yield_count, 0, "Yield during init_pt/release");
-static int pmap_mmu_optimize = 0;
-SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
-    &pmap_mmu_optimize, 0, "Share page table pages when possible");
 int pmap_fast_kernel_cpusync = 0;
 SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW,
     &pmap_fast_kernel_cpusync, 0, "Share page table pages when possible");
@@ -431,41 +428,6 @@ pmap_page_stats_deleting(vm_page_t m)
        }
 }
 
-/*
- * This is an ineligent crowbar to prevent heavily threaded programs
- * from creating long live-locks in the pmap code when pmap_mmu_optimize
- * is enabled.  Without it a pmap-local page table page can wind up being
- * constantly created and destroyed (without injury, but also without
- * progress) as the optimization tries to switch to the object's shared page
- * table page.
- */
-static __inline void
-pmap_softwait(pmap_t pmap)
-{
-       while (pmap->pm_softhold) {
-               tsleep_interlock(&pmap->pm_softhold, 0);
-               if (pmap->pm_softhold)
-                       tsleep(&pmap->pm_softhold, PINTERLOCKED, "mmopt", 0);
-       }
-}
-
-static __inline void
-pmap_softhold(pmap_t pmap)
-{
-       while (atomic_swap_int(&pmap->pm_softhold, 1) == 1) {
-               tsleep_interlock(&pmap->pm_softhold, 0);
-               if (atomic_swap_int(&pmap->pm_softhold, 1) == 1)
-                       tsleep(&pmap->pm_softhold, PINTERLOCKED, "mmopt", 0);
-       }
-}
-
-static __inline void
-pmap_softdone(pmap_t pmap)
-{
-       atomic_swap_int(&pmap->pm_softhold, 0);
-       wakeup(&pmap->pm_softhold);
-}
-
 /*
  * Move the kernel virtual free pointer to the next
  * 2MB.  This is used to help improve performance
@@ -1348,6 +1310,7 @@ pmap_init(void)
 
                m = &vm_page_array[i];
                m->md.pmap_count = 0;
+               m->md.writeable_count = 0;
        }
 
        /*
@@ -1619,6 +1582,7 @@ pmap_page_init(struct vm_page *m)
 {
        vm_page_init(m);
        m->md.pmap_count = 0;
+       m->md.writeable_count = 0;
 }
 
 /***************************************************
@@ -2628,6 +2592,9 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
                else
                        ptep_iso  = NULL;
                if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
+                       KKASSERT(0);
+#if 0
+                       /* REMOVED replaces shared page table page */
                        pt_entry_t pte;
 
                        if (ispt == 0) {
@@ -2645,6 +2612,7 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
                                panic("pmap_allocpte: shared pgtable "
                                      "pg bad wirecount");
                        }
+#endif
                } else {
                        pt_entry_t pte;
 
@@ -3087,6 +3055,11 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
                pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0);
                KKASSERT(pv->pv_m == p);        /* debugging */
        } else {
+               /*
+                * XXX REMOVE ME
+                */
+               KKASSERT(0);
+
                /*
                 * Remove a PTE from the PT page.  The PV might exist even if
                 * the PTE is not managed, in whichcase pv->pv_m should be
@@ -3251,8 +3224,11 @@ pmap_remove_pv_page(pv_entry_t pv)
                vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                KKASSERT(m->md.pmap_count == 0);
        } else {
-               atomic_add_long(&m->md.pmap_count, -1);
-               if (m->md.pmap_count == 0)
+               /*
+                * Used only for page table pages, so safe to clear on
+                * the 1->0 transition.
+                */
+               if (atomic_fetchadd_long(&m->md.pmap_count, -1) == 1)
                        vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
        }
        /* pmap_page_stats_deleting(m); */
@@ -3411,6 +3387,24 @@ pmap_reference(pmap_t pmap)
                atomic_add_int(&pmap->pm_count, 1);
 }
 
+void
+pmap_maybethreaded(pmap_t pmap)
+{
+       atomic_set_int(&pmap->pm_flags, PMAP_MULTI);
+}
+
+/*
+ * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE
+ * flags if able.
+ */
+int
+pmap_mapped_sync(vm_page_t m)
+{
+       if (m->md.pmap_count == 0)
+               vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+       return (m->flags);
+}
+
 /***************************************************
  * page management routines.
  ***************************************************/
@@ -3939,8 +3933,10 @@ _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL)
                 * and do it normally.  Drop two refs and the lock all in
                 * one go.
                 */
-               if (pvp)
-                       vm_page_unwire_quick(pvp->pv_m);
+               if (pvp) {
+                       if (vm_page_unwire_quick(pvp->pv_m))
+                               panic("_pv_free: bad wirecount on pvp");
+               }
                if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) {
 #ifdef PMAP_DEBUG2
                        if (pmap_enter_debug > 0) {
@@ -4537,7 +4533,10 @@ kernel_skip:
                        }
                        if (pd_pv) {
                                pv_lock(pd_pv);
-                               vm_page_unwire_quick(pd_pv->pv_m);
+                               if (vm_page_unwire_quick(pd_pv->pv_m)) {
+                                       panic("pmap_scan_callback: "
+                                             "bad wirecount on pd_pv");
+                               }
                                if (pd_pv->pv_pmap == NULL) {
                                        va_next = sva;          /* retry */
                                        break;
@@ -4638,17 +4637,26 @@ pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
                        vm_page_dirty(p);
                if (pte & pmap->pmap_bits[PG_A_IDX])
                        vm_page_flag_set(p, PG_REFERENCED);
+
+               /*
+                * NOTE: p is not hard-busied so it is not safe to
+                *       clear PG_MAPPED and PG_WRITEABLE on the 1->0
+                *       transition against them being set in
+                *       pmap_enter().
+                */
+               if (pte & pmap->pmap_bits[PG_RW_IDX])
+                       atomic_add_long(&p->md.writeable_count, -1);
                atomic_add_long(&p->md.pmap_count, -1);
-               if (p->md.pmap_count == 0)
-                       vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
+       }
+       if (pte & pmap->pmap_bits[PG_V_IDX]) {
+               atomic_add_long(&pmap->pm_stats.resident_count, -1);
+               if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m))
+                       panic("pmap_remove: insufficient wirecount");
        }
        if (pte & pmap->pmap_bits[PG_W_IDX])
                atomic_add_long(&pmap->pm_stats.wired_count, -1);
        if (pte & pmap->pmap_bits[PG_G_IDX])
                cpu_invlpg((void *)va);
-       atomic_add_long(&pmap->pm_stats.resident_count, -1);
-       if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m))
-               panic("pmap_remove: insufficient wirecount");
        pv_placemarker_wakeup(pmap, pte_placemark);
 }
 
@@ -4666,10 +4674,15 @@ static
 void
 pmap_remove_all(vm_page_t m)
 {
+       int retry;
+
        if (!pmap_initialized /* || (m->flags & PG_FICTITIOUS)*/)
                return;
        if (m->md.pmap_count == 0)
                return;
+
+       retry = ticks + hz * 60;
+again:
        PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
                if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0))
                        PMAP_PAGE_BACKING_RETRY;
@@ -4678,9 +4691,16 @@ pmap_remove_all(vm_page_t m)
                                vm_page_dirty(m);
                        if (ipte & ipmap->pmap_bits[PG_A_IDX])
                                vm_page_flag_set(m, PG_REFERENCED);
+
+                       /*
+                        * NOTE: m is not hard-busied so it is not safe to
+                        *       clear PG_MAPPED and PG_WRITEABLE on the 1->0
+                        *       transition against them being set in
+                        *       pmap_enter().
+                        */
+                       if (ipte & ipmap->pmap_bits[PG_RW_IDX])
+                               atomic_add_long(&m->md.writeable_count, -1);
                        atomic_add_long(&m->md.pmap_count, -1);
-                       if (m->md.pmap_count == 0)
-                               vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                }
 
                /*
@@ -4695,7 +4715,10 @@ pmap_remove_all(vm_page_t m)
                        spin_unlock_shared(&ipmap->pm_spin);
 
                        if (pt_pv) {
-                               vm_page_unwire_quick(pt_pv->pv_m);
+                               if (vm_page_unwire_quick(pt_pv->pv_m)) {
+                                       panic("pmap_remove_all: bad "
+                                             "wire_count on pt_pv");
+                               }
                                atomic_add_long(
                                        &ipmap->pm_stats.resident_count, -1);
                        }
@@ -4705,7 +4728,21 @@ pmap_remove_all(vm_page_t m)
                if (ipte & ipmap->pmap_bits[PG_G_IDX])
                        cpu_invlpg((void *)iva);
        } PMAP_PAGE_BACKING_DONE;
-       KKASSERT(m->md.pmap_count == 0);
+
+       /*
+        * pmap_count should be zero but it is possible to race a pmap_enter()
+        * replacement (see 'oldm').  Once it is zero it cannot become
+        * non-zero because the page is hard-busied.
+        */
+       if (m->md.pmap_count || m->md.writeable_count) {
+               tsleep(&m->md.pmap_count, 0, "pgunm", 1);
+               if (retry - ticks > 0)
+                       goto again;
+               panic("pmap_remove_all: cannot return pmap_count "
+                     "to 0 (%ld, %ld)",
+                     m->md.pmap_count, m->md.writeable_count);
+       }
+       vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
 }
 
 /*
@@ -4728,9 +4765,16 @@ pmap_remove_specific(pmap_t pmap_match, vm_page_t m)
                                vm_page_dirty(m);
                        if (ipte & ipmap->pmap_bits[PG_A_IDX])
                                vm_page_flag_set(m, PG_REFERENCED);
+
+                       /*
+                        * NOTE: m is not hard-busied so it is not safe to
+                        *       clear PG_MAPPED and PG_WRITEABLE on the 1->0
+                        *       transition against them being set in
+                        *       pmap_enter().
+                        */
+                       if (ipte & ipmap->pmap_bits[PG_RW_IDX])
+                               atomic_add_long(&m->md.writeable_count, -1);
                        atomic_add_long(&m->md.pmap_count, -1);
-                       if (m->md.pmap_count == 0)
-                               vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                }
 
                /*
@@ -4747,7 +4791,10 @@ pmap_remove_specific(pmap_t pmap_match, vm_page_t m)
                        if (pt_pv) {
                                atomic_add_long(
                                        &ipmap->pm_stats.resident_count, -1);
-                               vm_page_unwire_quick(pt_pv->pv_m);
+                               if (vm_page_unwire_quick(pt_pv->pv_m)) {
+                                       panic("pmap_remove_specific: bad "
+                                             "wire_count on pt_pv");
+                               }
                        }
                }
                if (ipte & ipmap->pmap_bits[PG_W_IDX])
@@ -4828,16 +4875,15 @@ again:
                        }
                }
                if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) {
-                       if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) {
-                               if (pbits & pmap->pmap_bits[PG_A_IDX]) {
-                                       m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
-                                       vm_page_flag_set(m, PG_REFERENCED);
-                               }
-                               if (pbits & pmap->pmap_bits[PG_M_IDX]) {
-                                       m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
-                                       vm_page_dirty(m);
-                               }
-                       }
+                       KKASSERT((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0);
+                       m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
+                       if (pbits & pmap->pmap_bits[PG_A_IDX])
+                               vm_page_flag_set(m, PG_REFERENCED);
+                       if (pbits & pmap->pmap_bits[PG_M_IDX])
+                               vm_page_dirty(m);
+                       if (pbits & pmap->pmap_bits[PG_RW_IDX])
+                               atomic_add_long(&m->md.writeable_count, -1);
+
                }
        }
        pv_placemarker_wakeup(pmap, pte_placemark);
@@ -4866,8 +4912,10 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
        pv_entry_t pte_pv;      /* page table entry */
        vm_pindex_t *pte_placemark;
        pt_entry_t *ptep;
+       pt_entry_t origpte;
        vm_paddr_t opa;
-       pt_entry_t origpte, newpte;
+       vm_page_t oldm;
+       pt_entry_t newpte;
        vm_paddr_t pa;
 
        if (pmap == NULL)
@@ -4922,7 +4970,6 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                ptep = vtopte(va);
                origpte = *ptep;
        } else {
-               pmap_softwait(pmap);
                pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark);
                KKASSERT(pte_pv == NULL);
                if (va >= VM_MAX_USER_ADDRESS) {
@@ -4965,11 +5012,60 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                goto done;
        }
 
+       /*
+        * Adjust page flags.  The page is soft-busied or hard-busied, we
+        * should be able to safely set PG_* flag bits even with the (shared)
+        * soft-busy.
+        *
+        * As a bit of a safety, bump pmap_count and set the PG_* bits
+        * before mapping the page.  If another part of the system does
+        * not properly hard-busy the page (against our soft-busy) in
+        * order to remove mappings it might not see the pte that we are
+        * about to add and thus will not be able to drop pmap_count to 0.
+        *
+        * NOTE! PG_MAPPED and PG_WRITEABLE can only be cleared when
+        *       the page is hard-busied AND pmap_count is 0.  This
+        *       interlocks our setting of the flags here.
+        */
+       /*vm_page_spin_lock(m);*/
+       if ((m->flags & PG_UNMANAGED) == 0) {
+               atomic_add_long(&m->md.pmap_count, 1);
+               if (newpte & pmap->pmap_bits[PG_RW_IDX])
+                       atomic_add_long(&m->md.writeable_count, 1);
+       }
+       if (newpte & pmap->pmap_bits[PG_RW_IDX]) {
+               if ((m->flags & (PG_MAPPED | PG_WRITEABLE)) == 0)
+                       vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
+       } else {
+               if ((m->flags & PG_MAPPED) == 0)
+                       vm_page_flag_set(m, PG_MAPPED);
+       }
+       /*vm_page_spin_unlock(m);*/
+       /*pmap_page_stats_adding(m);*/
+
+       /*
+        * A race can develop when replacing an existing mapping.  The new
+        * page has been busied and the pte is placemark-locked, but the
+        * old page is could be ripped out from under us at any time by
+        * a backing scan.
+        *
+        * The race is handled by having the backing scans check pmap_count
+        * writeable_count when doing operations that should ensure one
+        * becomes 0.
+        */
+       opa = origpte & PG_FRAME;
+       if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) {
+               oldm = PHYS_TO_VM_PAGE(opa);
+               KKASSERT(opa == oldm->phys_addr);
+               KKASSERT(entry != NULL);
+       } else {
+               oldm = NULL;
+       }
+
        /*
         * Swap the new and old PTEs and perform any necessary SMP
         * synchronization.
         */
-       opa = origpte & PG_FRAME;
        if ((prot & VM_PROT_NOSYNC) || (opa == 0 && pt_pv != NULL)) {
                /*
                 * Explicitly permitted to avoid pmap cpu mask synchronization
@@ -5009,8 +5105,6 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
         * Retain the same wiring count due to replacing an existing page,
         * or bump the wiring count for a new page.
         */
-       if ((m->flags & PG_UNMANAGED) == 0)
-               atomic_add_long(&m->md.pmap_count, 1);
        if (pt_pv && opa == 0) {
                vm_page_wire_quick(pt_pv->pv_m);
                atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1);
@@ -5018,45 +5112,34 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
        if (wired && (origpte & pmap->pmap_bits[PG_W_IDX]) == 0)
                atomic_add_long(&pmap->pm_stats.wired_count, 1);
 
-       /*
-        * Adjust page flags.  The page is soft-busied or hard-busied, we
-        * should be able to safely set PG_* flag bits even with the (shared)
-        * soft-busy.
-        */
-       /*vm_page_spin_lock(m);*/
-       if ((m->flags & PG_MAPPED) == 0)
-               vm_page_flag_set(m, PG_MAPPED);
-       if ((newpte & pmap->pmap_bits[PG_RW_IDX]) &&
-           (m->flags & PG_WRITEABLE) == 0) {
-               vm_page_flag_set(m, PG_WRITEABLE);
-       }
-       /*vm_page_spin_unlock(m);*/
-       /*pmap_page_stats_adding(m);*/
-
        /*
         * Account for the removal of the old page.  pmap and pt_pv stats
         * have already been fully adjusted for both.
         *
-        * If managed we must update the pmap_count and reflect any residual
-        * [M]odified bit back to the page.
+        * WARNING! oldm is not soft or hard-busied.  The pte at worst can
+        *          only be removed out from under us since we hold the
+        *          placemarker.  So if it is still there, it must not have
+        *          changed.
         */
        if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) {
-               vm_page_t oldm;
-
-               oldm = PHYS_TO_VM_PAGE(opa);
-               if (origpte & pmap->pmap_bits[PG_M_IDX]) {
-                       vm_page_spin_lock(oldm);
+               KKASSERT(oldm == PHYS_TO_VM_PAGE(opa));
+               /* XXX PG_DEVICE_IDX pages */
+               if (origpte & pmap->pmap_bits[PG_M_IDX])
                        vm_page_dirty(oldm);
-                       vm_page_spin_unlock(oldm);
-               }
-               if (atomic_fetchadd_long(&oldm->md.pmap_count, -1) == 0) {
-                       vm_page_flag_clear(oldm, PG_MAPPED | PG_WRITEABLE);
-               }
+               if (origpte & pmap->pmap_bits[PG_A_IDX])
+                       vm_page_flag_set(oldm, PG_REFERENCED);
+
+               /*
+                * NOTE: oldm is not hard-busied so it is not safe to
+                *       clear PG_MAPPED and PG_WRITEABLE on the 1->0
+                *       transition against them being set in
+                *       pmap_enter().
+                */
+               if (origpte & pmap->pmap_bits[PG_RW_IDX])
+                       atomic_add_long(&oldm->md.writeable_count, -1);
+               atomic_add_long(&oldm->md.pmap_count, -1);
        }
 
-       /*
-        * Cleanup
-        */
 done:
        KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 ||
                 (m->flags & PG_MAPPED));
@@ -5092,6 +5175,7 @@ pmap_kenter_temporary(vm_paddr_t pa, long i)
        return ((void *)crashdumpmap);
 }
 
+#if 0
 #define MAX_INIT_PT (96)
 
 /*
@@ -5100,11 +5184,13 @@ pmap_kenter_temporary(vm_paddr_t pa, long i)
  * immediately after an mmap.
  */
 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
+#endif
 
 void
 pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry,
                    vm_offset_t addr, vm_size_t size, int limit)
 {
+#if 0
        vm_prot_t prot = entry->protection;
        vm_object_t object = entry->ba.object;
        vm_pindex_t pindex = atop(entry->ba.offset + (addr - entry->ba.start));
@@ -5182,8 +5268,11 @@ pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry,
        vm_page_rb_tree_RB_SCAN_NOLK(&object->rb_memq, rb_vm_page_scancmp,
                                     pmap_object_init_pt_callback, &info);
        vm_object_drop(object);
+#endif
 }
 
+#if 0
+
 static
 int
 pmap_object_init_pt_callback(vm_page_t p, void *data)
@@ -5246,6 +5335,8 @@ again:
        return(0);
 }
 
+#endif
+
 /*
  * Return TRUE if the pmap is in shape to trivially pre-fault the specified
  * address.
@@ -5253,6 +5344,9 @@ again:
  * Returns FALSE if it would be non-trivial or if a pte is already loaded
  * into the slot.
  *
+ * The address must reside within a vm_map mapped range to ensure that the
+ * page table doesn't get ripped out from under us.
+ *
  * XXX This is safe only because page table pages are not freed.
  */
 int
@@ -5444,8 +5538,17 @@ pmap_testbit(vm_page_t m, int bit)
 
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return FALSE;
-       if (m->md.pmap_count == 0)
+       /*
+        * Nothing to do if all the mappings are already read-only.
+        * The page's [M]odify bits have already been synchronized
+        * to the vm_page_t and cleaned out.
+        */
+       if (bit == PG_M_IDX && m->md.writeable_count == 0)
                return FALSE;
+
+       /*
+        * Iterate the mapping
+        */
        PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
                if (ipte & ipmap->pmap_bits[bit]) {
                        res = TRUE;
@@ -5483,12 +5586,23 @@ void
 pmap_clearbit(vm_page_t m, int bit_index)
 {
        pt_entry_t npte;
+       int retry;
 
+       /*
+        * XXX It might make sense to allow PG_FICTITIOUS + PG_DEVICE
+        *     pages through to the backing scan, but atm devices do
+        *     not care about PG_WRITEABLE;
+        */
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
                if (bit_index == PG_RW_IDX)
                        vm_page_flag_clear(m, PG_WRITEABLE);
                return;
        }
+
+       /*
+        * Being asked to clear other random bits, we don't track them
+        * so we have to iterate.
+        */
        if (bit_index != PG_RW_IDX) {
                PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
                        if (ipte & ipmap->pmap_bits[bit_index]) {
@@ -5499,11 +5613,29 @@ pmap_clearbit(vm_page_t m, int bit_index)
                return;
        }
 
+       /*
+        * Being asked to clear the RW bit.
+        *
+        * Nothing to do if all the mappings are already read-only
+        */
+       if (m->md.writeable_count == 0)
+               return;
+
+       /*
+        * Iterate the mappings and check.
+        */
+       retry = ticks + hz * 60;
+again:
        /*
         * Clear PG_RW. This also clears PG_M and marks the page dirty if
         * PG_M was set.
+        *
+        * Since the caller holds the page hard-busied we can safely clear
+        * PG_WRITEABLE, and callers expect us to for the PG_RW_IDX path.
         */
        PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
+               if ((ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) == 0)
+                       continue;
                if ((ipte & ipmap->pmap_bits[PG_RW_IDX]) == 0)
                        continue;
                npte = ipte & ~(ipmap->pmap_bits[PG_RW_IDX] |
@@ -5512,7 +5644,28 @@ pmap_clearbit(vm_page_t m, int bit_index)
                        PMAP_PAGE_BACKING_RETRY;
                if (ipte & ipmap->pmap_bits[PG_M_IDX])
                        vm_page_dirty(m);
+
+               /*
+                * NOTE: m is not hard-busied so it is not safe to
+                *       clear PG_WRITEABLE on the 1->0 transition
+                *       against it being set in pmap_enter().
+                */
+               atomic_add_long(&m->md.writeable_count, -1);
        } PMAP_PAGE_BACKING_DONE;
+
+       /*
+        * writeable_count should be zero but it is possible to race
+        * a pmap_enter() replacement (see 'oldm').  Once it is zero
+        * it cannot become non-zero because the page is hard-busied.
+        */
+       if (m->md.writeable_count != 0) {
+               tsleep(&m->md.writeable_count, 0, "pgwab", 1);
+               if (retry - ticks > 0)
+                       goto again;
+               panic("pmap_remove_all: cannot return writeable_count "
+                     "to 0 (%ld)",
+                     m->md.writeable_count);
+       }
        vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
@@ -6099,7 +6252,10 @@ pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info,
                                        info->stop = 1;
                                if (pt_pv) {
                                        pv_lock(pt_pv);
-                                       vm_page_unwire_quick(pt_pv->pv_m);
+                                       if (vm_page_unwire_quick(pt_pv->pv_m)) {
+                                               panic("pmap_pgscan: bad wire_"
+                                                     "count on pt_pv");
+                                       }
                                }
                        } else {
                                vm_page_wakeup(m);
index 65e4ba7..bc5377a 100644 (file)
@@ -275,7 +275,6 @@ pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
         */
        if (pmap == NULL)
                pmap = &kernel_pmap;
-       pmap_inval_init(pmap);
 
        /*
         * Shortcut single-cpu case if possible.
@@ -286,6 +285,8 @@ pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
                 * Convert to invltlb if there are too many pages to
                 * invlpg on.
                 */
+               if (pmap->pm_flags & PMAP_MULTI)
+                       pmap_inval_init(pmap);
                if (npgs == 1) {
                        if (ptep)
                                opte = atomic_swap_long(ptep, npte);
@@ -313,7 +314,8 @@ pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
                                --npgs;
                        }
                }
-               pmap_inval_done(pmap);
+               if (pmap->pm_flags & PMAP_MULTI)
+                       pmap_inval_done(pmap);
 
                return opte;
        }
@@ -326,6 +328,7 @@ pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
         * tsc_target is our watchdog timeout that will attempt to recover
         * from a lost IPI.  Set to 1/16 second for now.
         */
+       pmap_inval_init(pmap);
        info = &invinfo[cpu];
 
        /*
@@ -446,22 +449,25 @@ pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
         */
        if (pmap == NULL)
                pmap = &kernel_pmap;
-       pmap_inval_init(pmap);
 
        /*
         * Shortcut single-cpu case if possible.
         */
        if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
            pmap_inval_force_nonopt == 0) {
+               if (pmap->pm_flags & PMAP_MULTI)
+                       pmap_inval_init(pmap);
                if (atomic_cmpset_long(ptep, opte, npte)) {
                        if (va == (vm_offset_t)-1)
                                cpu_invltlb();
                        else
                                cpu_invlpg((void *)va);
-                       pmap_inval_done(pmap);
+                       if (pmap->pm_flags & PMAP_MULTI)
+                               pmap_inval_done(pmap);
                        return 1;
                } else {
-                       pmap_inval_done(pmap);
+                       if (pmap->pm_flags & PMAP_MULTI)
+                               pmap_inval_done(pmap);
                        return 0;
                }
        }
@@ -471,6 +477,7 @@ pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
         * we setup our command.  A preemption might execute its own
         * pmap_inval*() command and create confusion below.
         */
+       pmap_inval_init(pmap);
        info = &invinfo[cpu];
 
        /*
index de8a079..77cf356 100644 (file)
@@ -3459,3 +3459,22 @@ pmap_pgscan(struct pmap_pgscan_info *pginfo)
        }
        vm_object_drop(pmap->pm_pteobj);
 }
+
+void
+pmap_maybethreaded(pmap_t pmap)
+{
+       /* nop */
+}
+
+/*
+ * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE
+ * flags if able.
+ *
+ * vkernel code is using the old pmap style so the flags should already
+ * be properly set.
+ */
+int
+pmap_mapped_sync(vm_page_t m)
+{
+       return (m->flags);
+}
index 85bc610..323f06e 100644 (file)
@@ -280,7 +280,7 @@ dev_pager_getfake(vm_paddr_t paddr, int pat_mode)
 
        pmap_page_init(m);
 
-       m->flags = PG_FICTITIOUS;
+       m->flags = PG_FICTITIOUS | PG_UNMANAGED;
        m->valid = VM_PAGE_BITS_ALL;
        m->dirty = 0;
        m->queue = PQ_NONE;
index 6277b2c..d63d45f 100644 (file)
@@ -176,6 +176,8 @@ void                 pmap_copy_page (vm_paddr_t, vm_paddr_t);
 void            pmap_copy_page_frag (vm_paddr_t, vm_paddr_t, size_t bytes);
 void            pmap_enter (pmap_t, vm_offset_t, struct vm_page *,
                        vm_prot_t, boolean_t, struct vm_map_entry *);
+void            pmap_maybethreaded(pmap_t);
+int             pmap_mapped_sync(vm_page_t m);
 vm_page_t       pmap_fault_page_quick(pmap_t, vm_offset_t, vm_prot_t, int *);
 vm_paddr_t      pmap_extract (pmap_t pmap, vm_offset_t va, void **handlep);
 void            pmap_extract_done (void *handle);
index d4c0089..5204e8a 100644 (file)
@@ -1869,10 +1869,12 @@ swp_pager_async_iodone(struct bio *bio)
                                iscsi_crc32(bp->b_data, bp->b_bcount));
                        for (i = 0; i < bp->b_xio.xio_npages; ++i) {
                                vm_page_t m = bp->b_xio.xio_pages[i];
-                               if (m->flags & PG_WRITEABLE)
+                               if ((m->flags & PG_WRITEABLE) &&
+                                   (pmap_mapped_sync(m) & PG_WRITEABLE)) {
                                        kprintf("SWAPOUT: "
                                                "%d/%d %p writable\n",
                                                i, bp->b_xio.xio_npages, m);
+                               }
                        }
                }
        }
index 4d4c527..f7801c1 100644 (file)
@@ -952,8 +952,11 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
         * Even though we are only soft-busied we can still move pages
         * around in the normal queue(s).  The soft-busy prevents the
         * page from being removed from the object, etc (normal operation).
+        *
+        * However, in this fast path it is excessively important to avoid
+        * any hard locks, so we use a special passive version of activate.
         */
-       vm_page_activate(m);
+       vm_page_soft_activate(m);
        fs->m = m;
        fs->msoftonly = 1;
 #ifdef VM_FAULT_QUICK_DEBUG
index 8cd61cc..c478b62 100644 (file)
@@ -747,20 +747,10 @@ vm_map_entry_shadow(vm_map_entry_t entry)
         *
         * Caller ensures source exists (all backing_ba's must have objects),
         * typically indirectly by virtue of the NEEDS_COPY flag being set.
-        *
-        * WARNING! Checking ref_count == 1 only works because we are testing
-        *          the object embedded in the entry (entry->ba.object).
-        *          This test DOES NOT WORK if checking an object hanging off
-        *          the backing chain (entry->ba.backing_ba list) because the
-        *          vm_map_backing might be shared, or part of a chain that
-        *          is shared.  Checking ba->refs is worthless.
-        *
-        *          XXX since we now replicate vm_map_backing's, ref_count==1
-        *          actually works generally for non-vnodes.
         */
        source = entry->ba.object;
        KKASSERT(source);
-       vm_object_hold(source);
+       vm_object_hold_shared(source);
 
        if (source->type != OBJT_VNODE) {
                if (source->ref_count == 1 &&
@@ -810,12 +800,10 @@ vm_map_entry_shadow(vm_map_entry_t entry)
         */
        vm_map_backing_detach(&entry->ba);
        *ba = entry->ba;                /* previous ba */
-       ba->refs = 1;                   /* initialize ref count */
        entry->ba.object = result;      /* new ba (at head of entry) */
        entry->ba.backing_ba = ba;
        entry->ba.backing_count = ba->backing_count + 1;
        entry->ba.offset = 0;
-       entry->ba.refs = 0;
 
        /* cpu localization twist */
        result->pg_color = vm_quickcolor();
@@ -1088,13 +1076,8 @@ static void
 vm_map_entry_dispose_ba(vm_map_backing_t ba)
 {
        vm_map_backing_t next;
-       long refs;
 
        while (ba) {
-               refs = atomic_fetchadd_long(&ba->refs, -1);
-               if (refs > 1)
-                       break;
-               KKASSERT(refs == 1);    /* transitioned 1->0 */
                if (ba->object) {
                        vm_map_backing_detach(ba);
                        vm_object_deallocate(ba->object);
@@ -1377,7 +1360,6 @@ vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
        new_entry->ba.backing_ba = NULL;
        new_entry->ba.backing_count = 0;
        new_entry->ba.offset = offset;
-       new_entry->ba.refs = 0;
        new_entry->ba.flags = 0;
        new_entry->ba.pmap = map->pmap;
 
@@ -3429,7 +3411,6 @@ vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags)
        for (;;) {
                object = ba->object;
                ba->pmap = map->pmap;
-               ba->refs = 1;
                if (object &&
                    (entry->maptype == VM_MAPTYPE_VPAGETABLE ||
                     entry->maptype == VM_MAPTYPE_NORMAL)) {
@@ -3454,7 +3435,6 @@ vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags)
                ba = nba;
                /* pmap is replaced at the top of the loop */
        }
-       entry->ba.refs = 0;     /* base entry refs is 0 */
 }
 
 static
index 52a2cd2..abbcfda 100644 (file)
@@ -182,7 +182,6 @@ struct vm_map_backing {
        };
 
        vm_ooffset_t            offset;         /* absolute offset in obj */
-       long                    refs;           /* shared refs */
        uint32_t                flags;
        uint32_t                backing_count;  /* #entries backing us */
 };
index 35cad30..307042c 100644 (file)
@@ -1472,9 +1472,17 @@ vm_object_page_remove_callback(vm_page_t p, void *data)
        }
 
        /*
-        * Destroy the page
+        * Destroy the page.  But we have to re-test whether its dirty after
+        * removing it from its pmaps.
         */
        vm_page_protect(p, VM_PROT_NONE);
+       if (info->limit && p->valid) {
+               vm_page_test_dirty(p);
+               if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
+                       vm_page_wakeup(p);
+                       goto done;
+               }
+       }
        vm_page_free(p);
 
        /*
index f56bfaa..583619d 100644 (file)
 #include <vm/vm_page2.h>
 #include <sys/spinlock2.h>
 
+struct vm_page_hash_elm {
+       vm_page_t       m;
+       int             ticks;
+       int             unused01;
+};
+
 /*
  * SET - Minimum required set associative size, must be a power of 2.  We
- *      want this to match or exceed the set-associativeness of the cpu.
- *
- * GRP - A larger set that allows bleed-over into the domains of other
- *      nearby cpus.  Also must be a power of 2.  Used by the page zeroing
- *      code to smooth things out a bit.
+ *      want this to match or exceed the set-associativeness of the cpu,
+ *      up to a reasonable limit (we will use 16).
  */
-#define PQ_SET_ASSOC           16
-#define PQ_SET_ASSOC_MASK      (PQ_SET_ASSOC - 1)
-
-#define PQ_GRP_ASSOC           (PQ_SET_ASSOC * 2)
-#define PQ_GRP_ASSOC_MASK      (PQ_GRP_ASSOC - 1)
+__read_mostly static int set_assoc_mask = 16 - 1;
 
 static void vm_page_queue_init(void);
 static void vm_page_free_wakeup(void);
 static vm_page_t vm_page_select_cache(u_short pg_color);
-static vm_page_t _vm_page_list_find2(int basequeue, int index);
+static vm_page_t _vm_page_list_find2(int basequeue, int index, int *lastp);
 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
 
@@ -131,7 +130,8 @@ static struct alist vm_contig_alist;
 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
 
-static struct vm_page **vm_page_hash;
+static struct vm_page_hash_elm *vm_page_hash;
+__read_mostly static int vm_page_hash_size;
 
 static u_long vm_dma_reserved = 0;
 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
@@ -169,6 +169,7 @@ vm_page_queue_init(void)
        /* PQ_NONE has no queue */
 
        for (i = 0; i < PQ_COUNT; i++) {
+               vm_page_queues[i].lastq = -1;
                TAILQ_INIT(&vm_page_queues[i].pl);
                spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
        }
@@ -698,8 +699,28 @@ vm_page_startup_finish(void *dummy __unused)
        alist_blk_t xcount;
        alist_blk_t bfree;
        vm_page_t m;
-       vm_page_t *mp;
+       struct vm_page_hash_elm *mp;
+       int mask;
+
+       /*
+        * Set the set_assoc_mask based on the fitted number of CPUs.
+        * This is a mask, so we subject 1.
+        *
+        * w/PQ_L2_SIZE = 1024:
+        *
+        *      Don't let the associativity drop below 8.  So if we have
+        *      256 CPUs, two hyper-threads will wind up sharing.  The
+        *      maximum is PQ_L2_SIZE.
+        */
+       mask = PQ_L2_SIZE / ncpus_fit - 1;
+       if (mask < 7)           /* minimum is 8-way w/256 CPU threads */
+               mask = 7;
+       cpu_ccfence();
+       set_assoc_mask = mask;
 
+       /*
+        * Return part of the initial reserve back to the system
+        */
        spin_lock(&vm_contig_spin);
        for (;;) {
                bfree = alist_free_info(&vm_contig_alist, &blk, &count);
@@ -768,13 +789,22 @@ vm_page_startup_finish(void *dummy __unused)
                (PAGE_SIZE / 1024),
                (intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
 
+       /*
+        * Power of 2
+        */
+       vm_page_hash_size = 4096;
+       while (vm_page_hash_size < (vm_page_array_size / 16))
+               vm_page_hash_size <<= 1;
+       if (vm_page_hash_size > 1024*1024)
+               vm_page_hash_size = 1024*1024;
+
        /*
         * hash table for vm_page_lookup_quick()
         */
        mp = (void *)kmem_alloc3(&kernel_map,
-                                vm_page_array_size * sizeof(vm_page_t),
+                                vm_page_hash_size * sizeof(*vm_page_hash),
                                 VM_SUBSYS_VMPGHASH, KM_CPU(0));
-       bzero(mp, vm_page_array_size * sizeof(vm_page_t));
+       bzero(mp, vm_page_hash_size * sizeof(*mp));
        cpu_sfence();
        vm_page_hash = mp;
 }
@@ -1287,17 +1317,6 @@ vm_page_hold(vm_page_t m)
 {
        atomic_add_int(&m->hold_count, 1);
        KKASSERT(m->queue - m->pc != PQ_FREE);
-#if 0
-       vm_page_spin_lock(m);
-       atomic_add_int(&m->hold_count, 1);
-       if (m->queue - m->pc == PQ_FREE) {
-               _vm_page_queue_spin_lock(m);
-               _vm_page_rem_queue_spinlocked(m);
-               _vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
-               _vm_page_queue_spin_unlock(m);
-       }
-       vm_page_spin_unlock(m);
-#endif
 }
 
 /*
@@ -1472,15 +1491,18 @@ vm_page_remove(vm_page_t m)
 
 /*
  * Calculate the hash position for the vm_page hash heuristic.
+ *
+ * Mask by ~3 to offer 4-way set-assoc
  */
 static __inline
-struct vm_page **
+struct vm_page_hash_elm *
 vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
 {
        size_t hi;
 
-       hi = (uintptr_t)object % (uintptr_t)vm_page_array_size + pindex;
-       hi %= vm_page_array_size;
+       hi = ((object->pg_color << 8) ^ (uintptr_t)object) + (pindex << 2);
+       hi &= vm_page_hash_size - 1;
+       hi &= ~3;
        return (&vm_page_hash[hi]);
 }
 
@@ -1493,25 +1515,29 @@ vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
 vm_page_t
 vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
 {
-       struct vm_page **mp;
+       struct vm_page_hash_elm *mp;
        vm_page_t m;
+       int i;
 
        if (vm_page_hash == NULL)
                return NULL;
        mp = vm_page_hash_hash(object, pindex);
-       m = *mp;
-       cpu_ccfence();
-       if (m == NULL)
-               return NULL;
-       if (m->object != object || m->pindex != pindex)
-               return NULL;
-       if (vm_page_sbusy_try(m))
-               return NULL;
-       if (m->object != object || m->pindex != pindex) {
-               vm_page_wakeup(m);
-               return NULL;
+       for (i = 0; i < 4; ++i) {
+               m = mp[i].m;
+               cpu_ccfence();
+               if (m == NULL)
+                       continue;
+               if (m->object != object || m->pindex != pindex)
+                       continue;
+               if (vm_page_sbusy_try(m))
+                       continue;
+               if (m->object == object && m->pindex == pindex) {
+                       mp[i].ticks = ticks;
+                       return m;
+               }
+               vm_page_sbusy_drop(m);
        }
-       return m;
+       return NULL;
 }
 
 /*
@@ -1522,14 +1548,31 @@ static __inline
 void
 vm_page_hash_enter(vm_page_t m)
 {
-       struct vm_page **mp;
+       struct vm_page_hash_elm *mp;
+       struct vm_page_hash_elm *best;
+       int i;
 
        if (vm_page_hash &&
            m > &vm_page_array[0] &&
            m < &vm_page_array[vm_page_array_size]) {
                mp = vm_page_hash_hash(m->object, m->pindex);
-               if (*mp != m)
-                       *mp = m;
+               best = mp;
+               for (i = 0; i < 4; ++i) {
+                       if (mp[i].m == m) {
+                               mp[i].ticks = ticks;
+                               return;
+                       }
+
+                       /*
+                        * The best choice is the oldest entry
+                        */
+                       if ((ticks - best->ticks) < (ticks - mp[i].ticks) ||
+                           (int)(ticks - mp[i].ticks) < 0) {
+                               best = &mp[i];
+                       }
+               }
+               best->m = m;
+               best->ticks = ticks;
        }
 }
 
@@ -1830,6 +1873,7 @@ _vm_page_list_find(int basequeue, int index)
                                continue;
                        KKASSERT(m->queue == basequeue + index);
                        _vm_page_rem_queue_spinlocked(m);
+                       pq->lastq = -1;
                        return(m);
                }
                spin_unlock(&pq->spin);
@@ -1837,9 +1881,14 @@ _vm_page_list_find(int basequeue, int index)
 
        /*
         * If we are unable to get a page, do a more involved NUMA-aware
-        * search.
+        * search.  However, to avoid re-searching empty queues over and
+        * over again skip to pq->last if appropriate.
         */
-       m = _vm_page_list_find2(basequeue, index);
+       if (pq->lastq >= 0)
+               index = pq->lastq;
+
+       m = _vm_page_list_find2(basequeue, index, &pq->lastq);
+
        return(m);
 }
 
@@ -1848,11 +1897,12 @@ _vm_page_list_find(int basequeue, int index)
  * a nearby (NUMA-aware) queue.
  */
 static vm_page_t
-_vm_page_list_find2(int basequeue, int index)
+_vm_page_list_find2(int basequeue, int index, int *lastp)
 {
        struct vpgqueues *pq;
        vm_page_t m = NULL;
-       int pqmask = PQ_SET_ASSOC_MASK >> 1;
+       int pqmask = set_assoc_mask >> 1;
+       int pqstart = 0;
        int pqi;
        int i;
 
@@ -1860,8 +1910,10 @@ _vm_page_list_find2(int basequeue, int index)
        pq = &vm_page_queues[basequeue];
 
        /*
-        * Run local sets of 16, 32, 64, 128, and the whole queue if all
-        * else fails (PQ_L2_MASK which is 255).
+        * Run local sets of 16, 32, 64, 128, up to the entire queue if all
+        * else fails (PQ_L2_MASK).
+        *
+        * pqmask is a mask, 15, 31, 63, etc.
         *
         * Test each queue unlocked first, then lock the queue and locate
         * a page.  Note that the lock order is reversed, but we do not want
@@ -1870,7 +1922,7 @@ _vm_page_list_find2(int basequeue, int index)
         */
        do {
                pqmask = (pqmask << 1) | 1;
-               for (i = 0; i <= pqmask; ++i) {
+               for (i = pqstart; i <= pqmask; ++i) {
                        pqi = (index & ~pqmask) | ((index + i) & pqmask);
                        if (TAILQ_FIRST(&pq[pqi].pl)) {
                                spin_lock(&pq[pqi].spin);
@@ -1879,11 +1931,19 @@ _vm_page_list_find2(int basequeue, int index)
                                                continue;
                                        KKASSERT(m->queue == basequeue + pqi);
                                        _vm_page_rem_queue_spinlocked(m);
+
+                                       /*
+                                        * If we had to wander too far, set
+                                        * *lastp to skip past empty queues.
+                                        */
+                                       if (i >= 8)
+                                               *lastp = pqi & PQ_L2_MASK;
                                        return(m);
                                }
                                spin_unlock(&pq[pqi].spin);
                        }
                }
+               pqstart = i;
        } while (pqmask != PQ_L2_MASK);
 
        return(m);
@@ -1918,7 +1978,7 @@ vm_page_select_cache(u_short pg_color)
        vm_page_t m;
 
        for (;;) {
-               m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK);
+               m = _vm_page_list_find(PQ_CACHE, pg_color);
                if (m == NULL)
                        break;
                /*
@@ -1968,7 +2028,7 @@ vm_page_select_free(u_short pg_color)
        vm_page_t m;
 
        for (;;) {
-               m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK);
+               m = _vm_page_list_find(PQ_FREE, pg_color);
                if (m == NULL)
                        break;
                if (vm_page_busy_try(m, TRUE)) {
@@ -2540,6 +2600,17 @@ vm_page_activate(vm_page_t m)
        }
 }
 
+void
+vm_page_soft_activate(vm_page_t m)
+{
+       if (m->queue - m->pc == PQ_ACTIVE || (m->flags & PG_FICTITIOUS)) {
+               if (m->act_count < ACT_INIT)
+                       m->act_count = ACT_INIT;
+       } else {
+               vm_page_activate(m);
+       }
+}
+
 /*
  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
  * routine is called when a page has been added to the cache or free
@@ -2615,6 +2686,8 @@ void
 vm_page_free_toq(vm_page_t m)
 {
        mycpu->gd_cnt.v_tfree++;
+       if (m->flags & (PG_MAPPED | PG_WRITEABLE))
+               pmap_mapped_sync(m);
        KKASSERT((m->flags & PG_MAPPED) == 0);
        KKASSERT(m->busy_count & PBUSY_LOCKED);
 
@@ -2792,21 +2865,8 @@ vm_page_unwire(vm_page_t m, int activate)
                                ;
                        } else if (activate || (m->flags & PG_NEED_COMMIT)) {
                                vm_page_activate(m);
-#if 0
-                               vm_page_spin_lock(m);
-                               _vm_page_add_queue_spinlocked(m,
-                                                       PQ_ACTIVE + m->pc, 0);
-                               _vm_page_and_queue_spin_unlock(m);
-#endif
                        } else {
                                vm_page_deactivate(m);
-#if 0
-                               vm_page_spin_lock(m);
-                               vm_page_flag_clear(m, PG_WINATCFLS);
-                               _vm_page_add_queue_spinlocked(m,
-                                                       PQ_INACTIVE + m->pc, 0);
-                               _vm_page_and_queue_spin_unlock(m);
-#endif
                        }
                }
        }
@@ -2980,12 +3040,18 @@ vm_page_cache(vm_page_t m)
         * Already in the cache (and thus not mapped)
         */
        if ((m->queue - m->pc) == PQ_CACHE) {
+               if (m->flags & (PG_MAPPED | PG_WRITEABLE))
+                       pmap_mapped_sync(m);
                KKASSERT((m->flags & PG_MAPPED) == 0);
                vm_page_wakeup(m);
                return;
        }
 
+#if 0
        /*
+        * REMOVED - it is possible for dirty to get set at any time as
+        *           long as the page is still mapped and writeable.
+        *
         * Caller is required to test m->dirty, but note that the act of
         * removing the page from its maps can cause it to become dirty
         * on an SMP system due to another cpu running in usermode.
@@ -2994,6 +3060,7 @@ vm_page_cache(vm_page_t m)
                panic("vm_page_cache: caching a dirty page, pindex: %ld",
                        (long)m->pindex);
        }
+#endif
 
        /*
         * Remove all pmaps and indicate that the page is not
@@ -3002,6 +3069,7 @@ vm_page_cache(vm_page_t m)
         * everything.
         */
        vm_page_protect(m, VM_PROT_NONE);
+       pmap_mapped_sync(m);
        if ((m->flags & (PG_UNMANAGED | PG_MAPPED)) ||
            (m->busy_count & PBUSY_MASK) ||
            m->wire_count || m->hold_count) {
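
Hedged sketch (the helper name is hypothetical, not from the patch): the removed panic assumed m->dirty was stable at entry, but a writeable mapping on another cpu can dirty the page at any moment. The ordering the new code relies on is: tear the mappings down, synchronize the flags, and only then treat m->dirty as authoritative.

/*
 * Hypothetical helper: make the dirty state testable.  The page must
 * be hard-busied by the caller.
 */
static int
page_is_clean_for_sure(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);	/* no further managed writes can land */
	pmap_mapped_sync(m);			/* settle PG_MAPPED / PG_WRITEABLE */
	vm_page_test_dirty(m);			/* harvest any remaining modified bits */
	return (m->dirty == 0);
}
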
@@ -3558,14 +3626,17 @@ vm_page_is_valid(vm_page_t m, int base, int size)
 }
 
 /*
- * update dirty bits from pmap/mmu.  May not block.
+ * Update dirty bits from pmap/mmu.  May not block.
  *
  * Caller must hold the page busy
+ *
+ * WARNING! Unless the page has been unmapped, this function only
+ *         provides a likely dirty status.
  */
 void
 vm_page_test_dirty(vm_page_t m)
 {
-       if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
+       if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) {
                vm_page_dirty(m);
        }
 }
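
Hedged sketch (helper name hypothetical): the warning above means vm_page_test_dirty() can strengthen m->dirty but can never prove the page clean while writeable mappings remain, so a zero result is only a hint until the page has been unmapped and synchronized as in the previous sketch.

/*
 * Hypothetical helper: "looks clean right now", valid only as a
 * heuristic while the page may still be mapped writeable.
 */
static int
page_looks_clean(vm_page_t m)
{
	vm_page_test_dirty(m);		/* may pull modified bits into m->dirty */
	return (m->dirty == 0);		/* hint only; a live mapping can re-dirty it */
}
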
index ca7cbd5..060b151 100644 (file)
@@ -186,11 +186,16 @@ typedef struct vm_page *vm_page_t;
  * In today's world of many-core systems, we must be able to provide enough VM
  * page queues for each logical cpu thread to cover the L1/L2/L3 cache set
  * associativity.  If we don't, the cpu caches will not be properly utilized.
- * Using 2048 allows 8-way set-assoc with 256 logical cpus.
+ *
+ * Using 2048 allows 8-way set-assoc with 256 logical cpus, but seems to
+ * have a number of downsides when queues are asymmetrically starved.
+ *
+ * Using 1024 allows 4-way set-assoc with 256 logical cpus, and more with
+ * fewer cpus.
  */
 #define PQ_PRIME1 31   /* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_PRIME2 23   /* Prime number somewhat less than PQ_HASH_SIZE */
-#define PQ_L2_SIZE 2048        /* Must be enough for maximal ncpus x hw set-assoc */
+#define PQ_L2_SIZE 1024        /* Must be enough for maximal ncpus x hw set-assoc */
 #define PQ_L2_MASK     (PQ_L2_SIZE - 1)
 
 #define PQ_NONE                0
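
Worked sizing example (illustrative, following the comment above): dividing PQ_L2_SIZE by the cpu count gives the per-cpu queue coverage being described, so 1024 / 256 = 4 colors per cpu (4-way set-associativity coverage) on a 256-thread machine and 1024 / 64 = 16 on a 64-thread machine, versus 2048 / 256 = 8 (8-way) with the old table size.
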
@@ -232,6 +237,7 @@ struct vpgqueues {
        long    lcnt;
        long    adds;           /* heuristic, add operations */
        int     cnt_offset;     /* offset into vmstats structure (int) */
+       int     lastq;          /* heuristic, skip empty queues */
 } __aligned(64);
 
 extern struct vpgqueues vm_page_queues[PQ_COUNT];
@@ -248,15 +254,21 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
  *  PG_MAPPED and PG_WRITEABLE flags are not applicable.
  *
  *  PG_MAPPED only applies to managed pages, indicating whether the page
- *  is mapped onto one or more pmaps.  A page might still be mapped to
+ *  MIGHT be mapped onto one or more pmaps.  A page might still be mapped to
  *  special pmaps in an unmanaged fashion, for example when mapped into a
  *  buffer cache buffer, without setting PG_MAPPED.
  *
+ *  PG_MAPPED can only be tested for NOT being set after a pmap_mapped_sync()
+ *  call made while the page is hard-busied.
+ *
  *  PG_WRITEABLE indicates that there may be a writeable managed pmap entry
  *  somewhere, and that the page can be dirtied by hardware at any time
  *  and may have to be tested for that.  The modified bit in unmanaged
  *  mappings or in the special clean map is not tested.
  *
+ *  PG_WRITEABLE can only be tested for NOT being set after a
+ *  pmap_mapped_sync() call made while the page is hard-busied.
+ *
  *  PG_SWAPPED indicates that the page is backed by a swap block.  Any
  *  VM object type other than OBJT_DEFAULT can have swap-backed pages now.
  */
@@ -264,8 +276,8 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define        PG_UNUSED0002   0x00000002
 #define PG_WINATCFLS   0x00000004      /* flush dirty page on inactive q */
 #define        PG_FICTITIOUS   0x00000008      /* physical page doesn't exist (O) */
-#define        PG_WRITEABLE    0x00000010      /* page is writeable */
-#define PG_MAPPED      0x00000020      /* page is mapped (managed) */
+#define        PG_WRITEABLE    0x00000010      /* page may be writeable */
+#define PG_MAPPED      0x00000020      /* page may be mapped (managed) */
 #define        PG_UNUSED0040   0x00000040
 #define PG_REFERENCED  0x00000080      /* page has been referenced */
 #define PG_CLEANCHK    0x00000100      /* page will be checked for cleaning */
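
Hedged sketch of the test pattern the PG_MAPPED/PG_WRITEABLE rules above imply (helper name hypothetical; the same shape appears in the vm_page_protect() and pageout hunks further down): since both are now "might be" bits, a clear bit only proves anything after pmap_mapped_sync() has run with the page hard-busied.

/*
 * Hypothetical helper: returns non-zero only when the page is known
 * to have no managed mappings left.
 */
static int
page_known_unmapped(vm_page_t m)
{
	KKASSERT(m->busy_count & PBUSY_LOCKED);		/* must be hard-busied */
	return ((pmap_mapped_sync(m) & PG_MAPPED) == 0);
}
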
@@ -383,6 +395,7 @@ void vm_page_wakeup(vm_page_t m);
 void vm_page_hold(vm_page_t);
 void vm_page_unhold(vm_page_t);
 void vm_page_activate (vm_page_t);
+void vm_page_soft_activate (vm_page_t);
 
 vm_size_t vm_contig_avail_pages(void);
 vm_page_t vm_page_alloc (struct vm_object *, vm_pindex_t, int);
index e59572f..bf99237 100644 (file)
@@ -334,11 +334,13 @@ vm_page_protect(vm_page_t m, int prot)
 {
        KKASSERT(m->busy_count & PBUSY_LOCKED);
        if (prot == VM_PROT_NONE) {
-               if (m->flags & (PG_WRITEABLE|PG_MAPPED)) {
+               if (pmap_mapped_sync(m)) {
                        pmap_page_protect(m, VM_PROT_NONE);
                        /* PG_WRITEABLE & PG_MAPPED cleared by call */
                }
-       } else if ((prot == VM_PROT_READ) && (m->flags & PG_WRITEABLE)) {
+       } else if ((prot == VM_PROT_READ) &&
+                  (m->flags & PG_WRITEABLE) &&
+                  (pmap_mapped_sync(m) & PG_WRITEABLE)) {
                pmap_page_protect(m, VM_PROT_READ);
                /* PG_WRITEABLE cleared by call */
        }
index 8cff802..7e605b1 100644 (file)
@@ -597,7 +597,8 @@ vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
         * longer tracks it so we have to make sure that it is staged for
         * potential flush action.
         */
-       if ((p->flags & PG_MAPPED) == 0) {
+       if ((p->flags & PG_MAPPED) == 0 ||
+           (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
                if (p->queue - p->pc == PQ_ACTIVE) {
                        vm_page_deactivate(p);
                }
@@ -1593,6 +1594,7 @@ vm_pageout_scan_cache(long avail_shortage, int pass,
                        vm_page_wakeup(m);
                        continue;
                }
+               pmap_mapped_sync(m);
                KKASSERT((m->flags & PG_MAPPED) == 0);
                KKASSERT(m->dirty == 0);
                vm_pageout_page_free(m);