kernel - VM rework part 13 - Core pmap work, stabilize & optimize
author     Matthew Dillon <dillon@apollo.backplane.com>
           Sun, 19 May 2019 19:59:49 +0000 (12:59 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Mon, 20 May 2019 19:39:25 +0000 (12:39 -0700)
* Refactor the vm_page_hash hash again to get a better distribution.

* I tried hashing only shared objects, but this resulted in a number of
  edge cases where program re-use could miss the optimization.

* Add a sysctl vm.page_hash_vnode_only (default off).  If turned on,
  only vm_page's associated with vnodes will be hashed.  This should
  generally not be necessary.
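
  Roughly, the new hash mixes the object pointer, the object's pg_color
  and the page index, folds the upper bits back in, and masks the result
  down to a set-aligned slot.  The following is a userland sketch of that
  mixing step only, not the kernel code; page_hash_index(), PAGE_HASH_SET
  and hash_size are illustrative stand-ins for vm_page_hash_hash(),
  VM_PAGE_HASH_SET and vm_page_hash_size.

    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_HASH_SET   4       /* set-associative width, power of 2 */

    /*
     * Mix the object pointer, color, and page index, then reduce to a
     * set-aligned index into a power-of-2 sized table.
     */
    static size_t
    page_hash_index(const void *obj, uint32_t color, uint64_t pindex,
                    size_t hash_size)
    {
            size_t hi;

            /* mix it up */
            hi = (uintptr_t)obj ^ color ^ (size_t)pindex;
            hi += (size_t)color * (size_t)pindex;
            hi = hi ^ (hi >> 20);
            hi &= hash_size - 1;                    /* bounds */
            hi &= ~(size_t)(PAGE_HASH_SET - 1);     /* set alignment */
            return hi;
    }

  A lookup then probes the PAGE_HASH_SET consecutive entries starting at
  the returned slot, and with vm.page_hash_vnode_only set to 1 only
  vnode-backed pages are entered into the table in the first place.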

* Refactor vm_page_list_find2() again to avoid all duplicate queue
  checks.  This time I mocked the algorithm up in userland and twisted
  it until it did what I wanted.
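
  A minimal userland sketch of the resulting index walk (queue contents,
  the per-queue spinlocks and the *lastp heuristic are omitted, and
  scan_mock(), L2_MASK and visited[] are made-up names, not the
  kernel's): each pass doubles the local set and skips the aligned block
  the previous pass already covered, so no queue index is checked twice.

    #include <assert.h>
    #include <stdio.h>

    #define L2_MASK         1023    /* stand-in for PQ_L2_MASK (1024 queues) */

    static int visited[L2_MASK + 1];

    static void
    scan_mock(int index, int set_assoc_mask)
    {
            int pqmask = set_assoc_mask >> 1;
            int pqi, range;
            int skip_start = -1;
            int skip_next = -1;

            index &= L2_MASK;
            do {
                    /* widen the local set: 16, 32, 64, ... */
                    pqmask = (pqmask << 1) | 1;
                    pqi = index;
                    range = pqmask + 1;

                    while (range > 0) {
                            if (pqi >= skip_start && pqi < skip_next) {
                                    /* block covered by a narrower pass */
                                    range -= skip_next - pqi;
                                    pqi = (pqi & ~pqmask) |
                                          (skip_next & pqmask);
                            }
                            if (range > 0)
                                    ++visited[pqi];  /* "check queue pqi" */
                            --range;
                            pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
                    }
                    /* remember the block this pass finished covering */
                    skip_start = pqi & ~pqmask;
                    skip_next = (pqi | pqmask) + 1;
            } while (pqmask != L2_MASK);
    }

    int
    main(void)
    {
            int i;

            scan_mock(37, 15);      /* start near queue 37, 16-way sets */
            for (i = 0; i <= L2_MASK; ++i)
                    assert(visited[i] == 1);
            printf("all %d queues visited exactly once\n", L2_MASK + 1);
            return 0;
    }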

* VM_FAULT_QUICK_DEBUG was accidentally left on; turn it off.

* Do not remove the original page from the pmap when vm_fault_object()
  must do a COW.  And just in case this is ever added back in later,
  don't do it using pmap_remove_specific() !!!  Use pmap_remove_pages()
  to avoid the backing scan lock.

  vm_fault_page() will now do this removal (for procfs rwmem); the normal
  vm_fault will of course replace the page anyway, and the umtx code now
  uses different recovery mechanisms and should be ok.

* Optimize vm_map_entry_shadow() for the situation where the old
  object is no longer shared.  Get rid of an unnecessary transient
  kmalloc() and vm_object_hold_shared().
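
  In outline, the slow-path costs in vm_map_entry_shadow() now sit below
  the early-out test, so the common unshared case allocates and holds
  nothing.  The sketch below only illustrates that reordering; struct obj,
  struct backing and shadow_sketch() are made-up stand-ins for vm_object,
  vm_map_backing and the real function.

    #include <stdbool.h>
    #include <stdlib.h>

    struct obj {
            int     ref_count;
            void    *handle;        /* non-NULL for vnode-backed objects */
            bool    anonymous;      /* models OBJT_DEFAULT / OBJT_SWAP */
    };

    struct backing {
            struct obj *object;
    };

    /*
     * Check for the unshared anonymous object before paying for the
     * transient allocation (and, in the kernel, the shared object hold).
     */
    static struct backing *
    shadow_sketch(struct obj *source)
    {
            struct backing *ba;

            if (source->anonymous &&
                source->ref_count == 1 &&
                source->handle == NULL)
                    return NULL;            /* fast path: no shadow needed */

            ba = malloc(sizeof(*ba));       /* slow path only */
            if (ba != NULL)
                    ba->object = source;
            /* the real code holds 'source' shared and chains the new ba in */
            return ba;
    }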

sys/vm/vm_fault.c
sys/vm/vm_map.c
sys/vm/vm_page.c
test/debug/vmpageinfo.c

diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index f7801c1..73755e1 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -161,10 +161,17 @@ __read_mostly int vm_shared_fault = 1;
 TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
 SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW,
                &vm_shared_fault, 0, "Allow shared token on vm_object");
-__read_mostly static int vm_fault_quick_enable = 0;
+__read_mostly static int vm_fault_quick_enable = 1;
 TUNABLE_INT("vm.fault_quick", &vm_fault_quick_enable);
 SYSCTL_INT(_vm, OID_AUTO, fault_quick, CTLFLAG_RW,
                &vm_fault_quick_enable, 0, "Allow fast vm_fault shortcut");
+
+/*
+ * Define here for debugging sysctls.  Note that these are globals, so
+ * they will cause a ton of cache line bouncing.  Only use for debugging
+ * purposes.
+ */
+/*#define VM_FAULT_QUICK_DEBUG */
 #ifdef VM_FAULT_QUICK_DEBUG
 static long vm_fault_quick_success_count = 0;
 SYSCTL_LONG(_vm, OID_AUTO, fault_quick_success_count, CTLFLAG_RW,
@@ -865,14 +872,6 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
 
        /*
         * Don't waste time if the object is only being used by one vm_map.
-        *
-        * WARNING! We can't rely on obj->ref_count here because it might
-        *          be part of a shared ba chain, and we can't rely on
-        *          ba->refs for the same reason.  The combination of it
-        *          being the ba embedded in the entry (aka first_ba) AND
-        *          ref_count == 1 would work, but OBJ_ONEMAPPING is better
-        *          because it will remain flagged even when ref_count > 1
-        *          for situations where entries are clipped.
         */
        obj = fs->first_ba->object;
        if (obj->flags & OBJ_ONEMAPPING)
@@ -2265,26 +2264,15 @@ next:
                                /*
                                 * Oh, well, lets copy it.
                                 *
-                                * Why are we unmapping the original page
-                                * here?  Well, in short, not all accessors
-                                * of user memory go through the pmap.  The
-                                * procfs code doesn't have access user memory
-                                * via a local pmap, so vm_fault_page*()
-                                * can't call pmap_enter().  And the umtx*()
-                                * code may modify the COW'd page via a DMAP
-                                * or kernel mapping and not via the pmap,
-                                * leaving the original page still mapped
-                                * read-only into the pmap.
-                                *
-                                * So we have to remove the page from at
-                                * least the current pmap if it is in it.
+                                * We used to unmap the original page here
+                                * because vm_fault_page() didn't and this
+                                * would cause havoc for the umtx*() code
+                                * and the procfs code.
                                 *
-                                * We used to just remove it from all pmaps
-                                * but that creates inefficiencies on SMP,
-                                * particularly for COW program & library
-                                * mappings that are concurrently exec'd.
-                                * Only remove the page from the current
-                                * pmap.
+                                * This is no longer necessary.  The
+                                * vm_fault_page() routine will now unmap the
+                                * page after a COW, and the umtx code will
+                                * recover on its own.
                                 */
                                /*
                                 * NOTE: Since fs->m is a backing page, it
@@ -2293,10 +2281,9 @@ next:
                                 */
                                KKASSERT(fs->first_shared == 0);
                                vm_page_copy(fs->m, fs->first_m);
-                               /*vm_page_protect(fs->m, VM_PROT_NONE);*/
-                               pmap_remove_specific(
+                               /* pmap_remove_specific(
                                    &curthread->td_lwp->lwp_vmspace->vm_pmap,
-                                   fs->m);
+                                   fs->m); */
                        }
 
                        /*
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index c478b62..aa3071c 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -740,28 +740,29 @@ vm_map_entry_shadow(vm_map_entry_t entry)
                length = 0x7FFFFFFF;
        else
                length = atop(entry->ba.end - entry->ba.start);
-       ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
 
        /*
         * Don't create the new object if the old object isn't shared.
+        * This case occurs quite often when programs fork/exec/wait.
         *
         * Caller ensures source exists (all backing_ba's must have objects),
         * typically indirectly by virtue of the NEEDS_COPY flag being set.
+        * We have a ref on source by virtue of the entry and do not need
+        * to lock it to do this test.
         */
        source = entry->ba.object;
        KKASSERT(source);
-       vm_object_hold_shared(source);
 
        if (source->type != OBJT_VNODE) {
                if (source->ref_count == 1 &&
                    source->handle == NULL &&
                    (source->type == OBJT_DEFAULT ||
                     source->type == OBJT_SWAP)) {
-                       vm_object_drop(source);
-                       kfree(ba, M_MAP_BACKING);
                        goto done;
                }
        }
+       ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
+       vm_object_hold_shared(source);
 
        /*
         * Once it becomes part of a backing_ba chain it can wind up anywhere,
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 583619d..9da2327 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -106,6 +106,9 @@ struct vm_page_hash_elm {
        int             unused01;
 };
 
+#define VM_PAGE_HASH_SET       4               /* power of 2, set-assoc */
+#define VM_PAGE_HASH_MAX       (1024 * 1024)   /* power of 2, max size */
+
 /*
  * SET - Minimum required set associative size, must be a power of 2.  We
  *      want this to match or exceed the set-associativeness of the cpu,
@@ -130,8 +133,9 @@ static struct alist vm_contig_alist;
 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
 
-static struct vm_page_hash_elm *vm_page_hash;
+__read_mostly static int vm_page_hash_vnode_only;
 __read_mostly static int vm_page_hash_size;
+__read_mostly static struct vm_page_hash_elm *vm_page_hash;
 
 static u_long vm_dma_reserved = 0;
 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
@@ -140,6 +144,14 @@ SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
            &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
 
+SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
+           &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
+#if 0
+static int vm_page_hash_debug;
+SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
+           &vm_page_hash_debug, 0, "Debug the vm_page hash table");
+#endif
+
 static int vm_contig_verbose = 0;
 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
 
@@ -706,15 +718,18 @@ vm_page_startup_finish(void *dummy __unused)
         * Set the set_assoc_mask based on the fitted number of CPUs.
         * This is a mask, so we subtract 1.
         *
-        * w/PQ_L2_SIZE = 1024:
+        * w/PQ_L2_SIZE = 1024, Don't let the associativity drop below 8.
+        * So if we have 256 CPUs, two hyper-threads will wind up sharing.
         *
-        *      Don't let the associativity drop below 8.  So if we have
-        *      256 CPUs, two hyper-threads will wind up sharing.  The
-        *      maximum is PQ_L2_SIZE.
+        * The maximum is PQ_L2_SIZE.  However, we limit the starting
+        * maximum to 16 (mask = 15) in order to improve the cache locality
+        * of related kernel data structures.
         */
        mask = PQ_L2_SIZE / ncpus_fit - 1;
        if (mask < 7)           /* minimum is 8-way w/256 CPU threads */
                mask = 7;
+       if (mask < 15)
+               mask = 15;
        cpu_ccfence();
        set_assoc_mask = mask;
 
@@ -795,8 +810,8 @@ vm_page_startup_finish(void *dummy __unused)
        vm_page_hash_size = 4096;
        while (vm_page_hash_size < (vm_page_array_size / 16))
                vm_page_hash_size <<= 1;
-       if (vm_page_hash_size > 1024*1024)
-               vm_page_hash_size = 1024*1024;
+       if (vm_page_hash_size > VM_PAGE_HASH_MAX)
+               vm_page_hash_size = VM_PAGE_HASH_MAX;
 
        /*
         * hash table for vm_page_lookup_quick()
@@ -1500,9 +1515,12 @@ vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
 {
        size_t hi;
 
-       hi = ((object->pg_color << 8) ^ (uintptr_t)object) + (pindex << 2);
-       hi &= vm_page_hash_size - 1;
-       hi &= ~3;
+       /* mix it up */
+       hi = (intptr_t)object ^ object->pg_color ^ pindex;
+       hi += object->pg_color * pindex;
+       hi = hi ^ (hi >> 20);
+       hi &= vm_page_hash_size - 1;            /* bounds */
+       hi &= ~(VM_PAGE_HASH_SET - 1);          /* set-assoc */
        return (&vm_page_hash[hi]);
 }
 
@@ -1522,7 +1540,7 @@ vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
        if (vm_page_hash == NULL)
                return NULL;
        mp = vm_page_hash_hash(object, pindex);
-       for (i = 0; i < 4; ++i) {
+       for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
                m = mp[i].m;
                cpu_ccfence();
                if (m == NULL)
@@ -1552,28 +1570,49 @@ vm_page_hash_enter(vm_page_t m)
        struct vm_page_hash_elm *best;
        int i;
 
-       if (vm_page_hash &&
-           m > &vm_page_array[0] &&
-           m < &vm_page_array[vm_page_array_size]) {
-               mp = vm_page_hash_hash(m->object, m->pindex);
-               best = mp;
-               for (i = 0; i < 4; ++i) {
-                       if (mp[i].m == m) {
-                               mp[i].ticks = ticks;
-                               return;
-                       }
+       /*
+        * Only enter type-stable vm_pages with well-shared objects.
+        */
+       if (vm_page_hash == NULL ||
+           m < &vm_page_array[0] ||
+           m >= &vm_page_array[vm_page_array_size])
+               return;
+       if (m->object == NULL)
+               return;
+#if 0
+       /*
+        * Disabled at the moment, there are some degenerate conditions
+        * with often-exec'd programs that get ignored.  In particular,
+        * the kernel's elf loader does a vn_rdwr() on the first page of
+        * a binary.
+        */
+       if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
+               return;
+#endif
+       if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
+               return;
 
-                       /*
-                        * The best choice is the oldest entry
-                        */
-                       if ((ticks - best->ticks) < (ticks - mp[i].ticks) ||
-                           (int)(ticks - mp[i].ticks) < 0) {
-                               best = &mp[i];
-                       }
+       /*
+        * Enter or update the page, replacing the oldest entry in its set.
+        */
+       mp = vm_page_hash_hash(m->object, m->pindex);
+       best = mp;
+       for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
+               if (mp[i].m == m) {
+                       mp[i].ticks = ticks;
+                       return;
+               }
+
+               /*
+                * The best choice is the oldest entry
+                */
+               if ((ticks - best->ticks) < (ticks - mp[i].ticks) ||
+                   (int)(ticks - mp[i].ticks) < 0) {
+                       best = &mp[i];
                }
-               best->m = m;
-               best->ticks = ticks;
        }
+       best->m = m;
+       best->ticks = ticks;
 }
 
 /*
@@ -1902,12 +1941,17 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
        struct vpgqueues *pq;
        vm_page_t m = NULL;
        int pqmask = set_assoc_mask >> 1;
-       int pqstart = 0;
        int pqi;
-       int i;
+       int range;
+       int skip_start;
+       int skip_next;
+       int count;
 
        index &= PQ_L2_MASK;
        pq = &vm_page_queues[basequeue];
+       count = 0;
+       skip_start = -1;
+       skip_next = -1;
 
        /*
         * Run local sets of 16, 32, 64, 128, up to the entire queue if all
@@ -1922,9 +1966,16 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
         */
        do {
                pqmask = (pqmask << 1) | 1;
-               for (i = pqstart; i <= pqmask; ++i) {
-                       pqi = (index & ~pqmask) | ((index + i) & pqmask);
-                       if (TAILQ_FIRST(&pq[pqi].pl)) {
+
+               pqi = index;
+               range = pqmask + 1;
+
+               while (range > 0) {
+                       if (pqi >= skip_start && pqi < skip_next) {
+                               range -= skip_next - pqi;
+                               pqi = (pqi & ~pqmask) | (skip_next & pqmask);
+                       }
+                       if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
                                spin_lock(&pq[pqi].spin);
                                TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
                                        if (spin_trylock(&m->spin) == 0)
@@ -1936,14 +1987,18 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
                                         * If we had to wander too far, set
                                         * *lastp to skip past empty queues.
                                         */
-                                       if (i >= 8)
+                                       if (count >= 8)
                                                *lastp = pqi & PQ_L2_MASK;
                                        return(m);
                                }
                                spin_unlock(&pq[pqi].spin);
                        }
+                       --range;
+                       ++count;
+                       pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
                }
-               pqstart = i;
+               skip_start = pqi & ~pqmask;
+               skip_next = (pqi | pqmask) + 1;
        } while (pqmask != PQ_L2_MASK);
 
        return(m);
diff --git a/test/debug/vmpageinfo.c b/test/debug/vmpageinfo.c
index 77419af..c7d0a04 100644
--- a/test/debug/vmpageinfo.c
+++ b/test/debug/vmpageinfo.c
@@ -1,7 +1,7 @@
 /*
  * VMPAGEINFO.C
  *
- * cc -I/usr/src/sys vmpageinfo.c -o /usr/local/bin/vmpageinfo -lkvm
+ * cc -I/usr/src/sys vmpageinfo.c -o ~/bin/vmpageinfo -lkvm
  *
  * vmpageinfo
  *