kernel - Preliminary vm_page hash lookup
author     Matthew Dillon <dillon@apollo.backplane.com>
Sat, 23 Mar 2019 18:37:36 +0000 (11:37 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
Wed, 27 Mar 2019 03:32:47 +0000 (20:32 -0700)
* Add preliminary vm_page hash lookup code which avoids most
  locks, plus support in vm_fault().  Disabled by default, with debug
  counters for now.

* This code still soft-busies the vm_page, which is an improvement over
  hard-busying it in that concurrent soft-busies do not contend with each
  other, but we will eventually want to avoid all atomic ops on the
  vm_page entirely to *really* get the concurrent fault performance.
  (A sketch of the lookup pattern follows below.)
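
In isolation the pattern is: hash (object, pindex) to a slot in a lossy
pointer table, read the slot without any lock, take a shared (soft) busy
reference, then re-check the page's identity, because slots are overwritten
on collision and never explicitly invalidated.  The sketch below is a
stand-in illustration only (struct page, page_sbusy_try(), etc. are not the
kernel's API); the real code is in the vm_page.c hunks further down.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOT           1024
#define BUSY_LOCKED     0x80000000u     /* exclusive (hard) busy bit */

struct obj;                             /* opaque identity, like vm_object */

struct page {
        struct obj       *object;
        unsigned long     pindex;
        _Atomic unsigned  busy_count;   /* soft-busy count + hard-busy bit */
};

static struct page *page_hash[NSLOT];   /* heuristic cache, lossy by design */

static size_t
hash_slot(struct obj *object, unsigned long pindex)
{
        return ((uintptr_t)object / 64 + pindex) % NSLOT;
}

/* Shared (soft) busy: fails only if someone holds the exclusive busy. */
static int
page_sbusy_try(struct page *m)
{
        unsigned prev = atomic_fetch_add(&m->busy_count, 1);

        if (prev & BUSY_LOCKED) {
                atomic_fetch_sub(&m->busy_count, 1);
                return -1;
        }
        return 0;
}

static void
page_sbusy_drop(struct page *m)
{
        atomic_fetch_sub(&m->busy_count, 1);
}

/*
 * Optimistic lookup, no object lock.  The identity check runs twice:
 * once before busying (cheap rejection of stale slots) and once after,
 * because the page could have been recycled in between.  NULL means
 * "fall back to the normal locked lookup".
 */
static struct page *
page_hash_get(struct obj *object, unsigned long pindex)
{
        struct page *m = page_hash[hash_slot(object, pindex)];

        if (m == NULL || m->object != object || m->pindex != pindex)
                return NULL;
        if (page_sbusy_try(m) != 0)
                return NULL;
        if (m->object != object || m->pindex != pindex) {
                page_sbusy_drop(m);
                return NULL;
        }
        return m;                       /* returned soft-busied */
}

int
main(void)
{
        struct obj *o = (struct obj *)(uintptr_t)0x1000;
        struct page pg = { .object = o, .pindex = 5 };
        struct page *m;

        page_hash[hash_slot(o, 5)] = &pg;
        m = page_hash_get(o, 5);
        printf("hit:  %p\n", (void *)m);
        if (m)
                page_sbusy_drop(m);
        printf("miss: %p\n", (void *)page_hash_get(o, 6));
        return 0;
}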

sys/vm/vm_fault.c
sys/vm/vm_map.h
sys/vm/vm_page.c
sys/vm/vm_page.h

sys/vm/vm_fault.c
index 4cbad7e..d97ba5b 100644
@@ -142,6 +142,7 @@ struct faultstate {
        int fault_flags;
        int map_generation;
        int shared;
+       int msoftonly;
        int first_shared;
        int wflags;
        struct vnode *vp;
@@ -158,7 +159,28 @@ int vm_shared_fault = 1;
 TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
 SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW,
                &vm_shared_fault, 0, "Allow shared token on vm_object");
-
+static int vm_fault_quick_enable = 0;
+TUNABLE_INT("vm.fault_quick", &vm_fault_quick_enable);
+SYSCTL_INT(_vm, OID_AUTO, fault_quick, CTLFLAG_RW,
+               &vm_fault_quick_enable, 0, "Allow fast vm_fault shortcut");
+static long vm_fault_quick_success_count = 0;
+SYSCTL_LONG(_vm, OID_AUTO, fault_quick_success_count, CTLFLAG_RW,
+               &vm_fault_quick_success_count, 0, "");
+static long vm_fault_quick_failure_count1 = 0;
+SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count1, CTLFLAG_RW,
+               &vm_fault_quick_failure_count1, 0, "");
+static long vm_fault_quick_failure_count2 = 0;
+SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count2, CTLFLAG_RW,
+               &vm_fault_quick_failure_count2, 0, "");
+static long vm_fault_quick_failure_count3 = 0;
+SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count3, CTLFLAG_RW,
+               &vm_fault_quick_failure_count3, 0, "");
+static long vm_fault_quick_failure_count4 = 0;
+SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count4, CTLFLAG_RW,
+               &vm_fault_quick_failure_count4, 0, "");
+
+static int vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
+                       vm_prot_t fault_type);
 static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t, int);
 static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *,
                        vpte_t, int, int);
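
The shortcut stays off until vm.fault_quick is set, and the four failure
counters show which guard rejected the fast path.  A small userland probe
using only the sysctl names added above (sysctlbyname(3); error handling
and root-privilege checks omitted):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int enable = 1;
        long ok = 0;
        size_t len = sizeof(ok);

        /* Enable the fast path (needs root), then read the success counter. */
        sysctlbyname("vm.fault_quick", NULL, NULL, &enable, sizeof(enable));
        sysctlbyname("vm.fault_quick_success_count", &ok, &len, NULL, 0);
        printf("lockless faults so far: %ld\n", ok);
        return 0;
}
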
@@ -378,6 +400,7 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
        thread_t td;
        struct vm_map_ilock ilock;
        int didilock;
+       int didhold;
        int growstack;
        int retry = 0;
        int inherit_prot;
@@ -398,6 +421,12 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
                lp->lwp_flags |= LWP_PAGING;
 
 RetryFault:
+       /*
+        * vm_fault_quick() can shortcut us.
+        */
+       fs.msoftonly = 0;
+       didhold = 0;
+
        /*
         * Find the vm_map_entry representing the backing store and resolve
         * the top level object and page index.  This may have the side
@@ -600,6 +629,16 @@ RetryFault:
                fs.first_shared = 0;
        }
 
+       /*
+        * Try to shortcut the entire mess and run the fault lockless.
+        */
+       if (vm_fault_quick_enable &&
+           vm_fault_quick(&fs, first_pindex, fault_type) == KERN_SUCCESS) {
+               didilock = 0;
+               fault_flags &= ~VM_FAULT_BURST;
+               goto success;
+       }
+
        /*
         * Obtain a top-level object lock, shared or exclusive depending
         * on fs.first_shared.  If a shared lock winds up being insufficient
@@ -613,6 +652,7 @@ RetryFault:
                vm_object_hold(fs.first_object);
        if (fs.vp == NULL)
                fs.vp = vnode_pager_lock(fs.first_object);
+       didhold = 1;
 
        /*
         * The page we want is at (first_object, first_pindex), but if the
@@ -680,6 +720,8 @@ RetryFault:
                goto done;
        }
 
+success:
+
        /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
         * will contain a busied page.
@@ -694,9 +736,6 @@ RetryFault:
        if (didilock)
                vm_map_deinterlock(fs.map, &ilock);
 
-       /*KKASSERT(fs.m->queue == PQ_NONE); page-in op may deactivate page */
-       KKASSERT(fs.m->busy_count & PBUSY_LOCKED);
-
        /*
         * If the page is not wired down, then put it where the pageout daemon
         * can find it.
@@ -709,7 +748,13 @@ RetryFault:
        } else {
                vm_page_activate(fs.m);
        }
-       vm_page_wakeup(fs.m);
+       if (fs.msoftonly) {
+               KKASSERT(fs.m->busy_count & PBUSY_MASK);
+               vm_page_sbusy_drop(fs.m);
+       } else {
+               KKASSERT(fs.m->busy_count & PBUSY_LOCKED);
+               vm_page_wakeup(fs.m);
+       }
 
        /*
         * Burst in a few more pages if possible.  The fs.map should still
@@ -760,7 +805,7 @@ done_success:
 
        result = KERN_SUCCESS;
 done:
-       if (fs.first_object)
+       if (fs.first_object && didhold)
                vm_object_drop(fs.first_object);
 done2:
        if (lp)
@@ -793,6 +838,86 @@ done2:
        return (result);
 }
 
+/*
+ * Attempt a lockless vm_fault() shortcut.  The stars have to align for this
+ * to work.  But if it does we can get our page only soft-busied and not
+ * have to touch the vm_object or vnode locks at all.
+ */
+static
+int
+vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
+              vm_prot_t fault_type)
+{
+       vm_page_t m;
+       vm_object_t obj;        /* NOT LOCKED */
+
+       /*
+        * Don't waste time if the object is only being used by one vm_map.
+        */
+       obj = fs->first_object;
+       if (obj->flags & OBJ_ONEMAPPING)
+               return KERN_FAILURE;
+
+       /*
+        * Ick, can't handle this
+        */
+       if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) {
+               ++vm_fault_quick_failure_count1;
+               return KERN_FAILURE;
+       }
+
+       /*
+        * Ok, try to get the vm_page quickly via the hash table.  The
+        * page will be soft-busied on success (NOT hard-busied).
+        */
+       m = vm_page_hash_get(obj, first_pindex);
+       if (m == NULL) {
+               ++vm_fault_quick_failure_count2;
+               return KERN_FAILURE;
+       }
+       if ((obj->flags & OBJ_DEAD) ||
+           m->valid != VM_PAGE_BITS_ALL ||
+           m->queue - m->pc == PQ_CACHE ||
+           (m->flags & PG_SWAPPED)) {
+               vm_page_sbusy_drop(m);
+               ++vm_fault_quick_failure_count3;
+               return KERN_FAILURE;
+       }
+
+       /*
+        * The page is already fully valid, ACTIVE, and is not PG_SWAPPED.
+        *
+        * Don't map the page writable when emulating the dirty bit, a
+        * fault must be taken for proper emulation (vkernel).
+        */
+       if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
+           pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
+               if ((fault_type & VM_PROT_WRITE) == 0)
+                       fs->prot &= ~VM_PROT_WRITE;
+       }
+
+       /*
+        * Check write permissions.  We don't hold an object lock so the
+        * object must already be flagged writable and dirty.
+        */
+       if (fs->prot & VM_PROT_WRITE) {
+               if ((obj->flags & (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY)) !=
+                   (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY) ||
+                   m->dirty != VM_PAGE_BITS_ALL) {
+                       vm_page_sbusy_drop(m);
+                       ++vm_fault_quick_failure_count4;
+                       return KERN_FAILURE;
+               }
+               vm_set_nosync(m, fs->entry);
+       }
+       vm_page_activate(m);
+       fs->m = m;
+       fs->msoftonly = 1;
+       ++vm_fault_quick_success_count;
+
+       return KERN_SUCCESS;
+}
+
 /*
  * Fault in the specified virtual address in the current process map, 
  * returning a held VM page or NULL.  See vm_fault_page() for more 
@@ -879,6 +1004,7 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
        fs.vp = NULL;
        fs.shared = vm_shared_fault;
        fs.first_shared = vm_shared_fault;
+       fs.msoftonly = 0;
        growstack = 1;
 
        /*
@@ -1233,6 +1359,7 @@ vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
        fs.map = NULL;
        fs.shared = vm_shared_fault;
        fs.first_shared = *sharedp;
+       fs.msoftonly = 0;
        fs.vp = NULL;
        KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
 
sys/vm/vm_map.h
index b9082c6..4c647b8 100644
@@ -168,6 +168,8 @@ typedef enum {
        VM_SUBSYS_DRM_TTM,
        VM_SUBSYS_HAMMER,
 
+       VM_SUBSYS_VMPGHASH,
+
        VM_SUBSYS_LIMIT         /* end of list */
 } vm_subsys_t;
 
sys/vm/vm_page.c
index 0f79ab5..72570e5 100644
@@ -131,6 +131,8 @@ static struct alist vm_contig_alist;
 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
 
+static struct vm_page **vm_page_hash;
+
 static u_long vm_dma_reserved = 0;
 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
@@ -696,6 +698,7 @@ vm_page_startup_finish(void *dummy __unused)
        alist_blk_t xcount;
        alist_blk_t bfree;
        vm_page_t m;
+       vm_page_t *mp;
 
        spin_lock(&vm_contig_spin);
        for (;;) {
@@ -764,6 +767,16 @@ vm_page_startup_finish(void *dummy __unused)
                (intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
                (PAGE_SIZE / 1024),
                (intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
+
+       /*
+        * hash table for vm_page_lookup_quick()
+        */
+       mp = (void *)kmem_alloc3(&kernel_map,
+                                vm_page_array_size * sizeof(vm_page_t),
+                                VM_SUBSYS_VMPGHASH, KM_CPU(0));
+       bzero(mp, vm_page_array_size * sizeof(vm_page_t));
+       cpu_sfence();
+       vm_page_hash = mp;
 }
 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
        vm_page_startup_finish, NULL);
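
The table allocated above holds one vm_page_t pointer per entry in
vm_page_array, so on a 64-bit kernel the cost is 8 bytes per managed 4 KiB
page: 8/4096, roughly 0.2% of managed memory, or about 2 MiB per GiB of
RAM.  The cpu_sfence() ensures the zeroed table is globally visible before
the vm_page_hash pointer is published, since readers only check that the
pointer is non-NULL.
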
@@ -1445,6 +1458,69 @@ vm_page_remove(vm_page_t m)
        vm_object_drop(object);
 }
 
+/*
+ * Calculate the hash position for the vm_page hash heuristic.
+ */
+static __inline
+struct vm_page **
+vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
+{
+       size_t hi;
+
+       hi = (uintptr_t)object % (uintptr_t)vm_page_array_size + pindex;
+       hi %= vm_page_array_size;
+       return (&vm_page_hash[hi]);
+}
+
+/*
+ * Heuristical page lookup that does not require any locks.  Returns
+ * a soft-busied page on success, NULL on failure.
+ *
+ * Caller must lookup the page the slow way if NULL is returned.
+ */
+vm_page_t
+vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
+{
+       struct vm_page **mp;
+       vm_page_t m;
+
+       if (vm_page_hash == NULL)
+               return NULL;
+       mp = vm_page_hash_hash(object, pindex);
+       m = *mp;
+       cpu_ccfence();
+       if (m == NULL)
+               return NULL;
+       if (m->object != object || m->pindex != pindex)
+               return NULL;
+       if (vm_page_sbusy_try(m))
+               return NULL;
+       if (m->object != object || m->pindex != pindex) {
+               vm_page_wakeup(m);
+               return NULL;
+       }
+       return m;
+}
+
+/*
+ * Enter page onto vm_page_hash[].  This is a heuristic, SMP collisions
+ * are allowed.
+ */
+static __inline
+void
+vm_page_hash_enter(vm_page_t m)
+{
+       struct vm_page **mp;
+
+       if (vm_page_hash &&
+           m > &vm_page_array[0] &&
+           m < &vm_page_array[vm_page_array_size]) {
+               mp = vm_page_hash_hash(m->object, m->pindex);
+               if (*mp != m)
+                       *mp = m;
+       }
+}
+
 /*
  * Locate and return the page at (object, pindex), or NULL if the
  * page could not be found.
@@ -1461,7 +1537,10 @@ vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
         */
        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
        m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
-       KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
+       if (m) {
+               KKASSERT(m->object == object && m->pindex == pindex);
+               vm_page_hash_enter(m);
+       }
        return(m);
 }
 
@@ -1504,6 +1583,7 @@ VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
                        m->busy_func = func;
                        m->busy_line = lineno;
 #endif
+                       vm_page_hash_enter(m);
                        break;
                }
        }
@@ -1550,6 +1630,7 @@ VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
                        m->busy_func = func;
                        m->busy_line = lineno;
 #endif
+                       vm_page_hash_enter(m);
                        break;
                }
        }
@@ -1581,6 +1662,8 @@ vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
                           (m->flags & PG_FICTITIOUS)) {
                        vm_page_sbusy_drop(m);
                        m = NULL;
+               } else {
+                       vm_page_hash_enter(m);
                }
        }
        return m;
sys/vm/vm_page.h
index d10d982..6697104 100644
@@ -375,6 +375,9 @@ void vm_page_deactivate (vm_page_t);
 void vm_page_deactivate_locked (vm_page_t);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 int vm_page_insert (vm_page_t, struct vm_object *, vm_pindex_t);
+
+vm_page_t vm_page_hash_get(vm_object_t object, vm_pindex_t pindex);
+
 vm_page_t vm_page_lookup (struct vm_object *, vm_pindex_t);
 vm_page_t vm_page_lookup_sbusy_try(struct vm_object *object,
                vm_pindex_t pindex, int pgoff, int pgbytes);
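
Usage sketch only (not part of the commit): a caller tries the lockless
lookup first and falls back to the token-locked vm_page_lookup() on a miss.
The helper name below is hypothetical; the busy-release and locking rules
follow the code above.

/*
 * Hypothetical helper: check whether the page at (object, pindex) is
 * resident and fully valid, preferring the lockless hash lookup.
 */
static int
page_resident_and_valid(vm_object_t object, vm_pindex_t pindex)
{
        vm_page_t m;
        int res = 0;

        m = vm_page_hash_get(object, pindex);   /* soft-busied on success */
        if (m) {
                res = (m->valid == VM_PAGE_BITS_ALL);
                vm_page_sbusy_drop(m);          /* NOT vm_page_wakeup() */
                return res;
        }

        vm_object_hold(object);         /* vm_page_lookup() needs the token */
        m = vm_page_lookup(object, pindex);
        if (m)
                res = (m->valid == VM_PAGE_BITS_ALL);
        vm_object_drop(object);
        return res;
}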