kernel - Simplify umtx_sleep and umtx_wakeup support
author Matthew Dillon <dillon@apollo.backplane.com>
Sun, 15 Oct 2017 17:54:59 +0000 (10:54 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Mon, 16 Oct 2017 18:30:23 +0000 (11:30 -0700)
* Rip out the vm_page_action / vm_page_event() API.  This code was
  fairly SMP-unfriendly and created serious bottlenecks in heavily
  threaded user programs using mutexes.

* Replace it with a simpler mechanism that wakes up any UMTX-domain
  tsleeps after a fork().

* Implement a 4uS spin loop in umtx_sleep(), similar to what the
  pipe code does.
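
Below is a minimal, hypothetical userland sketch (not part of this commit)
of the futex-style locking pattern these syscalls serve.  The prototypes
follow umtx(2); the lock layout, helper names, and use of GCC atomic
builtins are illustrative assumptions.

    /*
     * Illustrative only: a minimal futex-style lock built on
     * umtx_sleep()/umtx_wakeup().  Contended-state bookkeeping, error
     * handling, and fairness are omitted.
     */

    /* Prototypes as documented in umtx(2); the header location may vary. */
    int umtx_sleep(volatile const int *ptr, int value, int timeout);
    int umtx_wakeup(volatile const int *ptr, int count);

    static void
    demo_lock(volatile int *mtx)        /* 0 = unlocked, 1 = locked */
    {
            while (!__sync_bool_compare_and_swap(mtx, 0, 1)) {
                    /*
                     * Sleep only while *mtx still reads 1.  The kernel
                     * re-checks the value (and with this commit polls for
                     * ~4uS first), so racing against demo_unlock() merely
                     * returns EBUSY and we retry.  A timeout of 0 means
                     * sleep indefinitely.
                     */
                    umtx_sleep(mtx, 1, 0);
            }
    }

    static void
    demo_unlock(volatile int *mtx)
    {
            __sync_lock_release(mtx);       /* store 0 with release semantics */
            umtx_wakeup(mtx, 1);            /* wake at most one sleeper */
    }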

sys/kern/kern_fork.c
sys/kern/kern_umtx.c
sys/vm/vm_fault.c
sys/vm/vm_page.c
sys/vm/vm_page.h
sys/vm/vm_page2.h

index 6dd53d3..d6d5e5a 100644 (file)
@@ -106,6 +106,26 @@ rb_lwp_compare(struct lwp *lp1, struct lwp *lp2)
 
 RB_GENERATE2(lwp_rb_tree, lwp, u.lwp_rbnode, rb_lwp_compare, lwpid_t, lwp_tid);
 
+/*
+ * When forking, memory underpinning umtx-supported mutexes may be marked
+ * COW, causing the physical address to change.  We must wake up any threads
+ * blocked on that physical address to allow them to re-resolve their mapping.
+ */
+static void
+wake_umtx_threads(struct proc *p1)
+{
+       struct lwp *lp;
+       struct thread *td;
+
+       RB_FOREACH(lp, lwp_rb_tree, &p1->p_lwp_tree) {
+               td = lp->lwp_thread;
+               if (td && (td->td_flags & TDF_TSLEEPQ) &&
+                   (td->td_wdomain & PDOMAIN_MASK) == PDOMAIN_UMTX) {
+                       wakeup_domain(td->td_wchan, PDOMAIN_UMTX);
+               }
+       }
+}
+
 /*
  * fork() system call
  */
@@ -312,6 +332,7 @@ fork1(struct lwp *lp1, int flags, struct proc **procp)
                }
 
                vm_fork(p1, 0, flags);
+               wake_umtx_threads(p1);
 
                /*
                 * Close all file descriptors.
@@ -634,6 +655,7 @@ fork1(struct lwp *lp1, int flags, struct proc **procp)
        PHOLD(p1);
 
        vm_fork(p1, p2, flags);
+       wake_umtx_threads(p1);
 
        /*
         * Create the first lwp associated with the new proc.
index a2ce14f..ec3af2c 100644 (file)
@@ -47,6 +47,7 @@
 #include <sys/sysunion.h>
 #include <sys/sysent.h>
 #include <sys/syscall.h>
+#include <sys/sysctl.h>
 #include <sys/module.h>
 
 #include <cpu/lwbuf.h>
 
 #include <machine/vmm.h>
 
-static void umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action);
+/*
+ * Improve umtx performance by polling for 4uS before going to sleep.
+ * This can avoid many IPIs in typical pthreads mutex situations.
+ */
+#ifdef _RDTSC_SUPPORTED_
+static int umtx_delay = 4000;
+SYSCTL_INT(_kern, OID_AUTO, umtx_delay, CTLFLAG_RW, &umtx_delay, 0, "");
+#endif
 
 /*
  * If the contents of the userland-supplied pointer matches the specified
@@ -101,12 +109,11 @@ sys_umtx_sleep(struct umtx_sleep_args *uap)
 {
     struct lwbuf lwb_cache;
     struct lwbuf *lwb;
-    struct vm_page_action action;
     vm_page_t m;
     void *waddr;
     int offset;
     int timeout;
-    int error = EBUSY;
+    int error;
 
     if (uap->timeout < 0)
        return (EINVAL);
@@ -125,12 +132,23 @@ sys_umtx_sleep(struct umtx_sleep_args *uap)
      * Otherwise the physical page we sleep on may not match the page
      * being woken up.
      *
+     * The returned page is held, and this hold count prevents it from
+     * being paged out.  This is important since we are sleeping on what
+     * is essentially a physical address (which avoids all sorts of
+     * collision-space issues).
+     *
      * WARNING! We can only use vm_fault_page*() for reading data.  We
      *         cannot use it for writing data because there is no pmap
      *         interlock to protect against flushes/pageouts.
+     *
+     *         (XXX with recent code work, this may no longer be an issue)
+     *
+     * WARNING! If the user program replaces the mapping of the underlying
+     *         uap->ptr, the physical address may change and the umtx code
+     *         will not be able to match wakeups with tsleeps.
      */
     m = vm_fault_page_quick((vm_offset_t)uap->ptr,
-                           VM_PROT_READ|VM_PROT_WRITE, &error, NULL);
+                           VM_PROT_READ | VM_PROT_WRITE, &error, NULL);
     if (m == NULL) {
        error = EFAULT;
        goto done;
@@ -138,38 +156,72 @@ sys_umtx_sleep(struct umtx_sleep_args *uap)
     lwb = lwbuf_alloc(m, &lwb_cache);
     offset = (vm_offset_t)uap->ptr & PAGE_MASK;
 
-    /*
-     * The critical section is required to interlock the tsleep against
-     * a wakeup from another cpu.  The lfence forces synchronization.
-     */
+    error = EBUSY;
     if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) {
+#ifdef _RDTSC_SUPPORTED_
+       /*
+        * Poll a little while before sleeping; most mutexes are
+        * short-lived.
+        */
+       if (umtx_delay) {
+               int64_t tsc_target;
+               int good = 0;
+
+               tsc_target = tsc_get_target(umtx_delay);
+               while (tsc_test_target(tsc_target) == 0) {
+                       cpu_lfence();
+                       if (*(int *)(lwbuf_kva(lwb) + offset) != uap->value) {
+                               good = 1;
+                               break;
+                       }
+                       cpu_pause();
+               }
+               if (good) {
+                       error = EBUSY;
+                       goto skip;
+               }
+       }
+#endif
+       /*
+        * Calculate the timeout.  This will be accurate to within ~2 ticks.
+        */
        if ((timeout = uap->timeout) != 0) {
            timeout = (timeout / 1000000) * hz +
                      ((timeout % 1000000) * hz + 999999) / 1000000;
        }
+
+       /*
+        * Calculate the physical address of the mutex.  This gives us
+        * good distribution between unrelated processes using the
+        * feature.
+        */
        waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset);
 
        /*
         * Wake us up if the memory location COWs while we are sleeping.
+        * Use a critical section to tighten up the interlock.  Also,
+        * tsleep_remove() requires the caller be in a critical section.
         */
        crit_enter();
-       vm_page_init_action(m, &action, umtx_sleep_page_action_cow, waddr);
-       vm_page_register_action(&action, VMEVENT_COW);
 
        /*
         * We must interlock just before sleeping.  If we interlock before
         * registration the lock operations done by the registration can
         * interfere with it.
+        *
+        * We cannot leave our interlock hanging on return because this
+        * will interfere with umtx_wakeup() calls with limited wakeup
+        * counts.
         */
        tsleep_interlock(waddr, PCATCH | PDOMAIN_UMTX);
-       if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value &&
-           action.event == VMEVENT_COW) {
+       cpu_lfence();
+       if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) {
                error = tsleep(waddr, PCATCH | PINTERLOCKED | PDOMAIN_UMTX,
                               "umtxsl", timeout);
        } else {
+               tsleep_remove(curthread);
                error = EBUSY;
        }
-       vm_page_unregister_action(&action);
        crit_exit();
        /* Always break out in case of signal, even if restartable */
        if (error == ERESTART)
@@ -177,24 +229,13 @@ sys_umtx_sleep(struct umtx_sleep_args *uap)
     } else {
        error = EBUSY;
     }
-
+skip:
     lwbuf_free(lwb);
     vm_page_unhold(m);
 done:
     return(error);
 }
 
-/*
- * If this page is being copied it may no longer represent the page
- * underlying our virtual address.  Wake up any umtx_sleep()'s
- * that were waiting on its physical address to force them to retry.
- */
-static void
-umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action)
-{
-    wakeup_domain(action->data, PDOMAIN_UMTX);
-}
-
 /*
  * umtx_wakeup { const int *ptr, int count }
  *
@@ -227,7 +268,7 @@ sys_umtx_wakeup(struct umtx_wakeup_args *uap)
     if ((vm_offset_t)uap->ptr & (sizeof(int) - 1))
        return (EFAULT);
     m = vm_fault_page_quick((vm_offset_t)uap->ptr,
-                           VM_PROT_READ, &error, NULL);
+                           VM_PROT_READ | VM_PROT_WRITE, &error, NULL);
     if (m == NULL) {
        error = EFAULT;
        goto done;
@@ -235,12 +276,16 @@ sys_umtx_wakeup(struct umtx_wakeup_args *uap)
     offset = (vm_offset_t)uap->ptr & PAGE_MASK;
     waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset);
 
+#if 1
     if (uap->count == 1) {
        wakeup_domain_one(waddr, PDOMAIN_UMTX);
     } else {
        /* XXX wakes them all up for now */
        wakeup_domain(waddr, PDOMAIN_UMTX);
     }
+#else
+    wakeup_domain(waddr, PDOMAIN_UMTX);
+#endif
     vm_page_unhold(m);
     error = 0;
 done:
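
As an aside (not part of the patch), here is a standalone illustration of
the microsecond-to-tick conversion performed in sys_umtx_sleep() above.
The hz value is an assumption for the example; the real tick rate is a
kernel configuration parameter.

    #include <stdio.h>

    int
    main(void)
    {
            int hz = 100;           /* assumed tick rate, for illustration */
            int timeout = 2500000;  /* 2.5 seconds, expressed in microseconds */
            int ticks;

            /* Same round-up arithmetic as the kernel hunk above. */
            ticks = (timeout / 1000000) * hz +
                    ((timeout % 1000000) * hz + 999999) / 1000000;
            printf("%d us -> %d ticks\n", timeout, ticks);  /* 2500000 us -> 250 ticks */
            return 0;
    }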
index 29a1bb2..fb0d3d0 100644 (file)
@@ -322,8 +322,9 @@ RetryFault:
        /*
         * Find the vm_map_entry representing the backing store and resolve
         * the top level object and page index.  This may have the side
-        * effect of executing a copy-on-write on the map entry and/or
-        * creating a shadow object, but will not COW any actual VM pages.
+        * effect of executing a copy-on-write on the map entry,
+        * creating a shadow object, or splitting an anonymous entry for
+        * performance, but will not COW any actual VM pages.
         *
         * On success fs.map is left read-locked and various other fields 
         * are initialized but not otherwise referenced or locked.
@@ -774,11 +775,15 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
         *
         * This works great for normal programs but will always return
         * NULL for host lookups of vkernel maps in VMM mode.
+        *
+        * NOTE: pmap_fault_page_quick() might not busy the page.  If
+        *       VM_PROT_WRITE or VM_PROT_OVERRIDE_WRITE is set in
+        *       fault_type and pmap_fault_page_quick() returns non-NULL,
+        *       it will safely dirty the returned vm_page_t for us.  We
+        *       cannot safely dirty it here (it might not be busy).
         */
        fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type, busyp);
        if (fs.m) {
-               if (fault_type & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE))
-                       vm_page_dirty(fs.m);
                *errorp = 0;
                return(fs.m);
        }
@@ -2075,7 +2080,6 @@ readrest:
                                KKASSERT(fs->first_shared == 0);
                                vm_page_copy(fs->m, fs->first_m);
                                vm_page_protect(fs->m, VM_PROT_NONE);
-                               vm_page_event(fs->m, VMEVENT_COW);
                        }
 
                        /*
@@ -2415,7 +2419,6 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
                        panic("vm_fault_copy_wired: page missing");
 
                vm_page_copy(src_m, dst_m);
-               vm_page_event(src_m, VMEVENT_COW);
 
                /*
                 * Enter it in the pmap...
index 08ce9dc..aa6df20 100644 (file)
@@ -115,35 +115,11 @@ static vm_page_t vm_page_select_cache(u_short pg_color);
 static vm_page_t _vm_page_list_find2(int basequeue, int index);
 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
 
-MALLOC_DEFINE(M_ACTIONHASH, "acthash", "vmpage action hash");
-
 /*
  * Array of tailq lists
  */
 __cachealign struct vpgqueues vm_page_queues[PQ_COUNT];
 
-LIST_HEAD(vm_page_action_list, vm_page_action);
-
-/*
- * Action hash for user umtx support.  Contention is governed by both
- * tsleep/wakeup handling (kern/kern_synch.c) and action_hash[] below.
- * Because action_hash[] represents active table locks, a modest fixed
- * value well in excess of MAXCPU works here.
- *
- * There is also scan overhead depending on the number of threads in
- * umtx*() calls, so we also size the hash table based on maxproc.
- */
-struct vm_page_action_hash {
-       struct vm_page_action_list list;
-       struct lock     lk;
-} __cachealign;
-
-#define VMACTION_MINHSIZE      256
-
-struct vm_page_action_hash     *action_hash;
-static int vmaction_hsize;
-static int vmaction_hmask;
-
 static volatile int vm_pages_waiting;
 static struct alist vm_contig_alist;
 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
@@ -562,8 +538,6 @@ vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
  * allocations.
  *
  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
- *
- * Also setup the action_hash[] table here (which is only used by userland)
  */
 static void
 vm_page_startup_finish(void *dummy __unused)
@@ -574,7 +548,6 @@ vm_page_startup_finish(void *dummy __unused)
        alist_blk_t xcount;
        alist_blk_t bfree;
        vm_page_t m;
-       int i;
 
        spin_lock(&vm_contig_spin);
        for (;;) {
@@ -643,33 +616,6 @@ vm_page_startup_finish(void *dummy __unused)
                (intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
                (PAGE_SIZE / 1024),
                (intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
-
-       /*
-        * Scale the action_hash[] array.  Primary contention occurs due
-        * to cpu locks, scaled to ncpus, and scan overhead may be incurred
-        * depending on the number of threads, which we scale to maxproc.
-        *
-        * NOTE: Action lock might recurse due to callback, so allow
-        *       recursion.
-        */
-       vmaction_hsize = VMACTION_MINHSIZE;
-       if (vmaction_hsize < ncpus * 2)
-               vmaction_hsize = ncpus * 2;
-       if (vmaction_hsize < maxproc / 16)
-               vmaction_hsize = maxproc / 16;
-       vmaction_hmask = 1;
-       while (vmaction_hmask < vmaction_hsize)
-               vmaction_hmask = (vmaction_hmask << 1) | 1;
-       vmaction_hsize = vmaction_hmask + 1;
-
-       action_hash = kmalloc(sizeof(action_hash[0]) * vmaction_hsize,
-                             M_ACTIONHASH,
-                             M_WAITOK | M_ZERO);
-
-       for (i = 0; i < vmaction_hsize; i++) {
-               LIST_INIT(&action_hash[i].list);
-               lockinit(&action_hash[i].lk, "actlk", 0, LK_CANRECURSE);
-       }
 }
 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
        vm_page_startup_finish, NULL);
@@ -3274,85 +3220,6 @@ vm_page_test_dirty(vm_page_t m)
        }
 }
 
-/*
- * Register an action, associating it with its vm_page
- */
-void
-vm_page_register_action(vm_page_action_t action, vm_page_event_t event)
-{
-       struct vm_page_action_hash *hash;
-       int hv;
-
-       hv = (int)((intptr_t)action->m >> 8) & vmaction_hmask;
-       hash = &action_hash[hv];
-
-       lockmgr(&hash->lk, LK_EXCLUSIVE);
-       vm_page_flag_set(action->m, PG_ACTIONLIST);
-       action->event = event;
-       LIST_INSERT_HEAD(&hash->list, action, entry);
-       lockmgr(&hash->lk, LK_RELEASE);
-}
-
-/*
- * Unregister an action, disassociating it from its related vm_page
- */
-void
-vm_page_unregister_action(vm_page_action_t action)
-{
-       struct vm_page_action_hash *hash;
-       int hv;
-
-       hv = (int)((intptr_t)action->m >> 8) & vmaction_hmask;
-       hash = &action_hash[hv];
-       lockmgr(&hash->lk, LK_EXCLUSIVE);
-       if (action->event != VMEVENT_NONE) {
-               action->event = VMEVENT_NONE;
-               LIST_REMOVE(action, entry);
-
-               if (LIST_EMPTY(&hash->list))
-                       vm_page_flag_clear(action->m, PG_ACTIONLIST);
-       }
-       lockmgr(&hash->lk, LK_RELEASE);
-}
-
-/*
- * Issue an event on a VM page.  Corresponding action structures are
- * removed from the page's list and called.
- *
- * If the vm_page has no more pending action events we clear its
- * PG_ACTIONLIST flag.
- */
-void
-vm_page_event_internal(vm_page_t m, vm_page_event_t event)
-{
-       struct vm_page_action_hash *hash;
-       struct vm_page_action *scan;
-       struct vm_page_action *next;
-       int hv;
-       int all;
-
-       hv = (int)((intptr_t)m >> 8) & vmaction_hmask;
-       hash = &action_hash[hv];
-       all = 1;
-
-       lockmgr(&hash->lk, LK_EXCLUSIVE);
-       LIST_FOREACH_MUTABLE(scan, &hash->list, entry, next) {
-               if (scan->m == m) {
-                       if (scan->event == event) {
-                               scan->event = VMEVENT_NONE;
-                               LIST_REMOVE(scan, entry);
-                               scan->func(m, scan);
-                               /* XXX */
-                       } else {
-                               all = 0;
-                       }
-               }
-       }
-       if (all)
-               vm_page_flag_clear(m, PG_ACTIONLIST);
-       lockmgr(&hash->lk, LK_RELEASE);
-}
-
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
index 1205328..3d06ef1 100644 (file)
 
 #endif
 
-typedef enum vm_page_event { VMEVENT_NONE, VMEVENT_COW } vm_page_event_t;
-
-struct vm_page_action {
-       LIST_ENTRY(vm_page_action) entry;
-       struct vm_page          *m;
-       vm_page_event_t         event;
-       void                    (*func)(struct vm_page *,
-                                       struct vm_page_action *);
-       void                    *data;
-};
-
-typedef struct vm_page_action *vm_page_action_t;
-
 /*
  *     Management of resident (logical) pages.
  *
@@ -301,12 +288,11 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define PG_RAM         0x00002000      /* read ahead mark */
 #define PG_SWAPPED     0x00004000      /* backed by swap */
 #define PG_NOTMETA     0x00008000      /* do not back with swap */
-#define PG_ACTIONLIST  0x00010000      /* lookaside action list present */
+#define PG_UNUSED10000 0x00010000
 #define PG_SBUSY       0x00020000      /* soft-busy also set */
 #define PG_NEED_COMMIT 0x00040000      /* clean page requires commit */
 
-#define PG_KEEP_NEWPAGE_MASK   (PG_BUSY | PG_SBUSY |           \
-                                PG_WANTED | PG_ACTIONLIST)
+#define PG_KEEP_NEWPAGE_MASK   (PG_BUSY | PG_SBUSY | PG_WANTED)
 
 
 /*
@@ -451,10 +437,7 @@ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid);
 void vm_page_free_toq(vm_page_t m);
 void vm_page_free_contig(vm_page_t m, unsigned long size);
 vm_page_t vm_page_free_fromq_fast(void);
-void vm_page_event_internal(vm_page_t, vm_page_event_t);
 void vm_page_dirty(vm_page_t m);
-void vm_page_register_action(vm_page_action_t action, vm_page_event_t event);
-void vm_page_unregister_action(vm_page_action_t action);
 void vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg);
 void VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
                        int also_m_busy, const char *wmsg VM_PAGE_DEBUG_ARGS);
index 8dac2c4..c7b1930 100644 (file)
@@ -166,24 +166,6 @@ vm_paging_needed(void)
     return 0;
 }
 
-static __inline
-void
-vm_page_event(vm_page_t m, vm_page_event_t event)
-{
-    if (m->flags & PG_ACTIONLIST)
-       vm_page_event_internal(m, event);
-}
-
-static __inline
-void
-vm_page_init_action(vm_page_t m, vm_page_action_t action,
-                   void (*func)(vm_page_t, vm_page_action_t), void *data)
-{
-    action->m = m;
-    action->func = func;
-    action->data = data;
-}
-
 /*
  * Clear dirty bits in the VM page but truncate the
  * end to a DEV_BSIZE'd boundary.