RB_GENERATE2(lwp_rb_tree, lwp, u.lwp_rbnode, rb_lwp_compare, lwpid_t, lwp_tid);
+/*
+ * When forking, memory underpinning umtx-supported mutexes may be set
+ * COW, causing the physical address to change. We must wake up any
+ * threads blocked on the physical address so they can re-resolve the
+ * underlying vm_page.
+ */
+static void
+wake_umtx_threads(struct proc *p1)
+{
+ struct lwp *lp;
+ struct thread *td;
+
+ RB_FOREACH(lp, lwp_rb_tree, &p1->p_lwp_tree) {
+ td = lp->lwp_thread;
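+ /*
+ * Only threads tsleep()ing in the UMTX domain are blocked on
+ * a mutex's physical address and need to be woken.
+ */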
+ if (td && (td->td_flags & TDF_TSLEEPQ) &&
+ (td->td_wdomain & PDOMAIN_MASK) == PDOMAIN_UMTX) {
+ wakeup_domain(td->td_wchan, PDOMAIN_UMTX);
+ }
+ }
+}
+
/*
* fork() system call
*/
}
vm_fork(p1, 0, flags);
+ wake_umtx_threads(p1);
/*
* Close all file descriptors.
PHOLD(p1);
vm_fork(p1, p2, flags);
+ wake_umtx_threads(p1);
/*
* Create the first lwp associated with the new proc.
#include <sys/sysunion.h>
#include <sys/sysent.h>
#include <sys/syscall.h>
+#include <sys/sysctl.h>
#include <sys/module.h>
#include <cpu/lwbuf.h>
#include <machine/vmm.h>
-static void umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action);
+/*
+ * Improve umtx performance by polling for 4 microseconds (tunable via
+ * the kern.umtx_delay sysctl, in nanoseconds) before going to sleep.
+ * This can avoid many IPIs in typical pthreads mutex situations.
+ */
+#ifdef _RDTSC_SUPPORTED_
+static int umtx_delay = 4000;
+SYSCTL_INT(_kern, OID_AUTO, umtx_delay, CTLFLAG_RW, &umtx_delay, 0, "");
+#endif
/*
* If the contents of the userland-supplied pointer matches the specified
{
struct lwbuf lwb_cache;
struct lwbuf *lwb;
- struct vm_page_action action;
vm_page_t m;
void *waddr;
int offset;
int timeout;
- int error = EBUSY;
+ int error;
if (uap->timeout < 0)
return (EINVAL);
* Otherwise the physical page we sleep on may not match the page
* being woken up.
*
+ * The returned page is held, and this hold count prevents it from
+ * being paged out. This is important since we are sleeping on what
+ * is essentially a physical address (which avoids all sorts of
+ * collision-space issues).
+ *
* WARNING! We can only use vm_fault_page*() for reading data. We
* cannot use it for writing data because there is no pmap
* interlock to protect against flushes/pageouts.
+ *
+ * (XXX with recent code work, this may no longer be an issue)
+ *
+ * WARNING! If the user program replaces the mapping of the underlying
+ * uap->ptr, the physical address may change and the umtx code
+ * will not be able to match wakeups with tsleeps.
*/
m = vm_fault_page_quick((vm_offset_t)uap->ptr,
- VM_PROT_READ|VM_PROT_WRITE, &error, NULL);
+ VM_PROT_READ | VM_PROT_WRITE, &error, NULL);
if (m == NULL) {
error = EFAULT;
goto done;
lwb = lwbuf_alloc(m, &lwb_cache);
offset = (vm_offset_t)uap->ptr & PAGE_MASK;
- /*
- * The critical section is required to interlock the tsleep against
- * a wakeup from another cpu. The lfence forces synchronization.
- */
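+ /*
+ * Default to EBUSY; the caller is expected to re-check the
+ * mutex value and retry.
+ */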
+ error = EBUSY;
if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) {
+#ifdef _RDTSC_SUPPORTED_
+ /*
+ * Poll a little while before sleeping; most mutexes are
+ * short-lived.
+ */
+ if (umtx_delay) {
+ int64_t tsc_target;
+ int good = 0;
+
+ tsc_target = tsc_get_target(umtx_delay);
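+ /*
+ * Spin until the deadline passes or the value no longer
+ * matches.
+ */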
+ while (tsc_test_target(tsc_target) == 0) {
+ cpu_lfence();
+ if (*(int *)(lwbuf_kva(lwb) + offset) != uap->value) {
+ good = 1;
+ break;
+ }
+ cpu_pause();
+ }
+ if (good) {
+ error = EBUSY;
+ goto skip;
+ }
+ }
+#endif
+ /*
+ * Calculate the timeout. This will be accurate to within ~2 ticks.
+ */
if ((timeout = uap->timeout) != 0) {
timeout = (timeout / 1000000) * hz +
((timeout % 1000000) * hz + 999999) / 1000000;
}
+
+ /*
+ * Calculate the physical address of the mutex. This gives us
+ * good distribution between unrelated processes using the
+ * feature.
+ */
waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset);
/*
* Wake us up if the memory location COWs while we are sleeping.
+ * Use a critical section to tighten up the interlock. Also,
+ * tsleep_remove() requires the caller be in a critical section.
*/
crit_enter();
- vm_page_init_action(m, &action, umtx_sleep_page_action_cow, waddr);
- vm_page_register_action(&action, VMEVENT_COW);
/*
* We must interlock just before sleeping. If we interlock before
* registration the lock operations done by the registration can
* interfere with it.
+ *
+ * We cannot leave our interlock hanging on return because this
+ * will interfere with umtx_wakeup() calls with limited wakeup
+ * counts.
*/
tsleep_interlock(waddr, PCATCH | PDOMAIN_UMTX);
- if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value &&
- action.event == VMEVENT_COW) {
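+ /*
+ * The lfence forces the value re-check below to occur after the
+ * tsleep interlock is established, closing the race against a
+ * wakeup from another cpu.
+ */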
+ cpu_lfence();
+ if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) {
error = tsleep(waddr, PCATCH | PINTERLOCKED | PDOMAIN_UMTX,
"umtxsl", timeout);
} else {
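+ /*
+ * The value changed before we could sleep. Remove ourselves
+ * from the sleep queue so the pending interlock cannot absorb
+ * a later umtx_wakeup() with a limited count.
+ */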
+ tsleep_remove(curthread);
error = EBUSY;
}
- vm_page_unregister_action(&action);
crit_exit();
/* Always break out in case of signal, even if restartable */
if (error == ERESTART)
} else {
error = EBUSY;
}
-
+skip:
lwbuf_free(lwb);
vm_page_unhold(m);
done:
return(error);
}
-/*
- * If this page is being copied it may no longer represent the page
- * underlying our virtual address. Wake up any umtx_sleep()'s
- * that were waiting on its physical address to force them to retry.
- */
-static void
-umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action)
-{
- wakeup_domain(action->data, PDOMAIN_UMTX);
-}
-
/*
* umtx_wakeup { const int *ptr, int count }
*
if ((vm_offset_t)uap->ptr & (sizeof(int) - 1))
return (EFAULT);
m = vm_fault_page_quick((vm_offset_t)uap->ptr,
- VM_PROT_READ, &error, NULL);
+ VM_PROT_READ | VM_PROT_WRITE, &error, NULL);
if (m == NULL) {
error = EFAULT;
goto done;
offset = (vm_offset_t)uap->ptr & PAGE_MASK;
waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset);
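+ /*
+ * Wakeups are matched against sleepers by physical address
+ * within the UMTX domain.
+ */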
+#if 1
if (uap->count == 1) {
wakeup_domain_one(waddr, PDOMAIN_UMTX);
} else {
/* XXX wakes them all up for now */
wakeup_domain(waddr, PDOMAIN_UMTX);
}
+#else
+ wakeup_domain(waddr, PDOMAIN_UMTX);
+#endif
vm_page_unhold(m);
error = 0;
done:
/*
* Find the vm_map_entry representing the backing store and resolve
* the top level object and page index. This may have the side
- * effect of executing a copy-on-write on the map entry and/or
- * creating a shadow object, but will not COW any actual VM pages.
+ * effect of executing a copy-on-write on the map entry,
+ * creating a shadow object, or splitting an anonymous entry for
+ * performance, but will not COW any actual VM pages.
*
* On success fs.map is left read-locked and various other fields
* are initialized but not otherwise referenced or locked.
*
* This works great for normal programs but will always return
* NULL for host lookups of vkernel maps in VMM mode.
+ *
+ * NOTE: pmap_fault_page_quick() might not busy the page. If
+ * VM_PROT_WRITE or VM_PROT_OVERRIDE_WRITE is set in
+ * fault_type and pmap_fault_page_quick() returns non-NULL,
+ * it will safely dirty the returned vm_page_t for us. We
+ * cannot safely dirty it here (it might not be busy).
*/
fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type, busyp);
if (fs.m) {
- if (fault_type & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE))
- vm_page_dirty(fs.m);
*errorp = 0;
return(fs.m);
}
KKASSERT(fs->first_shared == 0);
vm_page_copy(fs->m, fs->first_m);
vm_page_protect(fs->m, VM_PROT_NONE);
- vm_page_event(fs->m, VMEVENT_COW);
}
/*
panic("vm_fault_copy_wired: page missing");
vm_page_copy(src_m, dst_m);
- vm_page_event(src_m, VMEVENT_COW);
/*
* Enter it in the pmap...
static vm_page_t _vm_page_list_find2(int basequeue, int index);
static void _vm_page_deactivate_locked(vm_page_t m, int athead);
-MALLOC_DEFINE(M_ACTIONHASH, "acthash", "vmpage action hash");
-
/*
* Array of tailq lists
*/
__cachealign struct vpgqueues vm_page_queues[PQ_COUNT];
-LIST_HEAD(vm_page_action_list, vm_page_action);
-
-/*
- * Action hash for user umtx support. Contention is governed by both
- * tsleep/wakeup handling (kern/kern_synch.c) and action_hash[] below.
- * Because action_hash[] represents active table locks, a modest fixed
- * value well in excess of MAXCPU works here.
- *
- * There is also scan overhead depending on the number of threads in
- * umtx*() calls, so we also size the hash table based on maxproc.
- */
-struct vm_page_action_hash {
- struct vm_page_action_list list;
- struct lock lk;
-} __cachealign;
-
-#define VMACTION_MINHSIZE 256
-
-struct vm_page_action_hash *action_hash;
-static int vmaction_hsize;
-static int vmaction_hmask;
-
static volatile int vm_pages_waiting;
static struct alist vm_contig_alist;
static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
* allocations.
*
* We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
- *
- * Also setup the action_hash[] table here (which is only used by userland)
*/
static void
vm_page_startup_finish(void *dummy __unused)
alist_blk_t xcount;
alist_blk_t bfree;
vm_page_t m;
- int i;
spin_lock(&vm_contig_spin);
for (;;) {
(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
(PAGE_SIZE / 1024),
(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
-
- /*
- * Scale the action_hash[] array. Primary contention occurs due
- * to cpu locks, scaled to ncpus, and scan overhead may be incurred
- * depending on the number of threads, which we scale to maxproc.
- *
- * NOTE: Action lock might recurse due to callback, so allow
- * recursion.
- */
- vmaction_hsize = VMACTION_MINHSIZE;
- if (vmaction_hsize < ncpus * 2)
- vmaction_hsize = ncpus * 2;
- if (vmaction_hsize < maxproc / 16)
- vmaction_hsize = maxproc / 16;
- vmaction_hmask = 1;
- while (vmaction_hmask < vmaction_hsize)
- vmaction_hmask = (vmaction_hmask << 1) | 1;
- vmaction_hsize = vmaction_hmask + 1;
-
- action_hash = kmalloc(sizeof(action_hash[0]) * vmaction_hsize,
- M_ACTIONHASH,
- M_WAITOK | M_ZERO);
-
- for (i = 0; i < vmaction_hsize; i++) {
- LIST_INIT(&action_hash[i].list);
- lockinit(&action_hash[i].lk, "actlk", 0, LK_CANRECURSE);
- }
}
SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
vm_page_startup_finish, NULL);
}
}
-/*
- * Register an action, associating it with its vm_page
- */
-void
-vm_page_register_action(vm_page_action_t action, vm_page_event_t event)
-{
- struct vm_page_action_hash *hash;
- int hv;
-
- hv = (int)((intptr_t)action->m >> 8) & vmaction_hmask;
- hash = &action_hash[hv];
-
- lockmgr(&hash->lk, LK_EXCLUSIVE);
- vm_page_flag_set(action->m, PG_ACTIONLIST);
- action->event = event;
- LIST_INSERT_HEAD(&hash->list, action, entry);
- lockmgr(&hash->lk, LK_RELEASE);
-}
-
-/*
- * Unregister an action, disassociating it from its related vm_page
- */
-void
-vm_page_unregister_action(vm_page_action_t action)
-{
- struct vm_page_action_hash *hash;
- int hv;
-
- hv = (int)((intptr_t)action->m >> 8) & vmaction_hmask;
- hash = &action_hash[hv];
- lockmgr(&hash->lk, LK_EXCLUSIVE);
- if (action->event != VMEVENT_NONE) {
- action->event = VMEVENT_NONE;
- LIST_REMOVE(action, entry);
-
- if (LIST_EMPTY(&hash->list))
- vm_page_flag_clear(action->m, PG_ACTIONLIST);
- }
- lockmgr(&hash->lk, LK_RELEASE);
-}
-
-/*
- * Issue an event on a VM page. Corresponding action structures are
- * removed from the page's list and called.
- *
- * If the vm_page has no more pending action events we clear its
- * PG_ACTIONLIST flag.
- */
-void
-vm_page_event_internal(vm_page_t m, vm_page_event_t event)
-{
- struct vm_page_action_hash *hash;
- struct vm_page_action *scan;
- struct vm_page_action *next;
- int hv;
- int all;
-
- hv = (int)((intptr_t)m >> 8) & vmaction_hmask;
- hash = &action_hash[hv];
- all = 1;
-
- lockmgr(&hash->lk, LK_EXCLUSIVE);
- LIST_FOREACH_MUTABLE(scan, &hash->list, entry, next) {
- if (scan->m == m) {
- if (scan->event == event) {
- scan->event = VMEVENT_NONE;
- LIST_REMOVE(scan, entry);
- scan->func(m, scan);
- /* XXX */
- } else {
- all = 0;
- }
- }
- }
- if (all)
- vm_page_flag_clear(m, PG_ACTIONLIST);
- lockmgr(&hash->lk, LK_RELEASE);
-}
-
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
#endif
-typedef enum vm_page_event { VMEVENT_NONE, VMEVENT_COW } vm_page_event_t;
-
-struct vm_page_action {
- LIST_ENTRY(vm_page_action) entry;
- struct vm_page *m;
- vm_page_event_t event;
- void (*func)(struct vm_page *,
- struct vm_page_action *);
- void *data;
-};
-
-typedef struct vm_page_action *vm_page_action_t;
-
/*
* Management of resident (logical) pages.
*
#define PG_RAM 0x00002000 /* read ahead mark */
#define PG_SWAPPED 0x00004000 /* backed by swap */
#define PG_NOTMETA 0x00008000 /* do not back with swap */
-#define PG_ACTIONLIST 0x00010000 /* lookaside action list present */
+#define PG_UNUSED10000 0x00010000
#define PG_SBUSY 0x00020000 /* soft-busy also set */
#define PG_NEED_COMMIT 0x00040000 /* clean page requires commit */
-#define PG_KEEP_NEWPAGE_MASK (PG_BUSY | PG_SBUSY | \
- PG_WANTED | PG_ACTIONLIST)
+#define PG_KEEP_NEWPAGE_MASK (PG_BUSY | PG_SBUSY | PG_WANTED)
/*
void vm_page_free_toq(vm_page_t m);
void vm_page_free_contig(vm_page_t m, unsigned long size);
vm_page_t vm_page_free_fromq_fast(void);
-void vm_page_event_internal(vm_page_t, vm_page_event_t);
void vm_page_dirty(vm_page_t m);
-void vm_page_register_action(vm_page_action_t action, vm_page_event_t event);
-void vm_page_unregister_action(vm_page_action_t action);
void vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg);
void VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
int also_m_busy, const char *wmsg VM_PAGE_DEBUG_ARGS);
return 0;
}
-static __inline
-void
-vm_page_event(vm_page_t m, vm_page_event_t event)
-{
- if (m->flags & PG_ACTIONLIST)
- vm_page_event_internal(m, event);
-}
-
-static __inline
-void
-vm_page_init_action(vm_page_t m, vm_page_action_t action,
- void (*func)(vm_page_t, vm_page_action_t), void *data)
-{
- action->m = m;
- action->func = func;
- action->data = data;
-}
-
/*
* Clear dirty bits in the VM page but truncate the
* end to a DEV_BSIZE'd boundary.